xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 467446fbca37e95635331dc8e534c1ae283105d1)
/*
  This file provides high-performance routines for the Inode format (compressed sparse row)
  by taking advantage of rows with identical nonzero structure (I-nodes).
*/
5c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h>
6fb56d528SJed Brown #if defined(PETSC_HAVE_XMMINTRIN_H)
7fb56d528SJed Brown   #include <xmmintrin.h>
8fb56d528SJed Brown #endif
94c1414c8SBarry Smith 
10d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns)
11d71ae5a4SJacob Faibussowitsch {
124c1414c8SBarry Smith   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
134c1414c8SBarry Smith   PetscInt    i, count, m, n, min_mn, *ns_row, *ns_col;
144c1414c8SBarry Smith 
154c1414c8SBarry Smith   PetscFunctionBegin;
16d0f46423SBarry Smith   n = A->cmap->n;
17d0f46423SBarry Smith   m = A->rmap->n;
1808401ef6SPierre Jolivet   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
194c1414c8SBarry Smith   ns_row = a->inode.size;
204c1414c8SBarry Smith 
214c1414c8SBarry Smith   min_mn = (m < n) ? m : n;
224c1414c8SBarry Smith   if (!ns) {
239371c9d4SSatish Balay     for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++)
249371c9d4SSatish Balay       ;
259371c9d4SSatish Balay     for (; count + 1 < n; count++, i++)
269371c9d4SSatish Balay       ;
27ad540459SPierre Jolivet     if (count < n) i++;
284c1414c8SBarry Smith     *size = i;
293ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
304c1414c8SBarry Smith   }
319566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(n + 1, &ns_col));
324c1414c8SBarry Smith 
334c1414c8SBarry Smith   /* Use the same row structure wherever feasible. */
34ad540459SPierre Jolivet   for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++) ns_col[i] = ns_row[i];
354c1414c8SBarry Smith 
364c1414c8SBarry Smith   /* if m < n; pad up the remainder with inode_limit */
37ad540459SPierre Jolivet   for (; count + 1 < n; count++, i++) ns_col[i] = 1;
38aaa8cc7dSPierre Jolivet   /* The last node is the odd ball. pad it up with the remaining rows; */
394c1414c8SBarry Smith   if (count < n) {
404c1414c8SBarry Smith     ns_col[i] = n - count;
414c1414c8SBarry Smith     i++;
424c1414c8SBarry Smith   } else if (count > n) {
434c1414c8SBarry Smith     /* Adjust for the over estimation */
444c1414c8SBarry Smith     ns_col[i - 1] += n - count;
454c1414c8SBarry Smith   }
464c1414c8SBarry Smith   *size = i;
474c1414c8SBarry Smith   *ns   = ns_col;
483ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
494c1414c8SBarry Smith }
504c1414c8SBarry Smith 
514c1414c8SBarry Smith /*
524c1414c8SBarry Smith       This builds symmetric version of nonzero structure,
534c1414c8SBarry Smith */
54d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
55d71ae5a4SJacob Faibussowitsch {
564c1414c8SBarry Smith   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
578758e1faSBarry Smith   PetscInt       *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n;
588758e1faSBarry Smith   PetscInt       *tns, *tvc, *ns_row = a->inode.size, *ns_col, nsz, i1, i2;
598758e1faSBarry Smith   const PetscInt *j, *jmax, *ai = a->i, *aj = a->j;
604c1414c8SBarry Smith 
614c1414c8SBarry Smith   PetscFunctionBegin;
624c1414c8SBarry Smith   nslim_row = a->inode.node_count;
63d0f46423SBarry Smith   m         = A->rmap->n;
64d0f46423SBarry Smith   n         = A->cmap->n;
6508401ef6SPierre Jolivet   PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
6608401ef6SPierre Jolivet   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
674c1414c8SBarry Smith 
684c1414c8SBarry Smith   /* Use the row_inode as column_inode */
694c1414c8SBarry Smith   nslim_col = nslim_row;
704c1414c8SBarry Smith   ns_col    = ns_row;
714c1414c8SBarry Smith 
7235cb6cd3SPierre Jolivet   /* allocate space for reformatted inode structure */
739566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
744c1414c8SBarry Smith   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_row[i1];
754c1414c8SBarry Smith 
764c1414c8SBarry Smith   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
774c1414c8SBarry Smith     nsz = ns_col[i1];
782205254eSKarl Rupp     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
794c1414c8SBarry Smith   }
804c1414c8SBarry Smith   /* allocate space for row pointers */
819566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
824c1414c8SBarry Smith   *iia = ia;
839566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nslim_row + 1, &work));
844c1414c8SBarry Smith 
854c1414c8SBarry Smith   /* determine the number of columns in each row */
864c1414c8SBarry Smith   ia[0] = oshift;
874c1414c8SBarry Smith   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
884c1414c8SBarry Smith     j    = aj + ai[row] + ishift;
894c1414c8SBarry Smith     jmax = aj + ai[row + 1] + ishift;
9083fed2edSSatish Balay     if (j == jmax) continue; /* empty row */
914c1414c8SBarry Smith     col = *j++ + ishift;
924c1414c8SBarry Smith     i2  = tvc[col];
936aad120cSJose E. Roman     while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */
944c1414c8SBarry Smith       ia[i1 + 1]++;
954c1414c8SBarry Smith       ia[i2 + 1]++;
964c1414c8SBarry Smith       i2++; /* Start col of next node */
9790d2dec7SBarry Smith       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j;
984c1414c8SBarry Smith       i2 = tvc[col];
994c1414c8SBarry Smith     }
1004c1414c8SBarry Smith     if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */
1014c1414c8SBarry Smith   }
1024c1414c8SBarry Smith 
1034c1414c8SBarry Smith   /* shift ia[i] to point to next row */
1044c1414c8SBarry Smith   for (i1 = 1; i1 < nslim_row + 1; i1++) {
1054c1414c8SBarry Smith     row = ia[i1 - 1];
1064c1414c8SBarry Smith     ia[i1] += row;
1074c1414c8SBarry Smith     work[i1 - 1] = row - oshift;
1084c1414c8SBarry Smith   }
1094c1414c8SBarry Smith 
1104c1414c8SBarry Smith   /* allocate space for column pointers */
1114c1414c8SBarry Smith   nz = ia[nslim_row] + (!ishift);
1129566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nz, &ja));
1134c1414c8SBarry Smith   *jja = ja;
1144c1414c8SBarry Smith 
1154c1414c8SBarry Smith   /* loop over lower triangular part putting into ja */
1164c1414c8SBarry Smith   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
1174c1414c8SBarry Smith     j    = aj + ai[row] + ishift;
1184c1414c8SBarry Smith     jmax = aj + ai[row + 1] + ishift;
11983fed2edSSatish Balay     if (j == jmax) continue; /* empty row */
1204c1414c8SBarry Smith     col = *j++ + ishift;
1214c1414c8SBarry Smith     i2  = tvc[col];
1224c1414c8SBarry Smith     while (i2 < i1 && j < jmax) {
1234c1414c8SBarry Smith       ja[work[i2]++] = i1 + oshift;
1244c1414c8SBarry Smith       ja[work[i1]++] = i2 + oshift;
1254c1414c8SBarry Smith       ++i2;
12690d2dec7SBarry Smith       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */
1274c1414c8SBarry Smith       i2 = tvc[col];
1284c1414c8SBarry Smith     }
1294c1414c8SBarry Smith     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
1304c1414c8SBarry Smith   }
1319566063dSJacob Faibussowitsch   PetscCall(PetscFree(work));
1329566063dSJacob Faibussowitsch   PetscCall(PetscFree2(tns, tvc));
1333ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1344c1414c8SBarry Smith }
1354c1414c8SBarry Smith 
1364c1414c8SBarry Smith /*
1374c1414c8SBarry Smith       This builds nonsymmetric version of nonzero structure,
1384c1414c8SBarry Smith */
139d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
140d71ae5a4SJacob Faibussowitsch {
1414c1414c8SBarry Smith   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
1428758e1faSBarry Smith   PetscInt       *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col;
1438758e1faSBarry Smith   PetscInt       *tns, *tvc, nsz, i1, i2;
1448758e1faSBarry Smith   const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size;
1454c1414c8SBarry Smith 
1464c1414c8SBarry Smith   PetscFunctionBegin;
14708401ef6SPierre Jolivet   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
1484c1414c8SBarry Smith   nslim_row = a->inode.node_count;
149d0f46423SBarry Smith   n         = A->cmap->n;
1504c1414c8SBarry Smith 
1514c1414c8SBarry Smith   /* Create The column_inode for this matrix */
1529566063dSJacob Faibussowitsch   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
1534c1414c8SBarry Smith 
15435cb6cd3SPierre Jolivet   /* allocate space for reformatted column_inode structure */
1559566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
1564c1414c8SBarry Smith   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1];
1574c1414c8SBarry Smith 
1584c1414c8SBarry Smith   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
1594c1414c8SBarry Smith     nsz = ns_col[i1];
1602205254eSKarl Rupp     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
1614c1414c8SBarry Smith   }
1624c1414c8SBarry Smith   /* allocate space for row pointers */
1639566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
1644c1414c8SBarry Smith   *iia = ia;
1659566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nslim_row + 1, &work));
1664c1414c8SBarry Smith 
1674c1414c8SBarry Smith   /* determine the number of columns in each row */
1684c1414c8SBarry Smith   ia[0] = oshift;
1694c1414c8SBarry Smith   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
1704c1414c8SBarry Smith     j  = aj + ai[row] + ishift;
17183fed2edSSatish Balay     nz = ai[row + 1] - ai[row];
17283fed2edSSatish Balay     if (!nz) continue; /* empty row */
1734c1414c8SBarry Smith     col = *j++ + ishift;
1744c1414c8SBarry Smith     i2  = tvc[col];
1756aad120cSJose E. Roman     while (nz-- > 0) { /* off-diagonal elements */
1764c1414c8SBarry Smith       ia[i1 + 1]++;
1774c1414c8SBarry Smith       i2++; /* Start col of next node */
178a8e3a797SJed Brown       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
1794c1414c8SBarry Smith       if (nz > 0) i2 = tvc[col];
1804c1414c8SBarry Smith     }
1814c1414c8SBarry Smith   }
1824c1414c8SBarry Smith 
1834c1414c8SBarry Smith   /* shift ia[i] to point to next row */
1844c1414c8SBarry Smith   for (i1 = 1; i1 < nslim_row + 1; i1++) {
1854c1414c8SBarry Smith     row = ia[i1 - 1];
1864c1414c8SBarry Smith     ia[i1] += row;
1874c1414c8SBarry Smith     work[i1 - 1] = row - oshift;
1884c1414c8SBarry Smith   }
1894c1414c8SBarry Smith 
1904c1414c8SBarry Smith   /* allocate space for column pointers */
1914c1414c8SBarry Smith   nz = ia[nslim_row] + (!ishift);
1929566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nz, &ja));
1934c1414c8SBarry Smith   *jja = ja;
1944c1414c8SBarry Smith 
1954c1414c8SBarry Smith   /* loop over matrix putting into ja */
1964c1414c8SBarry Smith   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
1974c1414c8SBarry Smith     j  = aj + ai[row] + ishift;
19883fed2edSSatish Balay     nz = ai[row + 1] - ai[row];
19983fed2edSSatish Balay     if (!nz) continue; /* empty row */
2004c1414c8SBarry Smith     col = *j++ + ishift;
2014c1414c8SBarry Smith     i2  = tvc[col];
2024c1414c8SBarry Smith     while (nz-- > 0) {
2034c1414c8SBarry Smith       ja[work[i1]++] = i2 + oshift;
2044c1414c8SBarry Smith       ++i2;
205a8e3a797SJed Brown       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
2064c1414c8SBarry Smith       if (nz > 0) i2 = tvc[col];
2074c1414c8SBarry Smith     }
2084c1414c8SBarry Smith   }
2099566063dSJacob Faibussowitsch   PetscCall(PetscFree(ns_col));
2109566063dSJacob Faibussowitsch   PetscCall(PetscFree(work));
2119566063dSJacob Faibussowitsch   PetscCall(PetscFree2(tns, tvc));
2123ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2134c1414c8SBarry Smith }
2144c1414c8SBarry Smith 
215d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
216d71ae5a4SJacob Faibussowitsch {
2174c1414c8SBarry Smith   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2184c1414c8SBarry Smith 
2194c1414c8SBarry Smith   PetscFunctionBegin;
22050ba90b4SBarry Smith   if (n) *n = a->inode.node_count;
2213ba16761SJacob Faibussowitsch   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
2228f7157efSSatish Balay   if (!blockcompressed) {
2239566063dSJacob Faibussowitsch     PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
2248f7157efSSatish Balay   } else if (symmetric) {
2259566063dSJacob Faibussowitsch     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
2264c1414c8SBarry Smith   } else {
2279566063dSJacob Faibussowitsch     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
2284c1414c8SBarry Smith   }
2293ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2304c1414c8SBarry Smith }
2314c1414c8SBarry Smith 
232d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
233d71ae5a4SJacob Faibussowitsch {
2344c1414c8SBarry Smith   PetscFunctionBegin;
2353ba16761SJacob Faibussowitsch   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
2368f7157efSSatish Balay 
2378f7157efSSatish Balay   if (!blockcompressed) {
2389566063dSJacob Faibussowitsch     PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
2398f7157efSSatish Balay   } else {
2409566063dSJacob Faibussowitsch     PetscCall(PetscFree(*ia));
2419566063dSJacob Faibussowitsch     PetscCall(PetscFree(*ja));
2428f7157efSSatish Balay   }
2433ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2444c1414c8SBarry Smith }
2454c1414c8SBarry Smith 
246d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
247d71ae5a4SJacob Faibussowitsch {
2484c1414c8SBarry Smith   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2494c1414c8SBarry Smith   PetscInt   *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col;
2504c1414c8SBarry Smith   PetscInt   *tns, *tvc, *ns_row = a->inode.size, nsz, i1, i2, *ai = a->i, *aj = a->j;
2514c1414c8SBarry Smith 
2524c1414c8SBarry Smith   PetscFunctionBegin;
25308401ef6SPierre Jolivet   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2544c1414c8SBarry Smith   nslim_row = a->inode.node_count;
255d0f46423SBarry Smith   n         = A->cmap->n;
2564c1414c8SBarry Smith 
2574c1414c8SBarry Smith   /* Create The column_inode for this matrix */
2589566063dSJacob Faibussowitsch   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
2594c1414c8SBarry Smith 
26035cb6cd3SPierre Jolivet   /* allocate space for reformatted column_inode structure */
2619566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
2624c1414c8SBarry Smith   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1];
2634c1414c8SBarry Smith 
2644c1414c8SBarry Smith   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
2654c1414c8SBarry Smith     nsz = ns_col[i1];
2662205254eSKarl Rupp     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
2674c1414c8SBarry Smith   }
2684c1414c8SBarry Smith   /* allocate space for column pointers */
2699566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(nslim_col + 1, &ia));
2704c1414c8SBarry Smith   *iia = ia;
2719566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nslim_col + 1, &work));
2724c1414c8SBarry Smith 
2734c1414c8SBarry Smith   /* determine the number of columns in each row */
2744c1414c8SBarry Smith   ia[0] = oshift;
2754c1414c8SBarry Smith   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
2764c1414c8SBarry Smith     j   = aj + ai[row] + ishift;
2774c1414c8SBarry Smith     col = *j++ + ishift;
2784c1414c8SBarry Smith     i2  = tvc[col];
2794c1414c8SBarry Smith     nz  = ai[row + 1] - ai[row];
2806aad120cSJose E. Roman     while (nz-- > 0) { /* off-diagonal elements */
2814c1414c8SBarry Smith       /* ia[i1+1]++; */
2824c1414c8SBarry Smith       ia[i2 + 1]++;
2834c1414c8SBarry Smith       i2++;
284a8e3a797SJed Brown       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
2854c1414c8SBarry Smith       if (nz > 0) i2 = tvc[col];
2864c1414c8SBarry Smith     }
2874c1414c8SBarry Smith   }
2884c1414c8SBarry Smith 
2894c1414c8SBarry Smith   /* shift ia[i] to point to next col */
2904c1414c8SBarry Smith   for (i1 = 1; i1 < nslim_col + 1; i1++) {
2914c1414c8SBarry Smith     col = ia[i1 - 1];
2924c1414c8SBarry Smith     ia[i1] += col;
2934c1414c8SBarry Smith     work[i1 - 1] = col - oshift;
2944c1414c8SBarry Smith   }
2954c1414c8SBarry Smith 
2964c1414c8SBarry Smith   /* allocate space for column pointers */
2974c1414c8SBarry Smith   nz = ia[nslim_col] + (!ishift);
2989566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nz, &ja));
2994c1414c8SBarry Smith   *jja = ja;
3004c1414c8SBarry Smith 
3014c1414c8SBarry Smith   /* loop over matrix putting into ja */
3024c1414c8SBarry Smith   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
3034c1414c8SBarry Smith     j   = aj + ai[row] + ishift;
3044c1414c8SBarry Smith     col = *j++ + ishift;
3054c1414c8SBarry Smith     i2  = tvc[col];
3064c1414c8SBarry Smith     nz  = ai[row + 1] - ai[row];
3074c1414c8SBarry Smith     while (nz-- > 0) {
3084c1414c8SBarry Smith       /* ja[work[i1]++] = i2 + oshift; */
3094c1414c8SBarry Smith       ja[work[i2]++] = i1 + oshift;
3104c1414c8SBarry Smith       i2++;
311a8e3a797SJed Brown       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
3124c1414c8SBarry Smith       if (nz > 0) i2 = tvc[col];
3134c1414c8SBarry Smith     }
3144c1414c8SBarry Smith   }
3159566063dSJacob Faibussowitsch   PetscCall(PetscFree(ns_col));
3169566063dSJacob Faibussowitsch   PetscCall(PetscFree(work));
3179566063dSJacob Faibussowitsch   PetscCall(PetscFree2(tns, tvc));
3183ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3194c1414c8SBarry Smith }
3204c1414c8SBarry Smith 
321d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
322d71ae5a4SJacob Faibussowitsch {
3234c1414c8SBarry Smith   PetscFunctionBegin;
3249566063dSJacob Faibussowitsch   PetscCall(MatCreateColInode_Private(A, n, NULL));
3253ba16761SJacob Faibussowitsch   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
3264c1414c8SBarry Smith 
3278f7157efSSatish Balay   if (!blockcompressed) {
3289566063dSJacob Faibussowitsch     PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
3298f7157efSSatish Balay   } else if (symmetric) {
330a5b23f4aSJose E. Roman     /* Since the indices are symmetric it doesn't matter */
3319566063dSJacob Faibussowitsch     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
3324c1414c8SBarry Smith   } else {
3339566063dSJacob Faibussowitsch     PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
3344c1414c8SBarry Smith   }
3353ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3364c1414c8SBarry Smith }
3374c1414c8SBarry Smith 
338d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
339d71ae5a4SJacob Faibussowitsch {
3404c1414c8SBarry Smith   PetscFunctionBegin;
3413ba16761SJacob Faibussowitsch   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
3428f7157efSSatish Balay   if (!blockcompressed) {
3439566063dSJacob Faibussowitsch     PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
3448f7157efSSatish Balay   } else {
3459566063dSJacob Faibussowitsch     PetscCall(PetscFree(*ia));
3469566063dSJacob Faibussowitsch     PetscCall(PetscFree(*ja));
3478f7157efSSatish Balay   }
3483ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3494c1414c8SBarry Smith }
3504c1414c8SBarry Smith 
351d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy)
352d71ae5a4SJacob Faibussowitsch {
3534c1414c8SBarry Smith   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
3544c1414c8SBarry Smith   PetscScalar        sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
355d9fead3dSBarry Smith   PetscScalar       *y;
356dd6ea824SBarry Smith   const PetscScalar *x;
357dd6ea824SBarry Smith   const MatScalar   *v1, *v2, *v3, *v4, *v5;
3588758e1faSBarry Smith   PetscInt           i1, i2, n, i, row, node_max, nsz, sz, nonzerorow = 0;
3598758e1faSBarry Smith   const PetscInt    *idx, *ns, *ii;
3604c1414c8SBarry Smith 
3614c1414c8SBarry Smith #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
3624c1414c8SBarry Smith   #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5)
3634c1414c8SBarry Smith #endif
3644c1414c8SBarry Smith 
3654c1414c8SBarry Smith   PetscFunctionBegin;
36608401ef6SPierre Jolivet   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
3674c1414c8SBarry Smith   node_max = a->inode.node_count;
3684c1414c8SBarry Smith   ns       = a->inode.size; /* Node Size array */
3699566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
3709566063dSJacob Faibussowitsch   PetscCall(VecGetArray(yy, &y));
3714c1414c8SBarry Smith   idx = a->j;
3724c1414c8SBarry Smith   v1  = a->a;
3734c1414c8SBarry Smith   ii  = a->i;
3744c1414c8SBarry Smith 
3754c1414c8SBarry Smith   for (i = 0, row = 0; i < node_max; ++i) {
3764c1414c8SBarry Smith     nsz = ns[i];
3774c1414c8SBarry Smith     n   = ii[1] - ii[0];
37898c9bda7SSatish Balay     nonzerorow += (n > 0) * nsz;
3794c1414c8SBarry Smith     ii += nsz;
38050d8bf02SJed Brown     PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA);      /* Prefetch the indices for the block row after the current one */
38150d8bf02SJed Brown     PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
3824c1414c8SBarry Smith     sz = n;                                                                /* No of non zeros in this row */
3834c1414c8SBarry Smith                                                                            /* Switch on the size of Node */
3844c1414c8SBarry Smith     switch (nsz) {                                                         /* Each loop in 'case' is unrolled */
3854c1414c8SBarry Smith     case 1:
38675567043SBarry Smith       sum1 = 0.;
3874c1414c8SBarry Smith 
3884c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
3894c1414c8SBarry Smith         i1 = idx[0]; /* The instructions are ordered to */
3904c1414c8SBarry Smith         i2 = idx[1]; /* make the compiler's job easy */
3914c1414c8SBarry Smith         idx += 2;
3924c1414c8SBarry Smith         tmp0 = x[i1];
3934c1414c8SBarry Smith         tmp1 = x[i2];
3949371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
3959371c9d4SSatish Balay         v1 += 2;
3964c1414c8SBarry Smith       }
3974c1414c8SBarry Smith 
3984c1414c8SBarry Smith       if (n == sz - 1) { /* Take care of the last nonzero  */
3994c1414c8SBarry Smith         tmp0 = x[*idx++];
4004c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
4014c1414c8SBarry Smith       }
4024c1414c8SBarry Smith       y[row++] = sum1;
4034c1414c8SBarry Smith       break;
4044c1414c8SBarry Smith     case 2:
40575567043SBarry Smith       sum1 = 0.;
40675567043SBarry Smith       sum2 = 0.;
4074c1414c8SBarry Smith       v2   = v1 + n;
4084c1414c8SBarry Smith 
4094c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
4104c1414c8SBarry Smith         i1 = idx[0];
4114c1414c8SBarry Smith         i2 = idx[1];
4124c1414c8SBarry Smith         idx += 2;
4134c1414c8SBarry Smith         tmp0 = x[i1];
4144c1414c8SBarry Smith         tmp1 = x[i2];
4159371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
4169371c9d4SSatish Balay         v1 += 2;
4179371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
4189371c9d4SSatish Balay         v2 += 2;
4194c1414c8SBarry Smith       }
4204c1414c8SBarry Smith       if (n == sz - 1) {
4214c1414c8SBarry Smith         tmp0 = x[*idx++];
4224c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
4234c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
4244c1414c8SBarry Smith       }
4254c1414c8SBarry Smith       y[row++] = sum1;
4264c1414c8SBarry Smith       y[row++] = sum2;
4274c1414c8SBarry Smith       v1       = v2; /* Since the next block to be processed starts there*/
4284c1414c8SBarry Smith       idx += sz;
4294c1414c8SBarry Smith       break;
4304c1414c8SBarry Smith     case 3:
43175567043SBarry Smith       sum1 = 0.;
43275567043SBarry Smith       sum2 = 0.;
43375567043SBarry Smith       sum3 = 0.;
4344c1414c8SBarry Smith       v2   = v1 + n;
4354c1414c8SBarry Smith       v3   = v2 + n;
4364c1414c8SBarry Smith 
4374c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
4384c1414c8SBarry Smith         i1 = idx[0];
4394c1414c8SBarry Smith         i2 = idx[1];
4404c1414c8SBarry Smith         idx += 2;
4414c1414c8SBarry Smith         tmp0 = x[i1];
4424c1414c8SBarry Smith         tmp1 = x[i2];
4439371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
4449371c9d4SSatish Balay         v1 += 2;
4459371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
4469371c9d4SSatish Balay         v2 += 2;
4479371c9d4SSatish Balay         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
4489371c9d4SSatish Balay         v3 += 2;
4494c1414c8SBarry Smith       }
4504c1414c8SBarry Smith       if (n == sz - 1) {
4514c1414c8SBarry Smith         tmp0 = x[*idx++];
4524c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
4534c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
4544c1414c8SBarry Smith         sum3 += *v3++ * tmp0;
4554c1414c8SBarry Smith       }
4564c1414c8SBarry Smith       y[row++] = sum1;
4574c1414c8SBarry Smith       y[row++] = sum2;
4584c1414c8SBarry Smith       y[row++] = sum3;
4594c1414c8SBarry Smith       v1       = v3; /* Since the next block to be processed starts there*/
4604c1414c8SBarry Smith       idx += 2 * sz;
4614c1414c8SBarry Smith       break;
4624c1414c8SBarry Smith     case 4:
46375567043SBarry Smith       sum1 = 0.;
46475567043SBarry Smith       sum2 = 0.;
46575567043SBarry Smith       sum3 = 0.;
46675567043SBarry Smith       sum4 = 0.;
4674c1414c8SBarry Smith       v2   = v1 + n;
4684c1414c8SBarry Smith       v3   = v2 + n;
4694c1414c8SBarry Smith       v4   = v3 + n;
4704c1414c8SBarry Smith 
4714c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
4724c1414c8SBarry Smith         i1 = idx[0];
4734c1414c8SBarry Smith         i2 = idx[1];
4744c1414c8SBarry Smith         idx += 2;
4754c1414c8SBarry Smith         tmp0 = x[i1];
4764c1414c8SBarry Smith         tmp1 = x[i2];
4779371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
4789371c9d4SSatish Balay         v1 += 2;
4799371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
4809371c9d4SSatish Balay         v2 += 2;
4819371c9d4SSatish Balay         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
4829371c9d4SSatish Balay         v3 += 2;
4839371c9d4SSatish Balay         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
4849371c9d4SSatish Balay         v4 += 2;
4854c1414c8SBarry Smith       }
4864c1414c8SBarry Smith       if (n == sz - 1) {
4874c1414c8SBarry Smith         tmp0 = x[*idx++];
4884c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
4894c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
4904c1414c8SBarry Smith         sum3 += *v3++ * tmp0;
4914c1414c8SBarry Smith         sum4 += *v4++ * tmp0;
4924c1414c8SBarry Smith       }
4934c1414c8SBarry Smith       y[row++] = sum1;
4944c1414c8SBarry Smith       y[row++] = sum2;
4954c1414c8SBarry Smith       y[row++] = sum3;
4964c1414c8SBarry Smith       y[row++] = sum4;
4974c1414c8SBarry Smith       v1       = v4; /* Since the next block to be processed starts there*/
4984c1414c8SBarry Smith       idx += 3 * sz;
4994c1414c8SBarry Smith       break;
5004c1414c8SBarry Smith     case 5:
50175567043SBarry Smith       sum1 = 0.;
50275567043SBarry Smith       sum2 = 0.;
50375567043SBarry Smith       sum3 = 0.;
50475567043SBarry Smith       sum4 = 0.;
50575567043SBarry Smith       sum5 = 0.;
5064c1414c8SBarry Smith       v2   = v1 + n;
5074c1414c8SBarry Smith       v3   = v2 + n;
5084c1414c8SBarry Smith       v4   = v3 + n;
5094c1414c8SBarry Smith       v5   = v4 + n;
5104c1414c8SBarry Smith 
5114c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
5124c1414c8SBarry Smith         i1 = idx[0];
5134c1414c8SBarry Smith         i2 = idx[1];
5144c1414c8SBarry Smith         idx += 2;
5154c1414c8SBarry Smith         tmp0 = x[i1];
5164c1414c8SBarry Smith         tmp1 = x[i2];
5179371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
5189371c9d4SSatish Balay         v1 += 2;
5199371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
5209371c9d4SSatish Balay         v2 += 2;
5219371c9d4SSatish Balay         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
5229371c9d4SSatish Balay         v3 += 2;
5239371c9d4SSatish Balay         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
5249371c9d4SSatish Balay         v4 += 2;
5259371c9d4SSatish Balay         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
5269371c9d4SSatish Balay         v5 += 2;
5274c1414c8SBarry Smith       }
5284c1414c8SBarry Smith       if (n == sz - 1) {
5294c1414c8SBarry Smith         tmp0 = x[*idx++];
5304c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
5314c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
5324c1414c8SBarry Smith         sum3 += *v3++ * tmp0;
5334c1414c8SBarry Smith         sum4 += *v4++ * tmp0;
5344c1414c8SBarry Smith         sum5 += *v5++ * tmp0;
5354c1414c8SBarry Smith       }
5364c1414c8SBarry Smith       y[row++] = sum1;
5374c1414c8SBarry Smith       y[row++] = sum2;
5384c1414c8SBarry Smith       y[row++] = sum3;
5394c1414c8SBarry Smith       y[row++] = sum4;
5404c1414c8SBarry Smith       y[row++] = sum5;
5414c1414c8SBarry Smith       v1       = v5; /* Since the next block to be processed starts there */
5424c1414c8SBarry Smith       idx += 4 * sz;
5434c1414c8SBarry Smith       break;
544d71ae5a4SJacob Faibussowitsch     default:
545d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
5464c1414c8SBarry Smith     }
5474c1414c8SBarry Smith   }
5489566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
5499566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(yy, &y));
5509566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow));
5513ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5524c1414c8SBarry Smith }
5532ef1f0ffSBarry Smith 
/* Almost the same code as MatMult_SeqAIJ_Inode() */
555d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy)
556d71ae5a4SJacob Faibussowitsch {
5574c1414c8SBarry Smith   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
5584c1414c8SBarry Smith   PetscScalar        sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
5598758e1faSBarry Smith   const MatScalar   *v1, *v2, *v3, *v4, *v5;
5608758e1faSBarry Smith   const PetscScalar *x;
5618758e1faSBarry Smith   PetscScalar       *y, *z, *zt;
5628758e1faSBarry Smith   PetscInt           i1, i2, n, i, row, node_max, nsz, sz;
5638758e1faSBarry Smith   const PetscInt    *idx, *ns, *ii;
5644c1414c8SBarry Smith 
5654c1414c8SBarry Smith   PetscFunctionBegin;
56608401ef6SPierre Jolivet   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
5674c1414c8SBarry Smith   node_max = a->inode.node_count;
5684c1414c8SBarry Smith   ns       = a->inode.size; /* Node Size array */
5692205254eSKarl Rupp 
5709566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
5719566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(zz, yy, &z, &y));
5724c1414c8SBarry Smith   zt = z;
5734c1414c8SBarry Smith 
5744c1414c8SBarry Smith   idx = a->j;
5754c1414c8SBarry Smith   v1  = a->a;
5764c1414c8SBarry Smith   ii  = a->i;
5774c1414c8SBarry Smith 
5784c1414c8SBarry Smith   for (i = 0, row = 0; i < node_max; ++i) {
5794c1414c8SBarry Smith     nsz = ns[i];
5804c1414c8SBarry Smith     n   = ii[1] - ii[0];
5814c1414c8SBarry Smith     ii += nsz;
5824c1414c8SBarry Smith     sz = n;        /* No of non zeros in this row */
5834c1414c8SBarry Smith                    /* Switch on the size of Node */
5844c1414c8SBarry Smith     switch (nsz) { /* Each loop in 'case' is unrolled */
5854c1414c8SBarry Smith     case 1:
5864c1414c8SBarry Smith       sum1 = *zt++;
5874c1414c8SBarry Smith 
5884c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
5894c1414c8SBarry Smith         i1 = idx[0]; /* The instructions are ordered to */
5904c1414c8SBarry Smith         i2 = idx[1]; /* make the compiler's job easy */
5914c1414c8SBarry Smith         idx += 2;
5924c1414c8SBarry Smith         tmp0 = x[i1];
5934c1414c8SBarry Smith         tmp1 = x[i2];
5949371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
5959371c9d4SSatish Balay         v1 += 2;
5964c1414c8SBarry Smith       }
5974c1414c8SBarry Smith 
5984c1414c8SBarry Smith       if (n == sz - 1) { /* Take care of the last nonzero  */
5994c1414c8SBarry Smith         tmp0 = x[*idx++];
6004c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
6014c1414c8SBarry Smith       }
6024c1414c8SBarry Smith       y[row++] = sum1;
6034c1414c8SBarry Smith       break;
6044c1414c8SBarry Smith     case 2:
6054c1414c8SBarry Smith       sum1 = *zt++;
6064c1414c8SBarry Smith       sum2 = *zt++;
6074c1414c8SBarry Smith       v2   = v1 + n;
6084c1414c8SBarry Smith 
6094c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
6104c1414c8SBarry Smith         i1 = idx[0];
6114c1414c8SBarry Smith         i2 = idx[1];
6124c1414c8SBarry Smith         idx += 2;
6134c1414c8SBarry Smith         tmp0 = x[i1];
6144c1414c8SBarry Smith         tmp1 = x[i2];
6159371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
6169371c9d4SSatish Balay         v1 += 2;
6179371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
6189371c9d4SSatish Balay         v2 += 2;
6194c1414c8SBarry Smith       }
6204c1414c8SBarry Smith       if (n == sz - 1) {
6214c1414c8SBarry Smith         tmp0 = x[*idx++];
6224c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
6234c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
6244c1414c8SBarry Smith       }
6254c1414c8SBarry Smith       y[row++] = sum1;
6264c1414c8SBarry Smith       y[row++] = sum2;
6274c1414c8SBarry Smith       v1       = v2; /* Since the next block to be processed starts there*/
6284c1414c8SBarry Smith       idx += sz;
6294c1414c8SBarry Smith       break;
6304c1414c8SBarry Smith     case 3:
6314c1414c8SBarry Smith       sum1 = *zt++;
6324c1414c8SBarry Smith       sum2 = *zt++;
6334c1414c8SBarry Smith       sum3 = *zt++;
6344c1414c8SBarry Smith       v2   = v1 + n;
6354c1414c8SBarry Smith       v3   = v2 + n;
6364c1414c8SBarry Smith 
6374c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
6384c1414c8SBarry Smith         i1 = idx[0];
6394c1414c8SBarry Smith         i2 = idx[1];
6404c1414c8SBarry Smith         idx += 2;
6414c1414c8SBarry Smith         tmp0 = x[i1];
6424c1414c8SBarry Smith         tmp1 = x[i2];
6439371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
6449371c9d4SSatish Balay         v1 += 2;
6459371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
6469371c9d4SSatish Balay         v2 += 2;
6479371c9d4SSatish Balay         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
6489371c9d4SSatish Balay         v3 += 2;
6494c1414c8SBarry Smith       }
6504c1414c8SBarry Smith       if (n == sz - 1) {
6514c1414c8SBarry Smith         tmp0 = x[*idx++];
6524c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
6534c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
6544c1414c8SBarry Smith         sum3 += *v3++ * tmp0;
6554c1414c8SBarry Smith       }
6564c1414c8SBarry Smith       y[row++] = sum1;
6574c1414c8SBarry Smith       y[row++] = sum2;
6584c1414c8SBarry Smith       y[row++] = sum3;
6594c1414c8SBarry Smith       v1       = v3; /* Since the next block to be processed starts there*/
6604c1414c8SBarry Smith       idx += 2 * sz;
6614c1414c8SBarry Smith       break;
6624c1414c8SBarry Smith     case 4:
6634c1414c8SBarry Smith       sum1 = *zt++;
6644c1414c8SBarry Smith       sum2 = *zt++;
6654c1414c8SBarry Smith       sum3 = *zt++;
6664c1414c8SBarry Smith       sum4 = *zt++;
6674c1414c8SBarry Smith       v2   = v1 + n;
6684c1414c8SBarry Smith       v3   = v2 + n;
6694c1414c8SBarry Smith       v4   = v3 + n;
6704c1414c8SBarry Smith 
6714c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
6724c1414c8SBarry Smith         i1 = idx[0];
6734c1414c8SBarry Smith         i2 = idx[1];
6744c1414c8SBarry Smith         idx += 2;
6754c1414c8SBarry Smith         tmp0 = x[i1];
6764c1414c8SBarry Smith         tmp1 = x[i2];
6779371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
6789371c9d4SSatish Balay         v1 += 2;
6799371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
6809371c9d4SSatish Balay         v2 += 2;
6819371c9d4SSatish Balay         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
6829371c9d4SSatish Balay         v3 += 2;
6839371c9d4SSatish Balay         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
6849371c9d4SSatish Balay         v4 += 2;
6854c1414c8SBarry Smith       }
6864c1414c8SBarry Smith       if (n == sz - 1) {
6874c1414c8SBarry Smith         tmp0 = x[*idx++];
6884c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
6894c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
6904c1414c8SBarry Smith         sum3 += *v3++ * tmp0;
6914c1414c8SBarry Smith         sum4 += *v4++ * tmp0;
6924c1414c8SBarry Smith       }
6934c1414c8SBarry Smith       y[row++] = sum1;
6944c1414c8SBarry Smith       y[row++] = sum2;
6954c1414c8SBarry Smith       y[row++] = sum3;
6964c1414c8SBarry Smith       y[row++] = sum4;
6974c1414c8SBarry Smith       v1       = v4; /* Since the next block to be processed starts there*/
6984c1414c8SBarry Smith       idx += 3 * sz;
6994c1414c8SBarry Smith       break;
7004c1414c8SBarry Smith     case 5:
7014c1414c8SBarry Smith       sum1 = *zt++;
7024c1414c8SBarry Smith       sum2 = *zt++;
7034c1414c8SBarry Smith       sum3 = *zt++;
7044c1414c8SBarry Smith       sum4 = *zt++;
7054c1414c8SBarry Smith       sum5 = *zt++;
7064c1414c8SBarry Smith       v2   = v1 + n;
7074c1414c8SBarry Smith       v3   = v2 + n;
7084c1414c8SBarry Smith       v4   = v3 + n;
7094c1414c8SBarry Smith       v5   = v4 + n;
7104c1414c8SBarry Smith 
7114c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
7124c1414c8SBarry Smith         i1 = idx[0];
7134c1414c8SBarry Smith         i2 = idx[1];
7144c1414c8SBarry Smith         idx += 2;
7154c1414c8SBarry Smith         tmp0 = x[i1];
7164c1414c8SBarry Smith         tmp1 = x[i2];
7179371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
7189371c9d4SSatish Balay         v1 += 2;
7199371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
7209371c9d4SSatish Balay         v2 += 2;
7219371c9d4SSatish Balay         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
7229371c9d4SSatish Balay         v3 += 2;
7239371c9d4SSatish Balay         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
7249371c9d4SSatish Balay         v4 += 2;
7259371c9d4SSatish Balay         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
7269371c9d4SSatish Balay         v5 += 2;
7274c1414c8SBarry Smith       }
7284c1414c8SBarry Smith       if (n == sz - 1) {
7294c1414c8SBarry Smith         tmp0 = x[*idx++];
7304c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
7314c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
7324c1414c8SBarry Smith         sum3 += *v3++ * tmp0;
7334c1414c8SBarry Smith         sum4 += *v4++ * tmp0;
7344c1414c8SBarry Smith         sum5 += *v5++ * tmp0;
7354c1414c8SBarry Smith       }
7364c1414c8SBarry Smith       y[row++] = sum1;
7374c1414c8SBarry Smith       y[row++] = sum2;
7384c1414c8SBarry Smith       y[row++] = sum3;
7394c1414c8SBarry Smith       y[row++] = sum4;
7404c1414c8SBarry Smith       y[row++] = sum5;
7414c1414c8SBarry Smith       v1       = v5; /* Since the next block to be processed starts there */
7424c1414c8SBarry Smith       idx += 4 * sz;
7434c1414c8SBarry Smith       break;
744d71ae5a4SJacob Faibussowitsch     default:
745d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
7464c1414c8SBarry Smith     }
7474c1414c8SBarry Smith   }
7489566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
7499566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(zz, yy, &z, &y));
7509566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz));
7513ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
7524c1414c8SBarry Smith }
7534c1414c8SBarry Smith 
/*
  MatSolve_SeqAIJ_Inode_inplace - Triangular solve with the entries stored in A: a forward sweep
  over the part of each row before its diagonal position, followed by a backward sweep over the
  part after it, handling one inode (group of consecutive rows) at a time.

  Parameters:
    A  - the matrix; A->data is used as a Mat_SeqAIJ and must carry an inode structure
         (a->inode.size), the diagonal positions a->diag, the index sets a->row / a->col and the
         work array a->solve_work
    bb - right-hand side (read only); its entries are picked up through the indices of a->row
    xx - solution (written); its entries are stored through the indices of a->col

  Returns PETSC_SUCCESS, or raises PETSC_ERR_COR when the inode structure is missing or a node
  has a size other than 1..5.

  Notes:
    - Rows of one node share the column indices of the node's first row (forward sweep) or last
      row (backward sweep) for the off-node part; the entries that couple the rows of a node to
      each other are applied one by one after the shared part.
    - In the backward sweep each unknown is obtained by MULTIPLYING with the stored diagonal entry
      a_a[ad[row]]. NOTE(review): this presumably means the factorization stores the inverted
      pivots - confirm against the matching numeric factorization routine.
*/
754ff6a9541SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx)
755d71ae5a4SJacob Faibussowitsch {
7564c1414c8SBarry Smith   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
7574c1414c8SBarry Smith   IS                 iscol = a->col, isrow = a->row;
7585d0c19d7SBarry Smith   const PetscInt    *r, *c, *rout, *cout;
7598758e1faSBarry Smith   PetscInt           i, j, n = A->rmap->n, nz;
7608758e1faSBarry Smith   PetscInt           node_max, *ns, row, nsz, aii, i0, i1;
7618758e1faSBarry Smith   const PetscInt    *ai = a->i, *a_j = a->j, *vi, *ad, *aj;
762d9fead3dSBarry Smith   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
763d9fead3dSBarry Smith   PetscScalar        sum1, sum2, sum3, sum4, sum5;
764dd6ea824SBarry Smith   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
765dd6ea824SBarry Smith   const PetscScalar *b;
7664c1414c8SBarry Smith 
7674c1414c8SBarry Smith   PetscFunctionBegin;
76808401ef6SPierre Jolivet   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
7694c1414c8SBarry Smith   node_max = a->inode.node_count;
7704c1414c8SBarry Smith   ns       = a->inode.size; /* Node Size array */
7714c1414c8SBarry Smith 
7729566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
7739566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(xx, &x));
  /* work array: receives the forward-sweep result and is overwritten in place by the backward sweep */
7744c1414c8SBarry Smith   tmp = a->solve_work;
7754c1414c8SBarry Smith 
  /* r walks the row indices forwards from the start, c walks the column indices backwards from the last entry */
7769371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
7779371c9d4SSatish Balay   r = rout;
7789371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
7799371c9d4SSatish Balay   c = cout + (n - 1);
7804c1414c8SBarry Smith 
7814c1414c8SBarry Smith   /* forward solve the lower triangular */
  /* tmps aliases tmp: reads of already computed values go through tmps, stores go through tmp */
7824c1414c8SBarry Smith   tmps = tmp;
7834c1414c8SBarry Smith   aa   = a_a;
7844c1414c8SBarry Smith   aj   = a_j;
7854c1414c8SBarry Smith   ad   = a->diag;
7864c1414c8SBarry Smith 
7874c1414c8SBarry Smith   for (i = 0, row = 0; i < node_max; ++i) {
7884c1414c8SBarry Smith     nsz = ns[i];
    /* v1 / vi point at the first stored value / column index of the node's first row;
       nz counts the entries of that row that come before its diagonal position ad[row] */
7894c1414c8SBarry Smith     aii = ai[row];
7904c1414c8SBarry Smith     v1  = aa + aii;
7914c1414c8SBarry Smith     vi  = aj + aii;
7924c1414c8SBarry Smith     nz  = ad[row] - aii;
    /* the guard keeps ns[i + 1] and the row offsets of the following node from being read past the last node */
79326549573SJed Brown     if (i < node_max - 1) {
79426549573SJed Brown       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
79591c35059SPierre Jolivet       * but our indexing to determine its size could. */
79650d8bf02SJed Brown       PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
79726549573SJed Brown       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
79850d8bf02SJed Brown       PetscPrefetchBlock(aa + ai[row + nsz], ad[row + nsz + ns[i + 1] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
79926549573SJed Brown       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
80026549573SJed Brown     }
8014c1414c8SBarry Smith 
8024c1414c8SBarry Smith     switch (nsz) { /* Each loop in 'case' is unrolled */
8034c1414c8SBarry Smith     case 1:
      /* start from the right-hand side entry selected by the row index set */
8044c1414c8SBarry Smith       sum1 = b[*r++];
      /* subtract the contributions of the already computed values, two entries per pass */
8054c1414c8SBarry Smith       for (j = 0; j < nz - 1; j += 2) {
8064c1414c8SBarry Smith         i0 = vi[0];
8074c1414c8SBarry Smith         i1 = vi[1];
8084c1414c8SBarry Smith         vi += 2;
8094c1414c8SBarry Smith         tmp0 = tmps[i0];
8104c1414c8SBarry Smith         tmp1 = tmps[i1];
8119371c9d4SSatish Balay         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
8129371c9d4SSatish Balay         v1 += 2;
8134c1414c8SBarry Smith       }
      /* odd leftover entry */
8144c1414c8SBarry Smith       if (j == nz - 1) {
8154c1414c8SBarry Smith         tmp0 = tmps[*vi++];
8164c1414c8SBarry Smith         sum1 -= *v1++ * tmp0;
8174c1414c8SBarry Smith       }
8184c1414c8SBarry Smith       tmp[row++] = sum1;
8194c1414c8SBarry Smith       break;
8204c1414c8SBarry Smith     case 2:
8214c1414c8SBarry Smith       sum1 = b[*r++];
8224c1414c8SBarry Smith       sum2 = b[*r++];
      /* values of the second row of the node; its column indices are not read, vi of the first row is reused */
8234c1414c8SBarry Smith       v2   = aa + ai[row + 1];
8244c1414c8SBarry Smith 
8254c1414c8SBarry Smith       for (j = 0; j < nz - 1; j += 2) {
8264c1414c8SBarry Smith         i0 = vi[0];
8274c1414c8SBarry Smith         i1 = vi[1];
8284c1414c8SBarry Smith         vi += 2;
8294c1414c8SBarry Smith         tmp0 = tmps[i0];
8304c1414c8SBarry Smith         tmp1 = tmps[i1];
8319371c9d4SSatish Balay         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
8329371c9d4SSatish Balay         v1 += 2;
8339371c9d4SSatish Balay         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
8349371c9d4SSatish Balay         v2 += 2;
8354c1414c8SBarry Smith       }
8364c1414c8SBarry Smith       if (j == nz - 1) {
8374c1414c8SBarry Smith         tmp0 = tmps[*vi++];
8384c1414c8SBarry Smith         sum1 -= *v1++ * tmp0;
8394c1414c8SBarry Smith         sum2 -= *v2++ * tmp0;
8404c1414c8SBarry Smith       }
      /* v2 has advanced past the nz shared entries: the next value of the second row is applied to the first row's result */
8414c1414c8SBarry Smith       sum2 -= *v2++ * sum1;
8424c1414c8SBarry Smith       tmp[row++] = sum1;
8434c1414c8SBarry Smith       tmp[row++] = sum2;
8444c1414c8SBarry Smith       break;
    /* cases 3, 4 and 5 repeat the pattern of case 2; after the shared part every row subtracts the
       results of the rows above it in the node, in row order */
8454c1414c8SBarry Smith     case 3:
8464c1414c8SBarry Smith       sum1 = b[*r++];
8474c1414c8SBarry Smith       sum2 = b[*r++];
8484c1414c8SBarry Smith       sum3 = b[*r++];
8494c1414c8SBarry Smith       v2   = aa + ai[row + 1];
8504c1414c8SBarry Smith       v3   = aa + ai[row + 2];
8514c1414c8SBarry Smith 
8524c1414c8SBarry Smith       for (j = 0; j < nz - 1; j += 2) {
8534c1414c8SBarry Smith         i0 = vi[0];
8544c1414c8SBarry Smith         i1 = vi[1];
8554c1414c8SBarry Smith         vi += 2;
8564c1414c8SBarry Smith         tmp0 = tmps[i0];
8574c1414c8SBarry Smith         tmp1 = tmps[i1];
8589371c9d4SSatish Balay         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
8599371c9d4SSatish Balay         v1 += 2;
8609371c9d4SSatish Balay         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
8619371c9d4SSatish Balay         v2 += 2;
8629371c9d4SSatish Balay         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
8639371c9d4SSatish Balay         v3 += 2;
8644c1414c8SBarry Smith       }
8654c1414c8SBarry Smith       if (j == nz - 1) {
8664c1414c8SBarry Smith         tmp0 = tmps[*vi++];
8674c1414c8SBarry Smith         sum1 -= *v1++ * tmp0;
8684c1414c8SBarry Smith         sum2 -= *v2++ * tmp0;
8694c1414c8SBarry Smith         sum3 -= *v3++ * tmp0;
8704c1414c8SBarry Smith       }
8714c1414c8SBarry Smith       sum2 -= *v2++ * sum1;
8724c1414c8SBarry Smith       sum3 -= *v3++ * sum1;
8734c1414c8SBarry Smith       sum3 -= *v3++ * sum2;
8742205254eSKarl Rupp 
8754c1414c8SBarry Smith       tmp[row++] = sum1;
8764c1414c8SBarry Smith       tmp[row++] = sum2;
8774c1414c8SBarry Smith       tmp[row++] = sum3;
8784c1414c8SBarry Smith       break;
8794c1414c8SBarry Smith 
8804c1414c8SBarry Smith     case 4:
8814c1414c8SBarry Smith       sum1 = b[*r++];
8824c1414c8SBarry Smith       sum2 = b[*r++];
8834c1414c8SBarry Smith       sum3 = b[*r++];
8844c1414c8SBarry Smith       sum4 = b[*r++];
8854c1414c8SBarry Smith       v2   = aa + ai[row + 1];
8864c1414c8SBarry Smith       v3   = aa + ai[row + 2];
8874c1414c8SBarry Smith       v4   = aa + ai[row + 3];
8884c1414c8SBarry Smith 
8894c1414c8SBarry Smith       for (j = 0; j < nz - 1; j += 2) {
8904c1414c8SBarry Smith         i0 = vi[0];
8914c1414c8SBarry Smith         i1 = vi[1];
8924c1414c8SBarry Smith         vi += 2;
8934c1414c8SBarry Smith         tmp0 = tmps[i0];
8944c1414c8SBarry Smith         tmp1 = tmps[i1];
8959371c9d4SSatish Balay         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
8969371c9d4SSatish Balay         v1 += 2;
8979371c9d4SSatish Balay         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
8989371c9d4SSatish Balay         v2 += 2;
8999371c9d4SSatish Balay         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
9009371c9d4SSatish Balay         v3 += 2;
9019371c9d4SSatish Balay         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
9029371c9d4SSatish Balay         v4 += 2;
9034c1414c8SBarry Smith       }
9044c1414c8SBarry Smith       if (j == nz - 1) {
9054c1414c8SBarry Smith         tmp0 = tmps[*vi++];
9064c1414c8SBarry Smith         sum1 -= *v1++ * tmp0;
9074c1414c8SBarry Smith         sum2 -= *v2++ * tmp0;
9084c1414c8SBarry Smith         sum3 -= *v3++ * tmp0;
9094c1414c8SBarry Smith         sum4 -= *v4++ * tmp0;
9104c1414c8SBarry Smith       }
9114c1414c8SBarry Smith       sum2 -= *v2++ * sum1;
9124c1414c8SBarry Smith       sum3 -= *v3++ * sum1;
9134c1414c8SBarry Smith       sum4 -= *v4++ * sum1;
9144c1414c8SBarry Smith       sum3 -= *v3++ * sum2;
9154c1414c8SBarry Smith       sum4 -= *v4++ * sum2;
9164c1414c8SBarry Smith       sum4 -= *v4++ * sum3;
9174c1414c8SBarry Smith 
9184c1414c8SBarry Smith       tmp[row++] = sum1;
9194c1414c8SBarry Smith       tmp[row++] = sum2;
9204c1414c8SBarry Smith       tmp[row++] = sum3;
9214c1414c8SBarry Smith       tmp[row++] = sum4;
9224c1414c8SBarry Smith       break;
9234c1414c8SBarry Smith     case 5:
9244c1414c8SBarry Smith       sum1 = b[*r++];
9254c1414c8SBarry Smith       sum2 = b[*r++];
9264c1414c8SBarry Smith       sum3 = b[*r++];
9274c1414c8SBarry Smith       sum4 = b[*r++];
9284c1414c8SBarry Smith       sum5 = b[*r++];
9294c1414c8SBarry Smith       v2   = aa + ai[row + 1];
9304c1414c8SBarry Smith       v3   = aa + ai[row + 2];
9314c1414c8SBarry Smith       v4   = aa + ai[row + 3];
9324c1414c8SBarry Smith       v5   = aa + ai[row + 4];
9334c1414c8SBarry Smith 
9344c1414c8SBarry Smith       for (j = 0; j < nz - 1; j += 2) {
9354c1414c8SBarry Smith         i0 = vi[0];
9364c1414c8SBarry Smith         i1 = vi[1];
9374c1414c8SBarry Smith         vi += 2;
9384c1414c8SBarry Smith         tmp0 = tmps[i0];
9394c1414c8SBarry Smith         tmp1 = tmps[i1];
9409371c9d4SSatish Balay         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
9419371c9d4SSatish Balay         v1 += 2;
9429371c9d4SSatish Balay         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
9439371c9d4SSatish Balay         v2 += 2;
9449371c9d4SSatish Balay         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
9459371c9d4SSatish Balay         v3 += 2;
9469371c9d4SSatish Balay         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
9479371c9d4SSatish Balay         v4 += 2;
9489371c9d4SSatish Balay         sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
9499371c9d4SSatish Balay         v5 += 2;
9504c1414c8SBarry Smith       }
9514c1414c8SBarry Smith       if (j == nz - 1) {
9524c1414c8SBarry Smith         tmp0 = tmps[*vi++];
9534c1414c8SBarry Smith         sum1 -= *v1++ * tmp0;
9544c1414c8SBarry Smith         sum2 -= *v2++ * tmp0;
9554c1414c8SBarry Smith         sum3 -= *v3++ * tmp0;
9564c1414c8SBarry Smith         sum4 -= *v4++ * tmp0;
9574c1414c8SBarry Smith         sum5 -= *v5++ * tmp0;
9584c1414c8SBarry Smith       }
9594c1414c8SBarry Smith 
9604c1414c8SBarry Smith       sum2 -= *v2++ * sum1;
9614c1414c8SBarry Smith       sum3 -= *v3++ * sum1;
9624c1414c8SBarry Smith       sum4 -= *v4++ * sum1;
9634c1414c8SBarry Smith       sum5 -= *v5++ * sum1;
9644c1414c8SBarry Smith       sum3 -= *v3++ * sum2;
9654c1414c8SBarry Smith       sum4 -= *v4++ * sum2;
9664c1414c8SBarry Smith       sum5 -= *v5++ * sum2;
9674c1414c8SBarry Smith       sum4 -= *v4++ * sum3;
9684c1414c8SBarry Smith       sum5 -= *v5++ * sum3;
9694c1414c8SBarry Smith       sum5 -= *v5++ * sum4;
9704c1414c8SBarry Smith 
9714c1414c8SBarry Smith       tmp[row++] = sum1;
9724c1414c8SBarry Smith       tmp[row++] = sum2;
9734c1414c8SBarry Smith       tmp[row++] = sum3;
9744c1414c8SBarry Smith       tmp[row++] = sum4;
9754c1414c8SBarry Smith       tmp[row++] = sum5;
9764c1414c8SBarry Smith       break;
977d71ae5a4SJacob Faibussowitsch     default:
978d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
9794c1414c8SBarry Smith     }
9804c1414c8SBarry Smith   }
9814c1414c8SBarry Smith   /* backward solve the upper triangular */
  /* nodes are visited last to first; row starts at the last matrix row and is decremented as each unknown is finished */
9824c1414c8SBarry Smith   for (i = node_max - 1, row = n - 1; i >= 0; i--) {
9834c1414c8SBarry Smith     nsz = ns[i];
    /* aii is the position of the last stored entry of `row` (the last row of the node);
       nz counts the entries of that row stored after its diagonal position */
9844c1414c8SBarry Smith     aii = ai[row + 1] - 1;
9854c1414c8SBarry Smith     v1  = aa + aii;
9864c1414c8SBarry Smith     vi  = aj + aii;
9874c1414c8SBarry Smith     nz  = aii - ad[row];
9884c1414c8SBarry Smith     switch (nsz) { /* Each loop in 'case' is unrolled */
9894c1414c8SBarry Smith     case 1:
9904c1414c8SBarry Smith       sum1 = tmp[row];
9914c1414c8SBarry Smith 
      /* entries are consumed from the end of the row towards the diagonal, two per pass:
         after the decrement, vi[2] / v1[2] is the later entry and vi[1] / v1[1] the one before it */
9924c1414c8SBarry Smith       for (j = nz; j > 1; j -= 2) {
9934c1414c8SBarry Smith         vi -= 2;
9944c1414c8SBarry Smith         i0   = vi[2];
9954c1414c8SBarry Smith         i1   = vi[1];
9964c1414c8SBarry Smith         tmp0 = tmps[i0];
9974c1414c8SBarry Smith         tmp1 = tmps[i1];
9984c1414c8SBarry Smith         v1 -= 2;
9994c1414c8SBarry Smith         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
10004c1414c8SBarry Smith       }
      /* odd leftover entry */
10014c1414c8SBarry Smith       if (j == 1) {
10024c1414c8SBarry Smith         tmp0 = tmps[*vi--];
10034c1414c8SBarry Smith         sum1 -= *v1-- * tmp0;
10044c1414c8SBarry Smith       }
      /* scale by the stored diagonal entry, keep the value in tmp for the rows still to come,
         and store it into x at the position given by the column index set */
10059371c9d4SSatish Balay       x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
10069371c9d4SSatish Balay       row--;
10074c1414c8SBarry Smith       break;
10084c1414c8SBarry Smith     case 2:
10094c1414c8SBarry Smith       sum1 = tmp[row];
10104c1414c8SBarry Smith       sum2 = tmp[row - 1];
      /* ai[row] - 1 is the last stored entry of row - 1, the other row of this node */
10114c1414c8SBarry Smith       v2   = aa + ai[row] - 1;
10124c1414c8SBarry Smith       for (j = nz; j > 1; j -= 2) {
10134c1414c8SBarry Smith         vi -= 2;
10144c1414c8SBarry Smith         i0   = vi[2];
10154c1414c8SBarry Smith         i1   = vi[1];
10164c1414c8SBarry Smith         tmp0 = tmps[i0];
10174c1414c8SBarry Smith         tmp1 = tmps[i1];
10184c1414c8SBarry Smith         v1 -= 2;
10194c1414c8SBarry Smith         v2 -= 2;
10204c1414c8SBarry Smith         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
10214c1414c8SBarry Smith         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
10224c1414c8SBarry Smith       }
10234c1414c8SBarry Smith       if (j == 1) {
10244c1414c8SBarry Smith         tmp0 = tmps[*vi--];
10254c1414c8SBarry Smith         sum1 -= *v1-- * tmp0;
10264c1414c8SBarry Smith         sum2 -= *v2-- * tmp0;
10274c1414c8SBarry Smith       }
10284c1414c8SBarry Smith 
      /* finish the last row of the node first; its value (tmp0) is then eliminated from the row above
         using the next value of that row (v2 has stepped back past the nz shared entries) */
10299371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
10309371c9d4SSatish Balay       row--;
10314c1414c8SBarry Smith       sum2 -= *v2-- * tmp0;
10329371c9d4SSatish Balay       x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
10339371c9d4SSatish Balay       row--;
10344c1414c8SBarry Smith       break;
    /* cases 3, 4 and 5 repeat the pattern of case 2: each finished unknown is immediately
       eliminated from all rows above it in the node before the next row is scaled */
10354c1414c8SBarry Smith     case 3:
10364c1414c8SBarry Smith       sum1 = tmp[row];
10374c1414c8SBarry Smith       sum2 = tmp[row - 1];
10384c1414c8SBarry Smith       sum3 = tmp[row - 2];
10394c1414c8SBarry Smith       v2   = aa + ai[row] - 1;
10404c1414c8SBarry Smith       v3   = aa + ai[row - 1] - 1;
10414c1414c8SBarry Smith       for (j = nz; j > 1; j -= 2) {
10424c1414c8SBarry Smith         vi -= 2;
10434c1414c8SBarry Smith         i0   = vi[2];
10444c1414c8SBarry Smith         i1   = vi[1];
10454c1414c8SBarry Smith         tmp0 = tmps[i0];
10464c1414c8SBarry Smith         tmp1 = tmps[i1];
10474c1414c8SBarry Smith         v1 -= 2;
10484c1414c8SBarry Smith         v2 -= 2;
10494c1414c8SBarry Smith         v3 -= 2;
10504c1414c8SBarry Smith         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
10514c1414c8SBarry Smith         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
10524c1414c8SBarry Smith         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
10534c1414c8SBarry Smith       }
10544c1414c8SBarry Smith       if (j == 1) {
10554c1414c8SBarry Smith         tmp0 = tmps[*vi--];
10564c1414c8SBarry Smith         sum1 -= *v1-- * tmp0;
10574c1414c8SBarry Smith         sum2 -= *v2-- * tmp0;
10584c1414c8SBarry Smith         sum3 -= *v3-- * tmp0;
10594c1414c8SBarry Smith       }
10609371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
10619371c9d4SSatish Balay       row--;
10624c1414c8SBarry Smith       sum2 -= *v2-- * tmp0;
10634c1414c8SBarry Smith       sum3 -= *v3-- * tmp0;
10649371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
10659371c9d4SSatish Balay       row--;
10664c1414c8SBarry Smith       sum3 -= *v3-- * tmp0;
10679371c9d4SSatish Balay       x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
10689371c9d4SSatish Balay       row--;
10694c1414c8SBarry Smith 
10704c1414c8SBarry Smith       break;
10714c1414c8SBarry Smith     case 4:
10724c1414c8SBarry Smith       sum1 = tmp[row];
10734c1414c8SBarry Smith       sum2 = tmp[row - 1];
10744c1414c8SBarry Smith       sum3 = tmp[row - 2];
10754c1414c8SBarry Smith       sum4 = tmp[row - 3];
10764c1414c8SBarry Smith       v2   = aa + ai[row] - 1;
10774c1414c8SBarry Smith       v3   = aa + ai[row - 1] - 1;
10784c1414c8SBarry Smith       v4   = aa + ai[row - 2] - 1;
10794c1414c8SBarry Smith 
10804c1414c8SBarry Smith       for (j = nz; j > 1; j -= 2) {
10814c1414c8SBarry Smith         vi -= 2;
10824c1414c8SBarry Smith         i0   = vi[2];
10834c1414c8SBarry Smith         i1   = vi[1];
10844c1414c8SBarry Smith         tmp0 = tmps[i0];
10854c1414c8SBarry Smith         tmp1 = tmps[i1];
10864c1414c8SBarry Smith         v1 -= 2;
10874c1414c8SBarry Smith         v2 -= 2;
10884c1414c8SBarry Smith         v3 -= 2;
10894c1414c8SBarry Smith         v4 -= 2;
10904c1414c8SBarry Smith         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
10914c1414c8SBarry Smith         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
10924c1414c8SBarry Smith         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
10934c1414c8SBarry Smith         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
10944c1414c8SBarry Smith       }
10954c1414c8SBarry Smith       if (j == 1) {
10964c1414c8SBarry Smith         tmp0 = tmps[*vi--];
10974c1414c8SBarry Smith         sum1 -= *v1-- * tmp0;
10984c1414c8SBarry Smith         sum2 -= *v2-- * tmp0;
10994c1414c8SBarry Smith         sum3 -= *v3-- * tmp0;
11004c1414c8SBarry Smith         sum4 -= *v4-- * tmp0;
11014c1414c8SBarry Smith       }
11024c1414c8SBarry Smith 
11039371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
11049371c9d4SSatish Balay       row--;
11054c1414c8SBarry Smith       sum2 -= *v2-- * tmp0;
11064c1414c8SBarry Smith       sum3 -= *v3-- * tmp0;
11074c1414c8SBarry Smith       sum4 -= *v4-- * tmp0;
11089371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
11099371c9d4SSatish Balay       row--;
11104c1414c8SBarry Smith       sum3 -= *v3-- * tmp0;
11114c1414c8SBarry Smith       sum4 -= *v4-- * tmp0;
11129371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
11139371c9d4SSatish Balay       row--;
11144c1414c8SBarry Smith       sum4 -= *v4-- * tmp0;
11159371c9d4SSatish Balay       x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
11169371c9d4SSatish Balay       row--;
11174c1414c8SBarry Smith       break;
11184c1414c8SBarry Smith     case 5:
11194c1414c8SBarry Smith       sum1 = tmp[row];
11204c1414c8SBarry Smith       sum2 = tmp[row - 1];
11214c1414c8SBarry Smith       sum3 = tmp[row - 2];
11224c1414c8SBarry Smith       sum4 = tmp[row - 3];
11234c1414c8SBarry Smith       sum5 = tmp[row - 4];
11244c1414c8SBarry Smith       v2   = aa + ai[row] - 1;
11254c1414c8SBarry Smith       v3   = aa + ai[row - 1] - 1;
11264c1414c8SBarry Smith       v4   = aa + ai[row - 2] - 1;
11274c1414c8SBarry Smith       v5   = aa + ai[row - 3] - 1;
11284c1414c8SBarry Smith       for (j = nz; j > 1; j -= 2) {
11294c1414c8SBarry Smith         vi -= 2;
11304c1414c8SBarry Smith         i0   = vi[2];
11314c1414c8SBarry Smith         i1   = vi[1];
11324c1414c8SBarry Smith         tmp0 = tmps[i0];
11334c1414c8SBarry Smith         tmp1 = tmps[i1];
11344c1414c8SBarry Smith         v1 -= 2;
11354c1414c8SBarry Smith         v2 -= 2;
11364c1414c8SBarry Smith         v3 -= 2;
11374c1414c8SBarry Smith         v4 -= 2;
11384c1414c8SBarry Smith         v5 -= 2;
11394c1414c8SBarry Smith         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
11404c1414c8SBarry Smith         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
11414c1414c8SBarry Smith         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
11424c1414c8SBarry Smith         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
11434c1414c8SBarry Smith         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
11444c1414c8SBarry Smith       }
11454c1414c8SBarry Smith       if (j == 1) {
11464c1414c8SBarry Smith         tmp0 = tmps[*vi--];
11474c1414c8SBarry Smith         sum1 -= *v1-- * tmp0;
11484c1414c8SBarry Smith         sum2 -= *v2-- * tmp0;
11494c1414c8SBarry Smith         sum3 -= *v3-- * tmp0;
11504c1414c8SBarry Smith         sum4 -= *v4-- * tmp0;
11514c1414c8SBarry Smith         sum5 -= *v5-- * tmp0;
11524c1414c8SBarry Smith       }
11534c1414c8SBarry Smith 
11549371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
11559371c9d4SSatish Balay       row--;
11564c1414c8SBarry Smith       sum2 -= *v2-- * tmp0;
11574c1414c8SBarry Smith       sum3 -= *v3-- * tmp0;
11584c1414c8SBarry Smith       sum4 -= *v4-- * tmp0;
11594c1414c8SBarry Smith       sum5 -= *v5-- * tmp0;
11609371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
11619371c9d4SSatish Balay       row--;
11624c1414c8SBarry Smith       sum3 -= *v3-- * tmp0;
11634c1414c8SBarry Smith       sum4 -= *v4-- * tmp0;
11644c1414c8SBarry Smith       sum5 -= *v5-- * tmp0;
11659371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
11669371c9d4SSatish Balay       row--;
11674c1414c8SBarry Smith       sum4 -= *v4-- * tmp0;
11684c1414c8SBarry Smith       sum5 -= *v5-- * tmp0;
11699371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
11709371c9d4SSatish Balay       row--;
11714c1414c8SBarry Smith       sum5 -= *v5-- * tmp0;
11729371c9d4SSatish Balay       x[*c--] = tmp[row] = sum5 * a_a[ad[row]];
11739371c9d4SSatish Balay       row--;
11744c1414c8SBarry Smith       break;
1175d71ae5a4SJacob Faibussowitsch     default:
1176d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
11774c1414c8SBarry Smith     }
11784c1414c8SBarry Smith   }
  /* hand back the index and vector arrays in the same pairing they were obtained */
11799566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
11809566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
11819566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
11829566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(xx, &x));
11839566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
11843ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
11854c1414c8SBarry Smith }
11864c1414c8SBarry Smith 
1187d71ae5a4SJacob Faibussowitsch PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info)
1188d71ae5a4SJacob Faibussowitsch {
118928f1b45aSHong Zhang   Mat              C = B;
119028f1b45aSHong Zhang   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
119128f1b45aSHong Zhang   IS               isrow = b->row, isicol = b->icol;
119228f1b45aSHong Zhang   const PetscInt  *r, *ic, *ics;
119328f1b45aSHong Zhang   const PetscInt   n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
119428f1b45aSHong Zhang   PetscInt         i, j, k, nz, nzL, row, *pj;
119528f1b45aSHong Zhang   const PetscInt  *ajtmp, *bjtmp;
11969877982aSShri Abhyankar   MatScalar       *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4;
11979877982aSShri Abhyankar   const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4;
119828f1b45aSHong Zhang   FactorShiftCtx   sctx;
11994f81c4b7SBarry Smith   const PetscInt  *ddiag;
120028f1b45aSHong Zhang   PetscReal        rs;
120128f1b45aSHong Zhang   MatScalar        d;
12024f81c4b7SBarry Smith   PetscInt         inod, nodesz, node_max, col;
12034f81c4b7SBarry Smith   const PetscInt  *ns;
120407b50cabSHong Zhang   PetscInt        *tmp_vec1, *tmp_vec2, *nsmap;
12050e95ead3SHong Zhang 
120628f1b45aSHong Zhang   PetscFunctionBegin;
120728f1b45aSHong Zhang   /* MatPivotSetUp(): initialize shift context sctx */
12089566063dSJacob Faibussowitsch   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
120928f1b45aSHong Zhang 
1210f4db908eSBarry Smith   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
121128f1b45aSHong Zhang     ddiag          = a->diag;
121228f1b45aSHong Zhang     sctx.shift_top = info->zeropivot;
121328f1b45aSHong Zhang     for (i = 0; i < n; i++) {
121428f1b45aSHong Zhang       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
121528f1b45aSHong Zhang       d  = (aa)[ddiag[i]];
121628f1b45aSHong Zhang       rs = -PetscAbsScalar(d) - PetscRealPart(d);
121728f1b45aSHong Zhang       v  = aa + ai[i];
121828f1b45aSHong Zhang       nz = ai[i + 1] - ai[i];
12192205254eSKarl Rupp       for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
122028f1b45aSHong Zhang       if (rs > sctx.shift_top) sctx.shift_top = rs;
122128f1b45aSHong Zhang     }
122228f1b45aSHong Zhang     sctx.shift_top *= 1.1;
122328f1b45aSHong Zhang     sctx.nshift_max = 5;
122428f1b45aSHong Zhang     sctx.shift_lo   = 0.;
122528f1b45aSHong Zhang     sctx.shift_hi   = 1.;
122628f1b45aSHong Zhang   }
122728f1b45aSHong Zhang 
12289566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow, &r));
12299566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isicol, &ic));
123068785679SHong Zhang 
12319566063dSJacob Faibussowitsch   PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4));
123228f1b45aSHong Zhang   ics = ic;
123328f1b45aSHong Zhang 
123428f1b45aSHong Zhang   node_max = a->inode.node_count;
123528f1b45aSHong Zhang   ns       = a->inode.size;
123628b400f6SJacob Faibussowitsch   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
123728f1b45aSHong Zhang 
12389877982aSShri Abhyankar   /* If max inode size > 4, split it into two inodes.*/
123968785679SHong Zhang   /* also map the inode sizes according to the ordering */
12409566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
124168785679SHong Zhang   for (i = 0, j = 0; i < node_max; ++i, ++j) {
1242b1550197SShri Abhyankar     if (ns[i] > 4) {
1243048b5e81SShri Abhyankar       tmp_vec1[j] = 4;
124468785679SHong Zhang       ++j;
124568785679SHong Zhang       tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
124668785679SHong Zhang     } else {
124768785679SHong Zhang       tmp_vec1[j] = ns[i];
124868785679SHong Zhang     }
124968785679SHong Zhang   }
125068785679SHong Zhang   /* Use the correct node_max */
125168785679SHong Zhang   node_max = j;
125268785679SHong Zhang 
125368785679SHong Zhang   /* Now reorder the inode info based on mat re-ordering info */
125468785679SHong Zhang   /* First create a row -> inode_size_array_index map */
12559566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(n + 1, &nsmap));
12569566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2));
125768785679SHong Zhang   for (i = 0, row = 0; i < node_max; i++) {
125868785679SHong Zhang     nodesz = tmp_vec1[i];
1259ad540459SPierre Jolivet     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
126068785679SHong Zhang   }
126168785679SHong Zhang   /* Using nsmap, create a reordered ns structure */
126268785679SHong Zhang   for (i = 0, j = 0; i < node_max; i++) {
126368785679SHong Zhang     nodesz      = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
126468785679SHong Zhang     tmp_vec2[i] = nodesz;
126568785679SHong Zhang     j += nodesz;
126668785679SHong Zhang   }
12679566063dSJacob Faibussowitsch   PetscCall(PetscFree(nsmap));
12689566063dSJacob Faibussowitsch   PetscCall(PetscFree(tmp_vec1));
1269b89f182dSHong Zhang 
127068785679SHong Zhang   /* Now use the correct ns */
127168785679SHong Zhang   ns = tmp_vec2;
127268785679SHong Zhang 
127328f1b45aSHong Zhang   do {
127407b50cabSHong Zhang     sctx.newshift = PETSC_FALSE;
127528f1b45aSHong Zhang     /* Now loop over each block-row, and do the factorization */
127628f1b45aSHong Zhang     for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */
127728f1b45aSHong Zhang       nodesz = ns[inod];
127828f1b45aSHong Zhang 
127928f1b45aSHong Zhang       switch (nodesz) {
128028f1b45aSHong Zhang       case 1:
1281b89f182dSHong Zhang         /* zero rtmp1 */
128228f1b45aSHong Zhang         /* L part */
128328f1b45aSHong Zhang         nz    = bi[i + 1] - bi[i];
128428f1b45aSHong Zhang         bjtmp = bj + bi[i];
1285b89f182dSHong Zhang         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
128628f1b45aSHong Zhang 
128728f1b45aSHong Zhang         /* U part */
128828f1b45aSHong Zhang         nz    = bdiag[i] - bdiag[i + 1];
128928f1b45aSHong Zhang         bjtmp = bj + bdiag[i + 1] + 1;
1290b89f182dSHong Zhang         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
129128f1b45aSHong Zhang 
129228f1b45aSHong Zhang         /* load in initial (unfactored row) */
129328f1b45aSHong Zhang         nz    = ai[r[i] + 1] - ai[r[i]];
129428f1b45aSHong Zhang         ajtmp = aj + ai[r[i]];
129528f1b45aSHong Zhang         v     = aa + ai[r[i]];
12962205254eSKarl Rupp         for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
12972205254eSKarl Rupp 
129828f1b45aSHong Zhang         /* ZeropivotApply() */
1299b89f182dSHong Zhang         rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
130028f1b45aSHong Zhang 
130128f1b45aSHong Zhang         /* elimination */
130228f1b45aSHong Zhang         bjtmp = bj + bi[i];
130328f1b45aSHong Zhang         row   = *bjtmp++;
130428f1b45aSHong Zhang         nzL   = bi[i + 1] - bi[i];
130528f1b45aSHong Zhang         for (k = 0; k < nzL; k++) {
1306b89f182dSHong Zhang           pc = rtmp1 + row;
130728f1b45aSHong Zhang           if (*pc != 0.0) {
130828f1b45aSHong Zhang             pv   = b->a + bdiag[row];
1309b89f182dSHong Zhang             mul1 = *pc * (*pv);
1310b89f182dSHong Zhang             *pc  = mul1;
131128f1b45aSHong Zhang             pj   = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
131228f1b45aSHong Zhang             pv   = b->a + bdiag[row + 1] + 1;
131328f1b45aSHong Zhang             nz   = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1314b89f182dSHong Zhang             for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
13159566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(1 + 2.0 * nz));
131628f1b45aSHong Zhang           }
131728f1b45aSHong Zhang           row = *bjtmp++;
131828f1b45aSHong Zhang         }
131928f1b45aSHong Zhang 
132028f1b45aSHong Zhang         /* finished row so stick it into b->a */
132128f1b45aSHong Zhang         rs = 0.0;
132228f1b45aSHong Zhang         /* L part */
132328f1b45aSHong Zhang         pv = b->a + bi[i];
132428f1b45aSHong Zhang         pj = b->j + bi[i];
132528f1b45aSHong Zhang         nz = bi[i + 1] - bi[i];
132628f1b45aSHong Zhang         for (j = 0; j < nz; j++) {
13279371c9d4SSatish Balay           pv[j] = rtmp1[pj[j]];
13289371c9d4SSatish Balay           rs += PetscAbsScalar(pv[j]);
132928f1b45aSHong Zhang         }
133028f1b45aSHong Zhang 
133128f1b45aSHong Zhang         /* U part */
133228f1b45aSHong Zhang         pv = b->a + bdiag[i + 1] + 1;
133328f1b45aSHong Zhang         pj = b->j + bdiag[i + 1] + 1;
133428f1b45aSHong Zhang         nz = bdiag[i] - bdiag[i + 1] - 1;
133528f1b45aSHong Zhang         for (j = 0; j < nz; j++) {
13369371c9d4SSatish Balay           pv[j] = rtmp1[pj[j]];
13379371c9d4SSatish Balay           rs += PetscAbsScalar(pv[j]);
133828f1b45aSHong Zhang         }
133928f1b45aSHong Zhang 
1340b89f182dSHong Zhang         /* Check zero pivot */
134128f1b45aSHong Zhang         sctx.rs = rs;
1342b89f182dSHong Zhang         sctx.pv = rtmp1[i];
13439566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
134407b50cabSHong Zhang         if (sctx.newshift) break;
134528f1b45aSHong Zhang 
1346a5b23f4aSJose E. Roman         /* Mark diagonal and invert diagonal for simpler triangular solves */
134728f1b45aSHong Zhang         pv  = b->a + bdiag[i];
1348b89f182dSHong Zhang         *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
134928f1b45aSHong Zhang         break;
135028f1b45aSHong Zhang 
135128f1b45aSHong Zhang       case 2:
1352b89f182dSHong Zhang         /* zero rtmp1 and rtmp2 */
135328f1b45aSHong Zhang         /* L part */
135428f1b45aSHong Zhang         nz    = bi[i + 1] - bi[i];
135528f1b45aSHong Zhang         bjtmp = bj + bi[i];
135628f1b45aSHong Zhang         for (j = 0; j < nz; j++) {
135768785679SHong Zhang           col        = bjtmp[j];
13589371c9d4SSatish Balay           rtmp1[col] = 0.0;
13599371c9d4SSatish Balay           rtmp2[col] = 0.0;
136028f1b45aSHong Zhang         }
136128f1b45aSHong Zhang 
136228f1b45aSHong Zhang         /* U part */
136328f1b45aSHong Zhang         nz    = bdiag[i] - bdiag[i + 1];
136428f1b45aSHong Zhang         bjtmp = bj + bdiag[i + 1] + 1;
136528f1b45aSHong Zhang         for (j = 0; j < nz; j++) {
136668785679SHong Zhang           col        = bjtmp[j];
13679371c9d4SSatish Balay           rtmp1[col] = 0.0;
13689371c9d4SSatish Balay           rtmp2[col] = 0.0;
136928f1b45aSHong Zhang         }
137028f1b45aSHong Zhang 
137128f1b45aSHong Zhang         /* load in initial (unfactored row) */
137228f1b45aSHong Zhang         nz    = ai[r[i] + 1] - ai[r[i]];
137328f1b45aSHong Zhang         ajtmp = aj + ai[r[i]];
13749371c9d4SSatish Balay         v1    = aa + ai[r[i]];
13759371c9d4SSatish Balay         v2    = aa + ai[r[i] + 1];
137628f1b45aSHong Zhang         for (j = 0; j < nz; j++) {
137768785679SHong Zhang           col        = ics[ajtmp[j]];
13789371c9d4SSatish Balay           rtmp1[col] = v1[j];
13799371c9d4SSatish Balay           rtmp2[col] = v2[j];
138028f1b45aSHong Zhang         }
138128f1b45aSHong Zhang         /* ZeropivotApply(): shift the diagonal of the matrix  */
13829371c9d4SSatish Balay         rtmp1[i] += sctx.shift_amount;
13839371c9d4SSatish Balay         rtmp2[i + 1] += sctx.shift_amount;
138428f1b45aSHong Zhang 
138528f1b45aSHong Zhang         /* elimination */
138628f1b45aSHong Zhang         bjtmp = bj + bi[i];
138728f1b45aSHong Zhang         row   = *bjtmp++; /* pivot row */
138828f1b45aSHong Zhang         nzL   = bi[i + 1] - bi[i];
138928f1b45aSHong Zhang         for (k = 0; k < nzL; k++) {
1390b89f182dSHong Zhang           pc1 = rtmp1 + row;
1391b89f182dSHong Zhang           pc2 = rtmp2 + row;
139228f1b45aSHong Zhang           if (*pc1 != 0.0 || *pc2 != 0.0) {
139328f1b45aSHong Zhang             pv   = b->a + bdiag[row];
13949371c9d4SSatish Balay             mul1 = *pc1 * (*pv);
13959371c9d4SSatish Balay             mul2 = *pc2 * (*pv);
13969371c9d4SSatish Balay             *pc1 = mul1;
13979371c9d4SSatish Balay             *pc2 = mul2;
139828f1b45aSHong Zhang 
139928f1b45aSHong Zhang             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
140028f1b45aSHong Zhang             pv = b->a + bdiag[row + 1] + 1;
140128f1b45aSHong Zhang             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
140228f1b45aSHong Zhang             for (j = 0; j < nz; j++) {
140368785679SHong Zhang               col = pj[j];
1404b89f182dSHong Zhang               rtmp1[col] -= mul1 * pv[j];
1405b89f182dSHong Zhang               rtmp2[col] -= mul2 * pv[j];
140628f1b45aSHong Zhang             }
14079566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(2 + 4.0 * nz));
140828f1b45aSHong Zhang           }
140928f1b45aSHong Zhang           row = *bjtmp++;
141028f1b45aSHong Zhang         }
141128f1b45aSHong Zhang 
1412b89f182dSHong Zhang         /* finished row i; check zero pivot, then stick row i into b->a */
141328f1b45aSHong Zhang         rs = 0.0;
141428f1b45aSHong Zhang         /* L part */
1415b89f182dSHong Zhang         pc1 = b->a + bi[i];
141628f1b45aSHong Zhang         pj  = b->j + bi[i];
141728f1b45aSHong Zhang         nz  = bi[i + 1] - bi[i];
141828f1b45aSHong Zhang         for (j = 0; j < nz; j++) {
141968785679SHong Zhang           col    = pj[j];
14209371c9d4SSatish Balay           pc1[j] = rtmp1[col];
14219371c9d4SSatish Balay           rs += PetscAbsScalar(pc1[j]);
142228f1b45aSHong Zhang         }
142328f1b45aSHong Zhang         /* U part */
1424b89f182dSHong Zhang         pc1 = b->a + bdiag[i + 1] + 1;
142528f1b45aSHong Zhang         pj  = b->j + bdiag[i + 1] + 1;
14260e7a5c2bSHong Zhang         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
142728f1b45aSHong Zhang         for (j = 0; j < nz; j++) {
142868785679SHong Zhang           col    = pj[j];
14299371c9d4SSatish Balay           pc1[j] = rtmp1[col];
14309371c9d4SSatish Balay           rs += PetscAbsScalar(pc1[j]);
143128f1b45aSHong Zhang         }
143228f1b45aSHong Zhang 
143328f1b45aSHong Zhang         sctx.rs = rs;
1434b89f182dSHong Zhang         sctx.pv = rtmp1[i];
14359566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
143607b50cabSHong Zhang         if (sctx.newshift) break;
1437b89f182dSHong Zhang         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1438b89f182dSHong Zhang         *pc1 = 1.0 / sctx.pv;
1439b89f182dSHong Zhang 
1440b89f182dSHong Zhang         /* Now take care of diagonal 2x2 block. */
1441b89f182dSHong Zhang         pc2 = rtmp2 + i;
1442b89f182dSHong Zhang         if (*pc2 != 0.0) {
1443b89f182dSHong Zhang           mul1 = (*pc2) * (*pc1);             /* *pc1=diag[i] is inverted! */
1444b89f182dSHong Zhang           *pc2 = mul1;                        /* insert L entry */
1445b89f182dSHong Zhang           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1446b89f182dSHong Zhang           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1447b89f182dSHong Zhang           for (j = 0; j < nz; j++) {
14489371c9d4SSatish Balay             col = pj[j];
14499371c9d4SSatish Balay             rtmp2[col] -= mul1 * rtmp1[col];
145028f1b45aSHong Zhang           }
14519566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1452b89f182dSHong Zhang         }
1453b89f182dSHong Zhang 
1454b89f182dSHong Zhang         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1455b89f182dSHong Zhang         rs = 0.0;
1456b89f182dSHong Zhang         /* L part */
1457b89f182dSHong Zhang         pc2 = b->a + bi[i + 1];
1458b89f182dSHong Zhang         pj  = b->j + bi[i + 1];
1459b89f182dSHong Zhang         nz  = bi[i + 2] - bi[i + 1];
1460b89f182dSHong Zhang         for (j = 0; j < nz; j++) {
1461b89f182dSHong Zhang           col    = pj[j];
14629371c9d4SSatish Balay           pc2[j] = rtmp2[col];
14639371c9d4SSatish Balay           rs += PetscAbsScalar(pc2[j]);
1464b89f182dSHong Zhang         }
1465b89f182dSHong Zhang         /* U part */
1466b89f182dSHong Zhang         pc2 = b->a + bdiag[i + 2] + 1;
14670e7a5c2bSHong Zhang         pj  = b->j + bdiag[i + 2] + 1;
14680e7a5c2bSHong Zhang         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1469b89f182dSHong Zhang         for (j = 0; j < nz; j++) {
1470b89f182dSHong Zhang           col    = pj[j];
14719371c9d4SSatish Balay           pc2[j] = rtmp2[col];
14729371c9d4SSatish Balay           rs += PetscAbsScalar(pc2[j]);
1473b89f182dSHong Zhang         }
1474b89f182dSHong Zhang 
147528f1b45aSHong Zhang         sctx.rs = rs;
1476b89f182dSHong Zhang         sctx.pv = rtmp2[i + 1];
14779566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
147807b50cabSHong Zhang         if (sctx.newshift) break;
147928f1b45aSHong Zhang         pc2  = b->a + bdiag[i + 1];
1480b89f182dSHong Zhang         *pc2 = 1.0 / sctx.pv;
148128f1b45aSHong Zhang         break;
1482b89f182dSHong Zhang 
148368785679SHong Zhang       case 3:
148468785679SHong Zhang         /* zero rtmp */
148568785679SHong Zhang         /* L part */
148668785679SHong Zhang         nz    = bi[i + 1] - bi[i];
148768785679SHong Zhang         bjtmp = bj + bi[i];
148868785679SHong Zhang         for (j = 0; j < nz; j++) {
148968785679SHong Zhang           col        = bjtmp[j];
14909371c9d4SSatish Balay           rtmp1[col] = 0.0;
14919371c9d4SSatish Balay           rtmp2[col] = 0.0;
14929371c9d4SSatish Balay           rtmp3[col] = 0.0;
149368785679SHong Zhang         }
149468785679SHong Zhang 
149568785679SHong Zhang         /* U part */
149668785679SHong Zhang         nz    = bdiag[i] - bdiag[i + 1];
149768785679SHong Zhang         bjtmp = bj + bdiag[i + 1] + 1;
149868785679SHong Zhang         for (j = 0; j < nz; j++) {
149968785679SHong Zhang           col        = bjtmp[j];
15009371c9d4SSatish Balay           rtmp1[col] = 0.0;
15019371c9d4SSatish Balay           rtmp2[col] = 0.0;
15029371c9d4SSatish Balay           rtmp3[col] = 0.0;
150368785679SHong Zhang         }
150468785679SHong Zhang 
150568785679SHong Zhang         /* load in initial (unfactored row) */
150668785679SHong Zhang         nz    = ai[r[i] + 1] - ai[r[i]];
150768785679SHong Zhang         ajtmp = aj + ai[r[i]];
15089371c9d4SSatish Balay         v1    = aa + ai[r[i]];
15099371c9d4SSatish Balay         v2    = aa + ai[r[i] + 1];
15109371c9d4SSatish Balay         v3    = aa + ai[r[i] + 2];
151168785679SHong Zhang         for (j = 0; j < nz; j++) {
151268785679SHong Zhang           col        = ics[ajtmp[j]];
15139371c9d4SSatish Balay           rtmp1[col] = v1[j];
15149371c9d4SSatish Balay           rtmp2[col] = v2[j];
15159371c9d4SSatish Balay           rtmp3[col] = v3[j];
151668785679SHong Zhang         }
151768785679SHong Zhang         /* ZeropivotApply(): shift the diagonal of the matrix  */
15189371c9d4SSatish Balay         rtmp1[i] += sctx.shift_amount;
15199371c9d4SSatish Balay         rtmp2[i + 1] += sctx.shift_amount;
15209371c9d4SSatish Balay         rtmp3[i + 2] += sctx.shift_amount;
152168785679SHong Zhang 
152268785679SHong Zhang         /* elimination */
152368785679SHong Zhang         bjtmp = bj + bi[i];
152468785679SHong Zhang         row   = *bjtmp++; /* pivot row */
152568785679SHong Zhang         nzL   = bi[i + 1] - bi[i];
152668785679SHong Zhang         for (k = 0; k < nzL; k++) {
1527b89f182dSHong Zhang           pc1 = rtmp1 + row;
1528b89f182dSHong Zhang           pc2 = rtmp2 + row;
1529b89f182dSHong Zhang           pc3 = rtmp3 + row;
153068785679SHong Zhang           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
153168785679SHong Zhang             pv   = b->a + bdiag[row];
15329371c9d4SSatish Balay             mul1 = *pc1 * (*pv);
15339371c9d4SSatish Balay             mul2 = *pc2 * (*pv);
15349371c9d4SSatish Balay             mul3 = *pc3 * (*pv);
15359371c9d4SSatish Balay             *pc1 = mul1;
15369371c9d4SSatish Balay             *pc2 = mul2;
15379371c9d4SSatish Balay             *pc3 = mul3;
153868785679SHong Zhang 
153968785679SHong Zhang             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
154068785679SHong Zhang             pv = b->a + bdiag[row + 1] + 1;
154168785679SHong Zhang             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
154268785679SHong Zhang             for (j = 0; j < nz; j++) {
154368785679SHong Zhang               col = pj[j];
1544b89f182dSHong Zhang               rtmp1[col] -= mul1 * pv[j];
1545b89f182dSHong Zhang               rtmp2[col] -= mul2 * pv[j];
1546b89f182dSHong Zhang               rtmp3[col] -= mul3 * pv[j];
154768785679SHong Zhang             }
15489566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(3 + 6.0 * nz));
154968785679SHong Zhang           }
155068785679SHong Zhang           row = *bjtmp++;
155168785679SHong Zhang         }
155268785679SHong Zhang 
1553b89f182dSHong Zhang         /* finished row i; check zero pivot, then stick row i into b->a */
1554b89f182dSHong Zhang         rs = 0.0;
1555b89f182dSHong Zhang         /* L part */
1556b89f182dSHong Zhang         pc1 = b->a + bi[i];
1557b89f182dSHong Zhang         pj  = b->j + bi[i];
1558b89f182dSHong Zhang         nz  = bi[i + 1] - bi[i];
1559b89f182dSHong Zhang         for (j = 0; j < nz; j++) {
1560b89f182dSHong Zhang           col    = pj[j];
15619371c9d4SSatish Balay           pc1[j] = rtmp1[col];
15629371c9d4SSatish Balay           rs += PetscAbsScalar(pc1[j]);
1563b89f182dSHong Zhang         }
1564b89f182dSHong Zhang         /* U part */
1565b89f182dSHong Zhang         pc1 = b->a + bdiag[i + 1] + 1;
1566b89f182dSHong Zhang         pj  = b->j + bdiag[i + 1] + 1;
15670e7a5c2bSHong Zhang         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1568b89f182dSHong Zhang         for (j = 0; j < nz; j++) {
1569b89f182dSHong Zhang           col    = pj[j];
15709371c9d4SSatish Balay           pc1[j] = rtmp1[col];
15719371c9d4SSatish Balay           rs += PetscAbsScalar(pc1[j]);
1572b89f182dSHong Zhang         }
157368785679SHong Zhang 
1574b89f182dSHong Zhang         sctx.rs = rs;
1575b89f182dSHong Zhang         sctx.pv = rtmp1[i];
15769566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
157707b50cabSHong Zhang         if (sctx.newshift) break;
1578b89f182dSHong Zhang         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1579b89f182dSHong Zhang         *pc1 = 1.0 / sctx.pv;
1580b89f182dSHong Zhang 
1581b89f182dSHong Zhang         /* Now take care of 1st column of diagonal 3x3 block. */
1582b89f182dSHong Zhang         pc2 = rtmp2 + i;
1583b89f182dSHong Zhang         pc3 = rtmp3 + i;
1584b89f182dSHong Zhang         if (*pc2 != 0.0 || *pc3 != 0.0) {
15859371c9d4SSatish Balay           mul2 = (*pc2) * (*pc1);
15869371c9d4SSatish Balay           *pc2 = mul2;
15879371c9d4SSatish Balay           mul3 = (*pc3) * (*pc1);
15889371c9d4SSatish Balay           *pc3 = mul3;
158968785679SHong Zhang           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
159068785679SHong Zhang           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
159168785679SHong Zhang           for (j = 0; j < nz; j++) {
159268785679SHong Zhang             col = pj[j];
1593b89f182dSHong Zhang             rtmp2[col] -= mul2 * rtmp1[col];
1594b89f182dSHong Zhang             rtmp3[col] -= mul3 * rtmp1[col];
159568785679SHong Zhang           }
15969566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(2 + 4.0 * nz));
159768785679SHong Zhang         }
159868785679SHong Zhang 
1599b89f182dSHong Zhang         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1600b89f182dSHong Zhang         rs = 0.0;
1601b89f182dSHong Zhang         /* L part */
1602b89f182dSHong Zhang         pc2 = b->a + bi[i + 1];
1603b89f182dSHong Zhang         pj  = b->j + bi[i + 1];
1604b89f182dSHong Zhang         nz  = bi[i + 2] - bi[i + 1];
1605b89f182dSHong Zhang         for (j = 0; j < nz; j++) {
1606b89f182dSHong Zhang           col    = pj[j];
16079371c9d4SSatish Balay           pc2[j] = rtmp2[col];
16089371c9d4SSatish Balay           rs += PetscAbsScalar(pc2[j]);
1609b89f182dSHong Zhang         }
1610b89f182dSHong Zhang         /* U part */
1611b89f182dSHong Zhang         pc2 = b->a + bdiag[i + 2] + 1;
16120e7a5c2bSHong Zhang         pj  = b->j + bdiag[i + 2] + 1;
16130e7a5c2bSHong Zhang         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1614b89f182dSHong Zhang         for (j = 0; j < nz; j++) {
1615b89f182dSHong Zhang           col    = pj[j];
16169371c9d4SSatish Balay           pc2[j] = rtmp2[col];
16179371c9d4SSatish Balay           rs += PetscAbsScalar(pc2[j]);
1618b89f182dSHong Zhang         }
1619b89f182dSHong Zhang 
1620b89f182dSHong Zhang         sctx.rs = rs;
1621b89f182dSHong Zhang         sctx.pv = rtmp2[i + 1];
16229566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
162307b50cabSHong Zhang         if (sctx.newshift) break;
1624b89f182dSHong Zhang         pc2  = b->a + bdiag[i + 1];
1625b89f182dSHong Zhang         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1626b89f182dSHong Zhang 
1627b89f182dSHong Zhang         /* Now take care of 2nd column of diagonal 3x3 block. */
1628b89f182dSHong Zhang         pc3 = rtmp3 + i + 1;
162968785679SHong Zhang         if (*pc3 != 0.0) {
16309371c9d4SSatish Balay           mul3 = (*pc3) * (*pc2);
16319371c9d4SSatish Balay           *pc3 = mul3;
163268785679SHong Zhang           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
163368785679SHong Zhang           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
163468785679SHong Zhang           for (j = 0; j < nz; j++) {
163568785679SHong Zhang             col = pj[j];
1636b89f182dSHong Zhang             rtmp3[col] -= mul3 * rtmp2[col];
163768785679SHong Zhang           }
16389566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(1 + 2.0 * nz));
163968785679SHong Zhang         }
164068785679SHong Zhang 
1641b89f182dSHong Zhang         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
164268785679SHong Zhang         rs = 0.0;
164368785679SHong Zhang         /* L part */
1644b89f182dSHong Zhang         pc3 = b->a + bi[i + 2];
1645b89f182dSHong Zhang         pj  = b->j + bi[i + 2];
1646b89f182dSHong Zhang         nz  = bi[i + 3] - bi[i + 2];
164768785679SHong Zhang         for (j = 0; j < nz; j++) {
164868785679SHong Zhang           col    = pj[j];
16499371c9d4SSatish Balay           pc3[j] = rtmp3[col];
16509371c9d4SSatish Balay           rs += PetscAbsScalar(pc3[j]);
165168785679SHong Zhang         }
165268785679SHong Zhang         /* U part */
1653b89f182dSHong Zhang         pc3 = b->a + bdiag[i + 3] + 1;
16540e7a5c2bSHong Zhang         pj  = b->j + bdiag[i + 3] + 1;
16550e7a5c2bSHong Zhang         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
165668785679SHong Zhang         for (j = 0; j < nz; j++) {
165768785679SHong Zhang           col    = pj[j];
16589371c9d4SSatish Balay           pc3[j] = rtmp3[col];
16599371c9d4SSatish Balay           rs += PetscAbsScalar(pc3[j]);
166068785679SHong Zhang         }
166168785679SHong Zhang 
166268785679SHong Zhang         sctx.rs = rs;
1663b89f182dSHong Zhang         sctx.pv = rtmp3[i + 2];
16649566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
166507b50cabSHong Zhang         if (sctx.newshift) break;
166668785679SHong Zhang         pc3  = b->a + bdiag[i + 2];
1667b89f182dSHong Zhang         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
166868785679SHong Zhang         break;
16699877982aSShri Abhyankar       case 4:
16709877982aSShri Abhyankar         /* zero rtmp */
16719877982aSShri Abhyankar         /* L part */
16729877982aSShri Abhyankar         nz    = bi[i + 1] - bi[i];
16739877982aSShri Abhyankar         bjtmp = bj + bi[i];
16749877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
16759877982aSShri Abhyankar           col        = bjtmp[j];
16769371c9d4SSatish Balay           rtmp1[col] = 0.0;
16779371c9d4SSatish Balay           rtmp2[col] = 0.0;
16789371c9d4SSatish Balay           rtmp3[col] = 0.0;
16799371c9d4SSatish Balay           rtmp4[col] = 0.0;
16809877982aSShri Abhyankar         }
16819877982aSShri Abhyankar 
16829877982aSShri Abhyankar         /* U part */
16839877982aSShri Abhyankar         nz    = bdiag[i] - bdiag[i + 1];
16849877982aSShri Abhyankar         bjtmp = bj + bdiag[i + 1] + 1;
16859877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
16869877982aSShri Abhyankar           col        = bjtmp[j];
16879371c9d4SSatish Balay           rtmp1[col] = 0.0;
16889371c9d4SSatish Balay           rtmp2[col] = 0.0;
16899371c9d4SSatish Balay           rtmp3[col] = 0.0;
16909371c9d4SSatish Balay           rtmp4[col] = 0.0;
16919877982aSShri Abhyankar         }
16929877982aSShri Abhyankar 
16939877982aSShri Abhyankar         /* load in initial (unfactored row) */
16949877982aSShri Abhyankar         nz    = ai[r[i] + 1] - ai[r[i]];
16959877982aSShri Abhyankar         ajtmp = aj + ai[r[i]];
16969371c9d4SSatish Balay         v1    = aa + ai[r[i]];
16979371c9d4SSatish Balay         v2    = aa + ai[r[i] + 1];
16989371c9d4SSatish Balay         v3    = aa + ai[r[i] + 2];
16999371c9d4SSatish Balay         v4    = aa + ai[r[i] + 3];
17009877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
17019877982aSShri Abhyankar           col        = ics[ajtmp[j]];
17029371c9d4SSatish Balay           rtmp1[col] = v1[j];
17039371c9d4SSatish Balay           rtmp2[col] = v2[j];
17049371c9d4SSatish Balay           rtmp3[col] = v3[j];
17059371c9d4SSatish Balay           rtmp4[col] = v4[j];
17069877982aSShri Abhyankar         }
17079877982aSShri Abhyankar         /* ZeropivotApply(): shift the diagonal of the matrix  */
17089371c9d4SSatish Balay         rtmp1[i] += sctx.shift_amount;
17099371c9d4SSatish Balay         rtmp2[i + 1] += sctx.shift_amount;
17109371c9d4SSatish Balay         rtmp3[i + 2] += sctx.shift_amount;
17119371c9d4SSatish Balay         rtmp4[i + 3] += sctx.shift_amount;
17129877982aSShri Abhyankar 
17139877982aSShri Abhyankar         /* elimination */
17149877982aSShri Abhyankar         bjtmp = bj + bi[i];
17159877982aSShri Abhyankar         row   = *bjtmp++; /* pivot row */
17169877982aSShri Abhyankar         nzL   = bi[i + 1] - bi[i];
17179877982aSShri Abhyankar         for (k = 0; k < nzL; k++) {
17189877982aSShri Abhyankar           pc1 = rtmp1 + row;
17199877982aSShri Abhyankar           pc2 = rtmp2 + row;
17209877982aSShri Abhyankar           pc3 = rtmp3 + row;
17219877982aSShri Abhyankar           pc4 = rtmp4 + row;
17229877982aSShri Abhyankar           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
17239877982aSShri Abhyankar             pv   = b->a + bdiag[row];
17249371c9d4SSatish Balay             mul1 = *pc1 * (*pv);
17259371c9d4SSatish Balay             mul2 = *pc2 * (*pv);
17269371c9d4SSatish Balay             mul3 = *pc3 * (*pv);
17279371c9d4SSatish Balay             mul4 = *pc4 * (*pv);
17289371c9d4SSatish Balay             *pc1 = mul1;
17299371c9d4SSatish Balay             *pc2 = mul2;
17309371c9d4SSatish Balay             *pc3 = mul3;
17319371c9d4SSatish Balay             *pc4 = mul4;
17329877982aSShri Abhyankar 
17339877982aSShri Abhyankar             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
17349877982aSShri Abhyankar             pv = b->a + bdiag[row + 1] + 1;
17359877982aSShri Abhyankar             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
17369877982aSShri Abhyankar             for (j = 0; j < nz; j++) {
17379877982aSShri Abhyankar               col = pj[j];
17389877982aSShri Abhyankar               rtmp1[col] -= mul1 * pv[j];
17399877982aSShri Abhyankar               rtmp2[col] -= mul2 * pv[j];
17409877982aSShri Abhyankar               rtmp3[col] -= mul3 * pv[j];
17419877982aSShri Abhyankar               rtmp4[col] -= mul4 * pv[j];
17429877982aSShri Abhyankar             }
17439566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(4 + 8.0 * nz));
17449877982aSShri Abhyankar           }
17459877982aSShri Abhyankar           row = *bjtmp++;
17469877982aSShri Abhyankar         }
17479877982aSShri Abhyankar 
17489877982aSShri Abhyankar         /* finished row i; check zero pivot, then stick row i into b->a */
17499877982aSShri Abhyankar         rs = 0.0;
17509877982aSShri Abhyankar         /* L part */
17519877982aSShri Abhyankar         pc1 = b->a + bi[i];
17529877982aSShri Abhyankar         pj  = b->j + bi[i];
17539877982aSShri Abhyankar         nz  = bi[i + 1] - bi[i];
17549877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
17559877982aSShri Abhyankar           col    = pj[j];
17569371c9d4SSatish Balay           pc1[j] = rtmp1[col];
17579371c9d4SSatish Balay           rs += PetscAbsScalar(pc1[j]);
17589877982aSShri Abhyankar         }
17599877982aSShri Abhyankar         /* U part */
17609877982aSShri Abhyankar         pc1 = b->a + bdiag[i + 1] + 1;
17619877982aSShri Abhyankar         pj  = b->j + bdiag[i + 1] + 1;
17629877982aSShri Abhyankar         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
17639877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
17649877982aSShri Abhyankar           col    = pj[j];
17659371c9d4SSatish Balay           pc1[j] = rtmp1[col];
17669371c9d4SSatish Balay           rs += PetscAbsScalar(pc1[j]);
17679877982aSShri Abhyankar         }
17689877982aSShri Abhyankar 
17699877982aSShri Abhyankar         sctx.rs = rs;
17709877982aSShri Abhyankar         sctx.pv = rtmp1[i];
17719566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
177207b50cabSHong Zhang         if (sctx.newshift) break;
17739877982aSShri Abhyankar         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
17749877982aSShri Abhyankar         *pc1 = 1.0 / sctx.pv;
17759877982aSShri Abhyankar 
17769877982aSShri Abhyankar         /* Now take care of 1st column of diagonal 4x4 block. */
17779877982aSShri Abhyankar         pc2 = rtmp2 + i;
17789877982aSShri Abhyankar         pc3 = rtmp3 + i;
17799877982aSShri Abhyankar         pc4 = rtmp4 + i;
17809877982aSShri Abhyankar         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
17819371c9d4SSatish Balay           mul2 = (*pc2) * (*pc1);
17829371c9d4SSatish Balay           *pc2 = mul2;
17839371c9d4SSatish Balay           mul3 = (*pc3) * (*pc1);
17849371c9d4SSatish Balay           *pc3 = mul3;
17859371c9d4SSatish Balay           mul4 = (*pc4) * (*pc1);
17869371c9d4SSatish Balay           *pc4 = mul4;
17879877982aSShri Abhyankar           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
17889877982aSShri Abhyankar           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
17899877982aSShri Abhyankar           for (j = 0; j < nz; j++) {
17909877982aSShri Abhyankar             col = pj[j];
17919877982aSShri Abhyankar             rtmp2[col] -= mul2 * rtmp1[col];
17929877982aSShri Abhyankar             rtmp3[col] -= mul3 * rtmp1[col];
17939877982aSShri Abhyankar             rtmp4[col] -= mul4 * rtmp1[col];
17949877982aSShri Abhyankar           }
17959566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(3 + 6.0 * nz));
17969877982aSShri Abhyankar         }
17979877982aSShri Abhyankar 
17989877982aSShri Abhyankar         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
17999877982aSShri Abhyankar         rs = 0.0;
18009877982aSShri Abhyankar         /* L part */
18019877982aSShri Abhyankar         pc2 = b->a + bi[i + 1];
18029877982aSShri Abhyankar         pj  = b->j + bi[i + 1];
18039877982aSShri Abhyankar         nz  = bi[i + 2] - bi[i + 1];
18049877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
18059877982aSShri Abhyankar           col    = pj[j];
18069371c9d4SSatish Balay           pc2[j] = rtmp2[col];
18079371c9d4SSatish Balay           rs += PetscAbsScalar(pc2[j]);
18089877982aSShri Abhyankar         }
18099877982aSShri Abhyankar         /* U part */
18109877982aSShri Abhyankar         pc2 = b->a + bdiag[i + 2] + 1;
18119877982aSShri Abhyankar         pj  = b->j + bdiag[i + 2] + 1;
18129877982aSShri Abhyankar         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
18139877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
18149877982aSShri Abhyankar           col    = pj[j];
18159371c9d4SSatish Balay           pc2[j] = rtmp2[col];
18169371c9d4SSatish Balay           rs += PetscAbsScalar(pc2[j]);
18179877982aSShri Abhyankar         }
18189877982aSShri Abhyankar 
18199877982aSShri Abhyankar         sctx.rs = rs;
18209877982aSShri Abhyankar         sctx.pv = rtmp2[i + 1];
18219566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
182207b50cabSHong Zhang         if (sctx.newshift) break;
18239877982aSShri Abhyankar         pc2  = b->a + bdiag[i + 1];
18249877982aSShri Abhyankar         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
18259877982aSShri Abhyankar 
18269877982aSShri Abhyankar         /* Now take care of 2nd column of diagonal 4x4 block. */
18279877982aSShri Abhyankar         pc3 = rtmp3 + i + 1;
18289877982aSShri Abhyankar         pc4 = rtmp4 + i + 1;
18299877982aSShri Abhyankar         if (*pc3 != 0.0 || *pc4 != 0.0) {
18309371c9d4SSatish Balay           mul3 = (*pc3) * (*pc2);
18319371c9d4SSatish Balay           *pc3 = mul3;
18329371c9d4SSatish Balay           mul4 = (*pc4) * (*pc2);
18339371c9d4SSatish Balay           *pc4 = mul4;
18349877982aSShri Abhyankar           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
18359877982aSShri Abhyankar           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
18369877982aSShri Abhyankar           for (j = 0; j < nz; j++) {
18379877982aSShri Abhyankar             col = pj[j];
18389877982aSShri Abhyankar             rtmp3[col] -= mul3 * rtmp2[col];
18399877982aSShri Abhyankar             rtmp4[col] -= mul4 * rtmp2[col];
18409877982aSShri Abhyankar           }
18419566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(4.0 * nz));
18429877982aSShri Abhyankar         }
18439877982aSShri Abhyankar 
18449877982aSShri Abhyankar         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
18459877982aSShri Abhyankar         rs = 0.0;
18469877982aSShri Abhyankar         /* L part */
18479877982aSShri Abhyankar         pc3 = b->a + bi[i + 2];
18489877982aSShri Abhyankar         pj  = b->j + bi[i + 2];
18499877982aSShri Abhyankar         nz  = bi[i + 3] - bi[i + 2];
18509877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
18519877982aSShri Abhyankar           col    = pj[j];
18529371c9d4SSatish Balay           pc3[j] = rtmp3[col];
18539371c9d4SSatish Balay           rs += PetscAbsScalar(pc3[j]);
18549877982aSShri Abhyankar         }
18559877982aSShri Abhyankar         /* U part */
18569877982aSShri Abhyankar         pc3 = b->a + bdiag[i + 3] + 1;
18579877982aSShri Abhyankar         pj  = b->j + bdiag[i + 3] + 1;
18589877982aSShri Abhyankar         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
18599877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
18609877982aSShri Abhyankar           col    = pj[j];
18619371c9d4SSatish Balay           pc3[j] = rtmp3[col];
18629371c9d4SSatish Balay           rs += PetscAbsScalar(pc3[j]);
18639877982aSShri Abhyankar         }
18649877982aSShri Abhyankar 
18659877982aSShri Abhyankar         sctx.rs = rs;
18669877982aSShri Abhyankar         sctx.pv = rtmp3[i + 2];
18679566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
186807b50cabSHong Zhang         if (sctx.newshift) break;
18699877982aSShri Abhyankar         pc3  = b->a + bdiag[i + 2];
18709877982aSShri Abhyankar         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
18719877982aSShri Abhyankar 
18729877982aSShri Abhyankar         /* Now take care of 3rd column of diagonal 4x4 block. */
18739877982aSShri Abhyankar         pc4 = rtmp4 + i + 2;
18749877982aSShri Abhyankar         if (*pc4 != 0.0) {
18759371c9d4SSatish Balay           mul4 = (*pc4) * (*pc3);
18769371c9d4SSatish Balay           *pc4 = mul4;
18779877982aSShri Abhyankar           pj   = b->j + bdiag[i + 3] + 1;         /* beginning of U(i+2,:) */
18789877982aSShri Abhyankar           nz   = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */
18799877982aSShri Abhyankar           for (j = 0; j < nz; j++) {
18809877982aSShri Abhyankar             col = pj[j];
18819877982aSShri Abhyankar             rtmp4[col] -= mul4 * rtmp3[col];
18829877982aSShri Abhyankar           }
18839566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(1 + 2.0 * nz));
18849877982aSShri Abhyankar         }
18859877982aSShri Abhyankar 
18869877982aSShri Abhyankar         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
18879877982aSShri Abhyankar         rs = 0.0;
18889877982aSShri Abhyankar         /* L part */
18899877982aSShri Abhyankar         pc4 = b->a + bi[i + 3];
18909877982aSShri Abhyankar         pj  = b->j + bi[i + 3];
18919877982aSShri Abhyankar         nz  = bi[i + 4] - bi[i + 3];
18929877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
18939877982aSShri Abhyankar           col    = pj[j];
18949371c9d4SSatish Balay           pc4[j] = rtmp4[col];
18959371c9d4SSatish Balay           rs += PetscAbsScalar(pc4[j]);
18969877982aSShri Abhyankar         }
18979877982aSShri Abhyankar         /* U part */
18989877982aSShri Abhyankar         pc4 = b->a + bdiag[i + 4] + 1;
18999877982aSShri Abhyankar         pj  = b->j + bdiag[i + 4] + 1;
19009877982aSShri Abhyankar         nz  = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */
19019877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
19029877982aSShri Abhyankar           col    = pj[j];
19039371c9d4SSatish Balay           pc4[j] = rtmp4[col];
19049371c9d4SSatish Balay           rs += PetscAbsScalar(pc4[j]);
19059877982aSShri Abhyankar         }
19069877982aSShri Abhyankar 
19079877982aSShri Abhyankar         sctx.rs = rs;
19089877982aSShri Abhyankar         sctx.pv = rtmp4[i + 3];
19099566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3));
191007b50cabSHong Zhang         if (sctx.newshift) break;
19119877982aSShri Abhyankar         pc4  = b->a + bdiag[i + 3];
19129877982aSShri Abhyankar         *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */
19139877982aSShri Abhyankar         break;
191468785679SHong Zhang 
1915d71ae5a4SJacob Faibussowitsch       default:
1916d71ae5a4SJacob Faibussowitsch         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
191728f1b45aSHong Zhang       }
1918c2b86aeeSHong Zhang       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
191928f1b45aSHong Zhang       i += nodesz;              /* Update the row */
192068785679SHong Zhang     }
192128f1b45aSHong Zhang 
192228f1b45aSHong Zhang     /* MatPivotRefine() */
192307b50cabSHong Zhang     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
192428f1b45aSHong Zhang       /*
192528f1b45aSHong Zhang        * if no shift in this attempt & shifting & started shifting & can refine,
192628f1b45aSHong Zhang        * then try lower shift
192728f1b45aSHong Zhang        */
192828f1b45aSHong Zhang       sctx.shift_hi       = sctx.shift_fraction;
192928f1b45aSHong Zhang       sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
193028f1b45aSHong Zhang       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
193107b50cabSHong Zhang       sctx.newshift       = PETSC_TRUE;
193228f1b45aSHong Zhang       sctx.nshift++;
193328f1b45aSHong Zhang     }
193407b50cabSHong Zhang   } while (sctx.newshift);
193528f1b45aSHong Zhang 
19369566063dSJacob Faibussowitsch   PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4));
19379566063dSJacob Faibussowitsch   PetscCall(PetscFree(tmp_vec2));
19389566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isicol, &ic));
19399566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &r));
194028f1b45aSHong Zhang 
1941abb87a52SBarry Smith   if (b->inode.size) {
1942abb87a52SBarry Smith     C->ops->solve = MatSolve_SeqAIJ_Inode;
1943abb87a52SBarry Smith   } else {
1944d3ac4fa3SBarry Smith     C->ops->solve = MatSolve_SeqAIJ;
1945abb87a52SBarry Smith   }
194628f1b45aSHong Zhang   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
194728f1b45aSHong Zhang   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
194828f1b45aSHong Zhang   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
194928f1b45aSHong Zhang   C->ops->matsolve          = MatMatSolve_SeqAIJ;
195028f1b45aSHong Zhang   C->assembled              = PETSC_TRUE;
195128f1b45aSHong Zhang   C->preallocated           = PETSC_TRUE;
19522205254eSKarl Rupp 
19539566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(C->cmap->n));
195428f1b45aSHong Zhang 
195528f1b45aSHong Zhang   /* MatShiftView(A,info,&sctx) */
195628f1b45aSHong Zhang   if (sctx.nshift) {
1957f4db908eSBarry Smith     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
19589566063dSJacob Faibussowitsch       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
1959f4db908eSBarry Smith     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
19609566063dSJacob Faibussowitsch       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1961f4db908eSBarry Smith     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
19629566063dSJacob Faibussowitsch       PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
196328f1b45aSHong Zhang     }
196428f1b45aSHong Zhang   }
19653ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
196628f1b45aSHong Zhang }
1967628f99d7SShri Abhyankar 
1968ff6a9541SJacob Faibussowitsch #if 0
1969ff6a9541SJacob Faibussowitsch // unused
1970ff6a9541SJacob Faibussowitsch static PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info)
1971d71ae5a4SJacob Faibussowitsch {
1972628f99d7SShri Abhyankar   Mat              C = B;
1973628f99d7SShri Abhyankar   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1974628f99d7SShri Abhyankar   IS               iscol = b->col, isrow = b->row, isicol = b->icol;
1975628f99d7SShri Abhyankar   const PetscInt  *r, *ic, *c, *ics;
1976628f99d7SShri Abhyankar   PetscInt         n = A->rmap->n, *bi = b->i;
1977628f99d7SShri Abhyankar   PetscInt        *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow;
19788758e1faSBarry Smith   PetscInt         i, j, idx, *bd = b->diag, node_max, nodesz;
19798758e1faSBarry Smith   PetscInt        *ai = a->i, *aj = a->j;
1980628f99d7SShri Abhyankar   PetscInt        *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj;
1981628f99d7SShri Abhyankar   PetscScalar      mul1, mul2, mul3, tmp;
1982628f99d7SShri Abhyankar   MatScalar       *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33;
1983628f99d7SShri Abhyankar   const MatScalar *v1, *v2, *v3, *aa    = a->a, *rtmp1;
1984628f99d7SShri Abhyankar   PetscReal        rs = 0.0;
1985628f99d7SShri Abhyankar   FactorShiftCtx   sctx;
1986628f99d7SShri Abhyankar 
1987628f99d7SShri Abhyankar   PetscFunctionBegin;
1988628f99d7SShri Abhyankar   sctx.shift_top      = 0;
1989628f99d7SShri Abhyankar   sctx.nshift_max     = 0;
1990628f99d7SShri Abhyankar   sctx.shift_lo       = 0;
1991628f99d7SShri Abhyankar   sctx.shift_hi       = 0;
1992628f99d7SShri Abhyankar   sctx.shift_fraction = 0;
1993628f99d7SShri Abhyankar 
1994628f99d7SShri Abhyankar   /* if both shift schemes are chosen by user, only use info->shiftpd */
1995f4db908eSBarry Smith   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
1996628f99d7SShri Abhyankar     sctx.shift_top = 0;
1997628f99d7SShri Abhyankar     for (i = 0; i < n; i++) {
1998628f99d7SShri Abhyankar       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
1999628f99d7SShri Abhyankar       rs    = 0.0;
2000628f99d7SShri Abhyankar       ajtmp = aj + ai[i];
2001628f99d7SShri Abhyankar       rtmp1 = aa + ai[i];
2002628f99d7SShri Abhyankar       nz    = ai[i + 1] - ai[i];
2003628f99d7SShri Abhyankar       for (j = 0; j < nz; j++) {
2004628f99d7SShri Abhyankar         if (*ajtmp != i) {
2005628f99d7SShri Abhyankar           rs += PetscAbsScalar(*rtmp1++);
2006628f99d7SShri Abhyankar         } else {
2007628f99d7SShri Abhyankar           rs -= PetscRealPart(*rtmp1++);
2008628f99d7SShri Abhyankar         }
2009628f99d7SShri Abhyankar         ajtmp++;
2010628f99d7SShri Abhyankar       }
2011628f99d7SShri Abhyankar       if (rs > sctx.shift_top) sctx.shift_top = rs;
2012628f99d7SShri Abhyankar     }
2013628f99d7SShri Abhyankar     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
2014628f99d7SShri Abhyankar     sctx.shift_top *= 1.1;
2015628f99d7SShri Abhyankar     sctx.nshift_max = 5;
2016628f99d7SShri Abhyankar     sctx.shift_lo   = 0.;
2017628f99d7SShri Abhyankar     sctx.shift_hi   = 1.;
2018628f99d7SShri Abhyankar   }
2019628f99d7SShri Abhyankar   sctx.shift_amount = 0;
2020628f99d7SShri Abhyankar   sctx.nshift       = 0;
2021628f99d7SShri Abhyankar 
20229566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow, &r));
20239566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol, &c));
20249566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isicol, &ic));
20259566063dSJacob Faibussowitsch   PetscCall(PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33));
2026628f99d7SShri Abhyankar   ics = ic;
2027628f99d7SShri Abhyankar 
2028628f99d7SShri Abhyankar   node_max = a->inode.node_count;
2029628f99d7SShri Abhyankar   ns       = a->inode.size;
203028b400f6SJacob Faibussowitsch   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
2031628f99d7SShri Abhyankar 
2032628f99d7SShri Abhyankar   /* If max inode size > 3, split it into two inodes.*/
2033628f99d7SShri Abhyankar   /* also map the inode sizes according to the ordering */
20349566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
2035628f99d7SShri Abhyankar   for (i = 0, j = 0; i < node_max; ++i, ++j) {
2036628f99d7SShri Abhyankar     if (ns[i] > 3) {
2037628f99d7SShri Abhyankar       tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5  */
2038628f99d7SShri Abhyankar       ++j;
2039628f99d7SShri Abhyankar       tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
2040628f99d7SShri Abhyankar     } else {
2041628f99d7SShri Abhyankar       tmp_vec1[j] = ns[i];
2042628f99d7SShri Abhyankar     }
2043628f99d7SShri Abhyankar   }
2044628f99d7SShri Abhyankar   /* Use the correct node_max */
2045628f99d7SShri Abhyankar   node_max = j;
2046628f99d7SShri Abhyankar 
2047628f99d7SShri Abhyankar   /* Now reorder the inode info based on mat re-ordering info */
2048628f99d7SShri Abhyankar   /* First create a row -> inode_size_array_index map */
20499566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2));
2050628f99d7SShri Abhyankar   for (i = 0, row = 0; i < node_max; i++) {
2051628f99d7SShri Abhyankar     nodesz = tmp_vec1[i];
2052ad540459SPierre Jolivet     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
2053628f99d7SShri Abhyankar   }
2054628f99d7SShri Abhyankar   /* Using nsmap, create a reordered ns structure */
2055628f99d7SShri Abhyankar   for (i = 0, j = 0; i < node_max; i++) {
2056628f99d7SShri Abhyankar     nodesz      = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
2057628f99d7SShri Abhyankar     tmp_vec2[i] = nodesz;
2058628f99d7SShri Abhyankar     j += nodesz;
2059628f99d7SShri Abhyankar   }
20609566063dSJacob Faibussowitsch   PetscCall(PetscFree2(nsmap, tmp_vec1));
2061628f99d7SShri Abhyankar   /* Now use the correct ns */
2062628f99d7SShri Abhyankar   ns = tmp_vec2;
2063628f99d7SShri Abhyankar 
2064628f99d7SShri Abhyankar   do {
206507b50cabSHong Zhang     sctx.newshift = PETSC_FALSE;
2066628f99d7SShri Abhyankar     /* Now loop over each block-row, and do the factorization */
2067628f99d7SShri Abhyankar     for (i = 0, row = 0; i < node_max; i++) {
2068628f99d7SShri Abhyankar       nodesz = ns[i];
2069628f99d7SShri Abhyankar       nz     = bi[row + 1] - bi[row];
2070628f99d7SShri Abhyankar       bjtmp  = bj + bi[row];
2071628f99d7SShri Abhyankar 
2072628f99d7SShri Abhyankar       switch (nodesz) {
2073628f99d7SShri Abhyankar       case 1:
2074628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2075628f99d7SShri Abhyankar           idx         = bjtmp[j];
2076628f99d7SShri Abhyankar           rtmp11[idx] = 0.0;
2077628f99d7SShri Abhyankar         }
2078628f99d7SShri Abhyankar 
2079628f99d7SShri Abhyankar         /* load in initial (unfactored row) */
2080628f99d7SShri Abhyankar         idx    = r[row];
2081628f99d7SShri Abhyankar         nz_tmp = ai[idx + 1] - ai[idx];
2082628f99d7SShri Abhyankar         ajtmp  = aj + ai[idx];
2083628f99d7SShri Abhyankar         v1     = aa + ai[idx];
2084628f99d7SShri Abhyankar 
2085628f99d7SShri Abhyankar         for (j = 0; j < nz_tmp; j++) {
2086628f99d7SShri Abhyankar           idx         = ics[ajtmp[j]];
2087628f99d7SShri Abhyankar           rtmp11[idx] = v1[j];
2088628f99d7SShri Abhyankar         }
2089628f99d7SShri Abhyankar         rtmp11[ics[r[row]]] += sctx.shift_amount;
2090628f99d7SShri Abhyankar 
2091628f99d7SShri Abhyankar         prow = *bjtmp++;
2092628f99d7SShri Abhyankar         while (prow < row) {
2093628f99d7SShri Abhyankar           pc1 = rtmp11 + prow;
2094628f99d7SShri Abhyankar           if (*pc1 != 0.0) {
2095628f99d7SShri Abhyankar             pv     = ba + bd[prow];
2096628f99d7SShri Abhyankar             pj     = nbj + bd[prow];
2097628f99d7SShri Abhyankar             mul1   = *pc1 * *pv++;
2098628f99d7SShri Abhyankar             *pc1   = mul1;
2099628f99d7SShri Abhyankar             nz_tmp = bi[prow + 1] - bd[prow] - 1;
21009566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2101628f99d7SShri Abhyankar             for (j = 0; j < nz_tmp; j++) {
2102628f99d7SShri Abhyankar               tmp = pv[j];
2103628f99d7SShri Abhyankar               idx = pj[j];
2104628f99d7SShri Abhyankar               rtmp11[idx] -= mul1 * tmp;
2105628f99d7SShri Abhyankar             }
2106628f99d7SShri Abhyankar           }
2107628f99d7SShri Abhyankar           prow = *bjtmp++;
2108628f99d7SShri Abhyankar         }
2109628f99d7SShri Abhyankar         pj  = bj + bi[row];
2110628f99d7SShri Abhyankar         pc1 = ba + bi[row];
2111628f99d7SShri Abhyankar 
2112628f99d7SShri Abhyankar         sctx.pv     = rtmp11[row];
2113628f99d7SShri Abhyankar         rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */
2114628f99d7SShri Abhyankar         rs          = 0.0;
2115628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2116628f99d7SShri Abhyankar           idx    = pj[j];
2117628f99d7SShri Abhyankar           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2118628f99d7SShri Abhyankar           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2119628f99d7SShri Abhyankar         }
2120628f99d7SShri Abhyankar         sctx.rs = rs;
21219566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
212207b50cabSHong Zhang         if (sctx.newshift) goto endofwhile;
2123628f99d7SShri Abhyankar         break;
2124628f99d7SShri Abhyankar 
2125628f99d7SShri Abhyankar       case 2:
2126628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2127628f99d7SShri Abhyankar           idx         = bjtmp[j];
2128628f99d7SShri Abhyankar           rtmp11[idx] = 0.0;
2129628f99d7SShri Abhyankar           rtmp22[idx] = 0.0;
2130628f99d7SShri Abhyankar         }
2131628f99d7SShri Abhyankar 
2132628f99d7SShri Abhyankar         /* load in initial (unfactored row) */
2133628f99d7SShri Abhyankar         idx    = r[row];
2134628f99d7SShri Abhyankar         nz_tmp = ai[idx + 1] - ai[idx];
2135628f99d7SShri Abhyankar         ajtmp  = aj + ai[idx];
2136628f99d7SShri Abhyankar         v1     = aa + ai[idx];
2137628f99d7SShri Abhyankar         v2     = aa + ai[idx + 1];
2138628f99d7SShri Abhyankar         for (j = 0; j < nz_tmp; j++) {
2139628f99d7SShri Abhyankar           idx         = ics[ajtmp[j]];
2140628f99d7SShri Abhyankar           rtmp11[idx] = v1[j];
2141628f99d7SShri Abhyankar           rtmp22[idx] = v2[j];
2142628f99d7SShri Abhyankar         }
2143628f99d7SShri Abhyankar         rtmp11[ics[r[row]]] += sctx.shift_amount;
2144628f99d7SShri Abhyankar         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2145628f99d7SShri Abhyankar 
2146628f99d7SShri Abhyankar         prow = *bjtmp++;
2147628f99d7SShri Abhyankar         while (prow < row) {
2148628f99d7SShri Abhyankar           pc1 = rtmp11 + prow;
2149628f99d7SShri Abhyankar           pc2 = rtmp22 + prow;
2150628f99d7SShri Abhyankar           if (*pc1 != 0.0 || *pc2 != 0.0) {
2151628f99d7SShri Abhyankar             pv   = ba + bd[prow];
2152628f99d7SShri Abhyankar             pj   = nbj + bd[prow];
2153628f99d7SShri Abhyankar             mul1 = *pc1 * *pv;
2154628f99d7SShri Abhyankar             mul2 = *pc2 * *pv;
2155628f99d7SShri Abhyankar             ++pv;
2156628f99d7SShri Abhyankar             *pc1 = mul1;
2157628f99d7SShri Abhyankar             *pc2 = mul2;
2158628f99d7SShri Abhyankar 
2159628f99d7SShri Abhyankar             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2160628f99d7SShri Abhyankar             for (j = 0; j < nz_tmp; j++) {
2161628f99d7SShri Abhyankar               tmp = pv[j];
2162628f99d7SShri Abhyankar               idx = pj[j];
2163628f99d7SShri Abhyankar               rtmp11[idx] -= mul1 * tmp;
2164628f99d7SShri Abhyankar               rtmp22[idx] -= mul2 * tmp;
2165628f99d7SShri Abhyankar             }
21669566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2167628f99d7SShri Abhyankar           }
2168628f99d7SShri Abhyankar           prow = *bjtmp++;
2169628f99d7SShri Abhyankar         }
2170628f99d7SShri Abhyankar 
2171628f99d7SShri Abhyankar         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2172628f99d7SShri Abhyankar         pc1 = rtmp11 + prow;
2173628f99d7SShri Abhyankar         pc2 = rtmp22 + prow;
2174628f99d7SShri Abhyankar 
2175628f99d7SShri Abhyankar         sctx.pv = *pc1;
2176628f99d7SShri Abhyankar         pj      = bj + bi[prow];
2177628f99d7SShri Abhyankar         rs      = 0.0;
2178628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2179628f99d7SShri Abhyankar           idx = pj[j];
2180628f99d7SShri Abhyankar           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2181628f99d7SShri Abhyankar         }
2182628f99d7SShri Abhyankar         sctx.rs = rs;
21839566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
218407b50cabSHong Zhang         if (sctx.newshift) goto endofwhile;
2185628f99d7SShri Abhyankar 
2186628f99d7SShri Abhyankar         if (*pc2 != 0.0) {
2187628f99d7SShri Abhyankar           pj     = nbj + bd[prow];
2188628f99d7SShri Abhyankar           mul2   = (*pc2) / (*pc1); /* since diag is not yet inverted.*/
2189628f99d7SShri Abhyankar           *pc2   = mul2;
2190628f99d7SShri Abhyankar           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2191628f99d7SShri Abhyankar           for (j = 0; j < nz_tmp; j++) {
2192628f99d7SShri Abhyankar             idx = pj[j];
2193628f99d7SShri Abhyankar             tmp = rtmp11[idx];
2194628f99d7SShri Abhyankar             rtmp22[idx] -= mul2 * tmp;
2195628f99d7SShri Abhyankar           }
21969566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2197628f99d7SShri Abhyankar         }
2198628f99d7SShri Abhyankar 
2199628f99d7SShri Abhyankar         pj  = bj + bi[row];
2200628f99d7SShri Abhyankar         pc1 = ba + bi[row];
2201628f99d7SShri Abhyankar         pc2 = ba + bi[row + 1];
2202628f99d7SShri Abhyankar 
2203628f99d7SShri Abhyankar         sctx.pv         = rtmp22[row + 1];
2204628f99d7SShri Abhyankar         rs              = 0.0;
2205628f99d7SShri Abhyankar         rtmp11[row]     = 1.0 / rtmp11[row];
2206628f99d7SShri Abhyankar         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2207628f99d7SShri Abhyankar         /* copy row entries from dense representation to sparse */
2208628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2209628f99d7SShri Abhyankar           idx    = pj[j];
2210628f99d7SShri Abhyankar           pc1[j] = rtmp11[idx];
2211628f99d7SShri Abhyankar           pc2[j] = rtmp22[idx];
2212628f99d7SShri Abhyankar           if (idx != row + 1) rs += PetscAbsScalar(pc2[j]);
2213628f99d7SShri Abhyankar         }
2214628f99d7SShri Abhyankar         sctx.rs = rs;
22159566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
221607b50cabSHong Zhang         if (sctx.newshift) goto endofwhile;
2217628f99d7SShri Abhyankar         break;
2218628f99d7SShri Abhyankar 
2219628f99d7SShri Abhyankar       case 3:
2220628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2221628f99d7SShri Abhyankar           idx         = bjtmp[j];
2222628f99d7SShri Abhyankar           rtmp11[idx] = 0.0;
2223628f99d7SShri Abhyankar           rtmp22[idx] = 0.0;
2224628f99d7SShri Abhyankar           rtmp33[idx] = 0.0;
2225628f99d7SShri Abhyankar         }
2226628f99d7SShri Abhyankar         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2227628f99d7SShri Abhyankar         idx    = r[row];
2228628f99d7SShri Abhyankar         nz_tmp = ai[idx + 1] - ai[idx];
2229628f99d7SShri Abhyankar         ajtmp  = aj + ai[idx];
2230628f99d7SShri Abhyankar         v1     = aa + ai[idx];
2231628f99d7SShri Abhyankar         v2     = aa + ai[idx + 1];
2232628f99d7SShri Abhyankar         v3     = aa + ai[idx + 2];
2233628f99d7SShri Abhyankar         for (j = 0; j < nz_tmp; j++) {
2234628f99d7SShri Abhyankar           idx         = ics[ajtmp[j]];
2235628f99d7SShri Abhyankar           rtmp11[idx] = v1[j];
2236628f99d7SShri Abhyankar           rtmp22[idx] = v2[j];
2237628f99d7SShri Abhyankar           rtmp33[idx] = v3[j];
2238628f99d7SShri Abhyankar         }
2239628f99d7SShri Abhyankar         rtmp11[ics[r[row]]] += sctx.shift_amount;
2240628f99d7SShri Abhyankar         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2241628f99d7SShri Abhyankar         rtmp33[ics[r[row + 2]]] += sctx.shift_amount;
2242628f99d7SShri Abhyankar 
2243628f99d7SShri Abhyankar         /* loop over all pivot row blocks above this row block */
2244628f99d7SShri Abhyankar         prow = *bjtmp++;
2245628f99d7SShri Abhyankar         while (prow < row) {
2246628f99d7SShri Abhyankar           pc1 = rtmp11 + prow;
2247628f99d7SShri Abhyankar           pc2 = rtmp22 + prow;
2248628f99d7SShri Abhyankar           pc3 = rtmp33 + prow;
2249628f99d7SShri Abhyankar           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
2250628f99d7SShri Abhyankar             pv   = ba + bd[prow];
2251628f99d7SShri Abhyankar             pj   = nbj + bd[prow];
2252628f99d7SShri Abhyankar             mul1 = *pc1 * *pv;
2253628f99d7SShri Abhyankar             mul2 = *pc2 * *pv;
2254628f99d7SShri Abhyankar             mul3 = *pc3 * *pv;
2255628f99d7SShri Abhyankar             ++pv;
2256628f99d7SShri Abhyankar             *pc1 = mul1;
2257628f99d7SShri Abhyankar             *pc2 = mul2;
2258628f99d7SShri Abhyankar             *pc3 = mul3;
2259628f99d7SShri Abhyankar 
2260628f99d7SShri Abhyankar             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2261628f99d7SShri Abhyankar             /* update this row based on pivot row */
2262628f99d7SShri Abhyankar             for (j = 0; j < nz_tmp; j++) {
2263628f99d7SShri Abhyankar               tmp = pv[j];
2264628f99d7SShri Abhyankar               idx = pj[j];
2265628f99d7SShri Abhyankar               rtmp11[idx] -= mul1 * tmp;
2266628f99d7SShri Abhyankar               rtmp22[idx] -= mul2 * tmp;
2267628f99d7SShri Abhyankar               rtmp33[idx] -= mul3 * tmp;
2268628f99d7SShri Abhyankar             }
22699566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(3 + 6.0 * nz_tmp));
2270628f99d7SShri Abhyankar           }
2271628f99d7SShri Abhyankar           prow = *bjtmp++;
2272628f99d7SShri Abhyankar         }
2273628f99d7SShri Abhyankar 
2274628f99d7SShri Abhyankar         /* Now take care of diagonal 3x3 block in this set of rows */
2275628f99d7SShri Abhyankar         /* note: prow = row here */
2276628f99d7SShri Abhyankar         pc1 = rtmp11 + prow;
2277628f99d7SShri Abhyankar         pc2 = rtmp22 + prow;
2278628f99d7SShri Abhyankar         pc3 = rtmp33 + prow;
2279628f99d7SShri Abhyankar 
2280628f99d7SShri Abhyankar         sctx.pv = *pc1;
2281628f99d7SShri Abhyankar         pj      = bj + bi[prow];
2282628f99d7SShri Abhyankar         rs      = 0.0;
2283628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2284628f99d7SShri Abhyankar           idx = pj[j];
2285628f99d7SShri Abhyankar           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2286628f99d7SShri Abhyankar         }
2287628f99d7SShri Abhyankar         sctx.rs = rs;
22889566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
228907b50cabSHong Zhang         if (sctx.newshift) goto endofwhile;
2290628f99d7SShri Abhyankar 
2291628f99d7SShri Abhyankar         if (*pc2 != 0.0 || *pc3 != 0.0) {
2292628f99d7SShri Abhyankar           mul2   = (*pc2) / (*pc1);
2293628f99d7SShri Abhyankar           mul3   = (*pc3) / (*pc1);
2294628f99d7SShri Abhyankar           *pc2   = mul2;
2295628f99d7SShri Abhyankar           *pc3   = mul3;
2296628f99d7SShri Abhyankar           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2297628f99d7SShri Abhyankar           pj     = nbj + bd[prow];
2298628f99d7SShri Abhyankar           for (j = 0; j < nz_tmp; j++) {
2299628f99d7SShri Abhyankar             idx = pj[j];
2300628f99d7SShri Abhyankar             tmp = rtmp11[idx];
2301628f99d7SShri Abhyankar             rtmp22[idx] -= mul2 * tmp;
2302628f99d7SShri Abhyankar             rtmp33[idx] -= mul3 * tmp;
2303628f99d7SShri Abhyankar           }
23049566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2305628f99d7SShri Abhyankar         }
2306628f99d7SShri Abhyankar         ++prow;
2307628f99d7SShri Abhyankar 
2308628f99d7SShri Abhyankar         pc2     = rtmp22 + prow;
2309628f99d7SShri Abhyankar         pc3     = rtmp33 + prow;
2310628f99d7SShri Abhyankar         sctx.pv = *pc2;
2311628f99d7SShri Abhyankar         pj      = bj + bi[prow];
2312628f99d7SShri Abhyankar         rs      = 0.0;
2313628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2314628f99d7SShri Abhyankar           idx = pj[j];
2315628f99d7SShri Abhyankar           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2316628f99d7SShri Abhyankar         }
2317628f99d7SShri Abhyankar         sctx.rs = rs;
23189566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
231907b50cabSHong Zhang         if (sctx.newshift) goto endofwhile;
2320628f99d7SShri Abhyankar 
2321628f99d7SShri Abhyankar         if (*pc3 != 0.0) {
2322628f99d7SShri Abhyankar           mul3   = (*pc3) / (*pc2);
2323628f99d7SShri Abhyankar           *pc3   = mul3;
2324628f99d7SShri Abhyankar           pj     = nbj + bd[prow];
2325628f99d7SShri Abhyankar           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2326628f99d7SShri Abhyankar           for (j = 0; j < nz_tmp; j++) {
2327628f99d7SShri Abhyankar             idx = pj[j];
2328628f99d7SShri Abhyankar             tmp = rtmp22[idx];
2329628f99d7SShri Abhyankar             rtmp33[idx] -= mul3 * tmp;
2330628f99d7SShri Abhyankar           }
23319566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2332628f99d7SShri Abhyankar         }
2333628f99d7SShri Abhyankar 
2334628f99d7SShri Abhyankar         pj  = bj + bi[row];
2335628f99d7SShri Abhyankar         pc1 = ba + bi[row];
2336628f99d7SShri Abhyankar         pc2 = ba + bi[row + 1];
2337628f99d7SShri Abhyankar         pc3 = ba + bi[row + 2];
2338628f99d7SShri Abhyankar 
2339628f99d7SShri Abhyankar         sctx.pv         = rtmp33[row + 2];
2340628f99d7SShri Abhyankar         rs              = 0.0;
2341628f99d7SShri Abhyankar         rtmp11[row]     = 1.0 / rtmp11[row];
2342628f99d7SShri Abhyankar         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2343628f99d7SShri Abhyankar         rtmp33[row + 2] = 1.0 / rtmp33[row + 2];
2344628f99d7SShri Abhyankar         /* copy row entries from dense representation to sparse */
2345628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2346628f99d7SShri Abhyankar           idx    = pj[j];
2347628f99d7SShri Abhyankar           pc1[j] = rtmp11[idx];
2348628f99d7SShri Abhyankar           pc2[j] = rtmp22[idx];
2349628f99d7SShri Abhyankar           pc3[j] = rtmp33[idx];
2350628f99d7SShri Abhyankar           if (idx != row + 2) rs += PetscAbsScalar(pc3[j]);
2351628f99d7SShri Abhyankar         }
2352628f99d7SShri Abhyankar 
2353628f99d7SShri Abhyankar         sctx.rs = rs;
23549566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 2));
235507b50cabSHong Zhang         if (sctx.newshift) goto endofwhile;
2356628f99d7SShri Abhyankar         break;
2357628f99d7SShri Abhyankar 
2358d71ae5a4SJacob Faibussowitsch       default:
2359d71ae5a4SJacob Faibussowitsch         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
2360628f99d7SShri Abhyankar       }
2361628f99d7SShri Abhyankar       row += nodesz; /* Update the row */
2362628f99d7SShri Abhyankar     }
2363628f99d7SShri Abhyankar   endofwhile:;
236407b50cabSHong Zhang   } while (sctx.newshift);
23659566063dSJacob Faibussowitsch   PetscCall(PetscFree3(rtmp11, rtmp22, rtmp33));
23669566063dSJacob Faibussowitsch   PetscCall(PetscFree(tmp_vec2));
23679566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isicol, &ic));
23689566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &r));
23699566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &c));
23702205254eSKarl Rupp 
2371d3ac4fa3SBarry Smith   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2372628f99d7SShri Abhyankar   /* do not set solve add, since MatSolve_Inode + Add is faster */
2373628f99d7SShri Abhyankar   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2374628f99d7SShri Abhyankar   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2375628f99d7SShri Abhyankar   C->assembled              = PETSC_TRUE;
2376628f99d7SShri Abhyankar   C->preallocated           = PETSC_TRUE;
2377628f99d7SShri Abhyankar   if (sctx.nshift) {
2378f4db908eSBarry Smith     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
23799566063dSJacob Faibussowitsch       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
2380f4db908eSBarry Smith     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
23819566063dSJacob Faibussowitsch       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
2382628f99d7SShri Abhyankar     }
2383628f99d7SShri Abhyankar   }
23849566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(C->cmap->n));
23859566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCheckInode(C));
23863ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2387628f99d7SShri Abhyankar }
2388ff6a9541SJacob Faibussowitsch #endif
2389628f99d7SShri Abhyankar 
/*
  MatSolve_SeqAIJ_Inode - Computes xx from bb with the factor values stored in A, handling one inode
  at a time. An inode here is a run of ns[i] consecutive rows that are processed together with the
  column-index list and entry count of the run's first row.

  Input:
    A  - matrix whose Mat_SeqAIJ data supplies the values (a->a), the index arrays (a->i, a->j,
         a->diag), the index sets a->row and a->col, the inode description (a->inode.node_count,
         a->inode.size) and the scratch array a->solve_work
    bb - right-hand side; read as b[r[row]], where r are the indices of a->row

  Output:
    xx - result; written as x[c[row]], where c are the indices of a->col

  Errors:
    PETSC_ERR_COR when a->inode.size is NULL or when a node size outside 1..5 is met.

  Notes:
    The forward sweep leaves its result in a->solve_work. The backward sweep reads that array and
    overwrites it entry by entry while it writes xx (tmp and tmps point at the same storage).
    Flops are logged as 2 * a->nz - A->cmap->n.
*/
2390d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
2391d71ae5a4SJacob Faibussowitsch {
2392019b515eSShri Abhyankar   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
2393019b515eSShri Abhyankar   IS                 iscol = a->col, isrow = a->row;
2394019b515eSShri Abhyankar   const PetscInt    *r, *c, *rout, *cout;
23958758e1faSBarry Smith   PetscInt           i, j, n = A->rmap->n;
23968758e1faSBarry Smith   PetscInt           node_max, row, nsz, aii, i0, i1, nz;
23978758e1faSBarry Smith   const PetscInt    *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj;
2398019b515eSShri Abhyankar   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
2399019b515eSShri Abhyankar   PetscScalar        sum1, sum2, sum3, sum4, sum5;
2400019b515eSShri Abhyankar   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
2401019b515eSShri Abhyankar   const PetscScalar *b;
2402019b515eSShri Abhyankar 
2403019b515eSShri Abhyankar   PetscFunctionBegin;
240408401ef6SPierre Jolivet   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2405019b515eSShri Abhyankar   node_max = a->inode.node_count;
2406019b515eSShri Abhyankar   ns       = a->inode.size; /* Node Size array */
2407019b515eSShri Abhyankar 
24089566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
24099566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(xx, &x));
  /* scratch array: receives the forward-sweep result, then is overwritten by the backward sweep */
2410019b515eSShri Abhyankar   tmp = a->solve_work;
2411019b515eSShri Abhyankar 
  /* r indexes b on input (from a->row); c indexes x on output (from a->col) */
24129371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
24139371c9d4SSatish Balay   r = rout;
24149371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
24159371c9d4SSatish Balay   c = cout;
2416019b515eSShri Abhyankar 
2417019b515eSShri Abhyankar   /* forward solve the lower triangular */
2418019b515eSShri Abhyankar   tmps = tmp;
2419019b515eSShri Abhyankar   aa   = a_a;
2420019b515eSShri Abhyankar   aj   = a_j;
2421019b515eSShri Abhyankar   ad   = a->diag;
2422019b515eSShri Abhyankar 
2423019b515eSShri Abhyankar   for (i = 0, row = 0; i < node_max; ++i) {
    /* rows row .. row + nsz - 1 form node i; vi and nz are taken from the first row and reused for all of them */
2424019b515eSShri Abhyankar     nsz = ns[i];
2425019b515eSShri Abhyankar     aii = ai[row];
2426019b515eSShri Abhyankar     v1  = aa + aii;
2427019b515eSShri Abhyankar     vi  = aj + aii;
2428019b515eSShri Abhyankar     nz  = ai[row + 1] - ai[row];
2429019b515eSShri Abhyankar 
243098991853SShri Abhyankar     if (i < node_max - 1) {
243198991853SShri Abhyankar       /* Prefetch the indices for the next block */
243250d8bf02SJed Brown       PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
243398991853SShri Abhyankar       /* Prefetch the data for the next block */
243450d8bf02SJed Brown       PetscPrefetchBlock(aa + ai[row + nsz], ai[row + nsz + ns[i + 1]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
243598991853SShri Abhyankar     }
243698991853SShri Abhyankar 
2437019b515eSShri Abhyankar     switch (nsz) { /* Each loop in 'case' is unrolled */
2438019b515eSShri Abhyankar     case 1:
2439019b515eSShri Abhyankar       sum1 = b[r[row]];
      /* two entries per pass; the if that follows picks up the last entry when nz is odd */
2440019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2441019b515eSShri Abhyankar         i0   = vi[j];
2442019b515eSShri Abhyankar         i1   = vi[j + 1];
2443019b515eSShri Abhyankar         tmp0 = tmps[i0];
2444019b515eSShri Abhyankar         tmp1 = tmps[i1];
2445019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2446019b515eSShri Abhyankar       }
2447019b515eSShri Abhyankar       if (j == nz - 1) {
2448019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2449019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2450019b515eSShri Abhyankar       }
2451019b515eSShri Abhyankar       tmp[row++] = sum1;
2452019b515eSShri Abhyankar       break;
2453019b515eSShri Abhyankar     case 2:
2454019b515eSShri Abhyankar       sum1 = b[r[row]];
2455019b515eSShri Abhyankar       sum2 = b[r[row + 1]];
2456019b515eSShri Abhyankar       v2   = aa + ai[row + 1];
2457019b515eSShri Abhyankar 
2458019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2459019b515eSShri Abhyankar         i0   = vi[j];
2460019b515eSShri Abhyankar         i1   = vi[j + 1];
2461019b515eSShri Abhyankar         tmp0 = tmps[i0];
2462019b515eSShri Abhyankar         tmp1 = tmps[i1];
2463019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2464019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2465019b515eSShri Abhyankar       }
2466019b515eSShri Abhyankar       if (j == nz - 1) {
2467019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2468019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2469019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0;
2470019b515eSShri Abhyankar       }
      /* the entry of row + 1 that follows the nz shared ones multiplies the value just computed for row */
2471019b515eSShri Abhyankar       sum2 -= v2[nz] * sum1;
2472019b515eSShri Abhyankar       tmp[row++] = sum1;
2473019b515eSShri Abhyankar       tmp[row++] = sum2;
2474019b515eSShri Abhyankar       break;
2475019b515eSShri Abhyankar     case 3:
2476019b515eSShri Abhyankar       sum1 = b[r[row]];
2477019b515eSShri Abhyankar       sum2 = b[r[row + 1]];
2478019b515eSShri Abhyankar       sum3 = b[r[row + 2]];
2479019b515eSShri Abhyankar       v2   = aa + ai[row + 1];
2480019b515eSShri Abhyankar       v3   = aa + ai[row + 2];
2481019b515eSShri Abhyankar 
2482019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2483019b515eSShri Abhyankar         i0   = vi[j];
2484019b515eSShri Abhyankar         i1   = vi[j + 1];
2485019b515eSShri Abhyankar         tmp0 = tmps[i0];
2486019b515eSShri Abhyankar         tmp1 = tmps[i1];
2487019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2488019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2489019b515eSShri Abhyankar         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2490019b515eSShri Abhyankar       }
2491019b515eSShri Abhyankar       if (j == nz - 1) {
2492019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2493019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2494019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0;
2495019b515eSShri Abhyankar         sum3 -= v3[j] * tmp0;
2496019b515eSShri Abhyankar       }
      /* entries past the nz shared ones: index nz multiplies row's value, nz + 1 multiplies row + 1's value */
2497019b515eSShri Abhyankar       sum2 -= v2[nz] * sum1;
2498019b515eSShri Abhyankar       sum3 -= v3[nz] * sum1;
2499019b515eSShri Abhyankar       sum3 -= v3[nz + 1] * sum2;
2500019b515eSShri Abhyankar       tmp[row++] = sum1;
2501019b515eSShri Abhyankar       tmp[row++] = sum2;
2502019b515eSShri Abhyankar       tmp[row++] = sum3;
2503019b515eSShri Abhyankar       break;
2504019b515eSShri Abhyankar 
2505019b515eSShri Abhyankar     case 4:
2506019b515eSShri Abhyankar       sum1 = b[r[row]];
2507019b515eSShri Abhyankar       sum2 = b[r[row + 1]];
2508019b515eSShri Abhyankar       sum3 = b[r[row + 2]];
2509019b515eSShri Abhyankar       sum4 = b[r[row + 3]];
2510019b515eSShri Abhyankar       v2   = aa + ai[row + 1];
2511019b515eSShri Abhyankar       v3   = aa + ai[row + 2];
2512019b515eSShri Abhyankar       v4   = aa + ai[row + 3];
2513019b515eSShri Abhyankar 
2514019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2515019b515eSShri Abhyankar         i0   = vi[j];
2516019b515eSShri Abhyankar         i1   = vi[j + 1];
2517019b515eSShri Abhyankar         tmp0 = tmps[i0];
2518019b515eSShri Abhyankar         tmp1 = tmps[i1];
2519019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2520019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2521019b515eSShri Abhyankar         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2522019b515eSShri Abhyankar         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2523019b515eSShri Abhyankar       }
2524019b515eSShri Abhyankar       if (j == nz - 1) {
2525019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2526019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2527019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0;
2528019b515eSShri Abhyankar         sum3 -= v3[j] * tmp0;
2529019b515eSShri Abhyankar         sum4 -= v4[j] * tmp0;
2530019b515eSShri Abhyankar       }
2531019b515eSShri Abhyankar       sum2 -= v2[nz] * sum1;
2532019b515eSShri Abhyankar       sum3 -= v3[nz] * sum1;
2533019b515eSShri Abhyankar       sum4 -= v4[nz] * sum1;
2534019b515eSShri Abhyankar       sum3 -= v3[nz + 1] * sum2;
2535019b515eSShri Abhyankar       sum4 -= v4[nz + 1] * sum2;
2536019b515eSShri Abhyankar       sum4 -= v4[nz + 2] * sum3;
2537019b515eSShri Abhyankar 
2538019b515eSShri Abhyankar       tmp[row++] = sum1;
2539019b515eSShri Abhyankar       tmp[row++] = sum2;
2540019b515eSShri Abhyankar       tmp[row++] = sum3;
2541019b515eSShri Abhyankar       tmp[row++] = sum4;
2542019b515eSShri Abhyankar       break;
2543019b515eSShri Abhyankar     case 5:
2544019b515eSShri Abhyankar       sum1 = b[r[row]];
2545019b515eSShri Abhyankar       sum2 = b[r[row + 1]];
2546019b515eSShri Abhyankar       sum3 = b[r[row + 2]];
2547019b515eSShri Abhyankar       sum4 = b[r[row + 3]];
2548019b515eSShri Abhyankar       sum5 = b[r[row + 4]];
2549019b515eSShri Abhyankar       v2   = aa + ai[row + 1];
2550019b515eSShri Abhyankar       v3   = aa + ai[row + 2];
2551019b515eSShri Abhyankar       v4   = aa + ai[row + 3];
2552019b515eSShri Abhyankar       v5   = aa + ai[row + 4];
2553019b515eSShri Abhyankar 
2554019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2555019b515eSShri Abhyankar         i0   = vi[j];
2556019b515eSShri Abhyankar         i1   = vi[j + 1];
2557019b515eSShri Abhyankar         tmp0 = tmps[i0];
2558019b515eSShri Abhyankar         tmp1 = tmps[i1];
2559019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2560019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2561019b515eSShri Abhyankar         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2562019b515eSShri Abhyankar         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2563019b515eSShri Abhyankar         sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1;
2564019b515eSShri Abhyankar       }
2565019b515eSShri Abhyankar       if (j == nz - 1) {
2566019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2567019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2568019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0;
2569019b515eSShri Abhyankar         sum3 -= v3[j] * tmp0;
2570019b515eSShri Abhyankar         sum4 -= v4[j] * tmp0;
2571019b515eSShri Abhyankar         sum5 -= v5[j] * tmp0;
2572019b515eSShri Abhyankar       }
2573019b515eSShri Abhyankar 
2574019b515eSShri Abhyankar       sum2 -= v2[nz] * sum1;
2575019b515eSShri Abhyankar       sum3 -= v3[nz] * sum1;
2576019b515eSShri Abhyankar       sum4 -= v4[nz] * sum1;
2577019b515eSShri Abhyankar       sum5 -= v5[nz] * sum1;
2578019b515eSShri Abhyankar       sum3 -= v3[nz + 1] * sum2;
2579019b515eSShri Abhyankar       sum4 -= v4[nz + 1] * sum2;
2580019b515eSShri Abhyankar       sum5 -= v5[nz + 1] * sum2;
2581019b515eSShri Abhyankar       sum4 -= v4[nz + 2] * sum3;
2582019b515eSShri Abhyankar       sum5 -= v5[nz + 2] * sum3;
2583019b515eSShri Abhyankar       sum5 -= v5[nz + 3] * sum4;
2584019b515eSShri Abhyankar 
2585019b515eSShri Abhyankar       tmp[row++] = sum1;
2586019b515eSShri Abhyankar       tmp[row++] = sum2;
2587019b515eSShri Abhyankar       tmp[row++] = sum3;
2588019b515eSShri Abhyankar       tmp[row++] = sum4;
2589019b515eSShri Abhyankar       tmp[row++] = sum5;
2590019b515eSShri Abhyankar       break;
    /* node sizes outside 1..5 have no unrolled kernel and end the solve with an error */
2591d71ae5a4SJacob Faibussowitsch     default:
2592d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2593019b515eSShri Abhyankar     }
2594019b515eSShri Abhyankar   }
2595019b515eSShri Abhyankar   /* backward solve the upper triangular */
  /* nodes and rows are visited in decreasing order; on entry to each pass row is the last row of node i */
2596019b515eSShri Abhyankar   for (i = node_max - 1, row = n - 1; i >= 0; i--) {
2597019b515eSShri Abhyankar     nsz = ns[i];
    /* the nz entries used for this row sit at positions ad[row + 1] + 1 .. ad[row] - 1, so v1[nz] is aa[ad[row]] */
2598019b515eSShri Abhyankar     aii = ad[row + 1] + 1;
2599019b515eSShri Abhyankar     v1  = aa + aii;
2600019b515eSShri Abhyankar     vi  = aj + aii;
2601019b515eSShri Abhyankar     nz  = ad[row] - ad[row + 1] - 1;
260298991853SShri Abhyankar 
260398991853SShri Abhyankar     if (i > 0) {
260498991853SShri Abhyankar       /* Prefetch the indices for the next block */
260550d8bf02SJed Brown       PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
260698991853SShri Abhyankar       /* Prefetch the data for the next block */
260750d8bf02SJed Brown       PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[row - nsz - ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
260898991853SShri Abhyankar     }
260998991853SShri Abhyankar 
2610019b515eSShri Abhyankar     switch (nsz) { /* Each loop in 'case' is unrolled */
2611019b515eSShri Abhyankar     case 1:
2612019b515eSShri Abhyankar       sum1 = tmp[row];
2613019b515eSShri Abhyankar 
2614019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2615019b515eSShri Abhyankar         i0   = vi[j];
2616019b515eSShri Abhyankar         i1   = vi[j + 1];
2617019b515eSShri Abhyankar         tmp0 = tmps[i0];
2618019b515eSShri Abhyankar         tmp1 = tmps[i1];
2619019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2620019b515eSShri Abhyankar       }
2621019b515eSShri Abhyankar       if (j == nz - 1) {
2622019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2623019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2624019b515eSShri Abhyankar       }
      /* multiply (not divide) by aa[ad[row]]: presumably the factor stores the reciprocal of the diagonal there -- TODO confirm */
      /* the result is written to x and also back to tmp so that earlier rows can read it through tmps */
26259371c9d4SSatish Balay       x[c[row]] = tmp[row] = sum1 * v1[nz];
26269371c9d4SSatish Balay       row--;
2627019b515eSShri Abhyankar       break;
2628019b515eSShri Abhyankar     case 2:
2629019b515eSShri Abhyankar       sum1 = tmp[row];
2630019b515eSShri Abhyankar       sum2 = tmp[row - 1];
      /* v2 starts right after aa[ad[row]]: v2[0] multiplies the value of row, v2[j + 1] lines up with vi[j] */
2631019b515eSShri Abhyankar       v2   = aa + ad[row] + 1;
2632019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2633019b515eSShri Abhyankar         i0   = vi[j];
2634019b515eSShri Abhyankar         i1   = vi[j + 1];
2635019b515eSShri Abhyankar         tmp0 = tmps[i0];
2636019b515eSShri Abhyankar         tmp1 = tmps[i1];
2637019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2638019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2639019b515eSShri Abhyankar       }
2640019b515eSShri Abhyankar       if (j == nz - 1) {
2641019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2642019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2643019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0;
2644019b515eSShri Abhyankar       }
2645019b515eSShri Abhyankar 
26469371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
26479371c9d4SSatish Balay       row--;
2648019b515eSShri Abhyankar       sum2 -= v2[0] * tmp0;
26499371c9d4SSatish Balay       x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
26509371c9d4SSatish Balay       row--;
2651019b515eSShri Abhyankar       break;
2652019b515eSShri Abhyankar     case 3:
2653019b515eSShri Abhyankar       sum1 = tmp[row];
2654019b515eSShri Abhyankar       sum2 = tmp[row - 1];
2655019b515eSShri Abhyankar       sum3 = tmp[row - 2];
      /* each earlier row of the node has one more leading entry, hence the growing offsets j + 1, j + 2, ... */
2656019b515eSShri Abhyankar       v2   = aa + ad[row] + 1;
2657019b515eSShri Abhyankar       v3   = aa + ad[row - 1] + 1;
2658019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2659019b515eSShri Abhyankar         i0   = vi[j];
2660019b515eSShri Abhyankar         i1   = vi[j + 1];
2661019b515eSShri Abhyankar         tmp0 = tmps[i0];
2662019b515eSShri Abhyankar         tmp1 = tmps[i1];
2663019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2664019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2665019b515eSShri Abhyankar         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2666019b515eSShri Abhyankar       }
2667019b515eSShri Abhyankar       if (j == nz - 1) {
2668019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2669019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2670019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0;
2671019b515eSShri Abhyankar         sum3 -= v3[j + 2] * tmp0;
2672019b515eSShri Abhyankar       }
26739371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
26749371c9d4SSatish Balay       row--;
2675019b515eSShri Abhyankar       sum2 -= v2[0] * tmp0;
2676019b515eSShri Abhyankar       sum3 -= v3[1] * tmp0;
26779371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
26789371c9d4SSatish Balay       row--;
2679019b515eSShri Abhyankar       sum3 -= v3[0] * tmp0;
26809371c9d4SSatish Balay       x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
26819371c9d4SSatish Balay       row--;
2682019b515eSShri Abhyankar 
2683019b515eSShri Abhyankar       break;
2684019b515eSShri Abhyankar     case 4:
2685019b515eSShri Abhyankar       sum1 = tmp[row];
2686019b515eSShri Abhyankar       sum2 = tmp[row - 1];
2687019b515eSShri Abhyankar       sum3 = tmp[row - 2];
2688019b515eSShri Abhyankar       sum4 = tmp[row - 3];
2689019b515eSShri Abhyankar       v2   = aa + ad[row] + 1;
2690019b515eSShri Abhyankar       v3   = aa + ad[row - 1] + 1;
2691019b515eSShri Abhyankar       v4   = aa + ad[row - 2] + 1;
2692019b515eSShri Abhyankar 
2693019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2694019b515eSShri Abhyankar         i0   = vi[j];
2695019b515eSShri Abhyankar         i1   = vi[j + 1];
2696019b515eSShri Abhyankar         tmp0 = tmps[i0];
2697019b515eSShri Abhyankar         tmp1 = tmps[i1];
2698019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2699019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2700019b515eSShri Abhyankar         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2701019b515eSShri Abhyankar         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2702019b515eSShri Abhyankar       }
2703019b515eSShri Abhyankar       if (j == nz - 1) {
2704019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2705019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2706019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0;
2707019b515eSShri Abhyankar         sum3 -= v3[j + 2] * tmp0;
2708019b515eSShri Abhyankar         sum4 -= v4[j + 3] * tmp0;
2709019b515eSShri Abhyankar       }
2710019b515eSShri Abhyankar 
27119371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
27129371c9d4SSatish Balay       row--;
2713019b515eSShri Abhyankar       sum2 -= v2[0] * tmp0;
2714019b515eSShri Abhyankar       sum3 -= v3[1] * tmp0;
2715019b515eSShri Abhyankar       sum4 -= v4[2] * tmp0;
27169371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
27179371c9d4SSatish Balay       row--;
2718019b515eSShri Abhyankar       sum3 -= v3[0] * tmp0;
2719019b515eSShri Abhyankar       sum4 -= v4[1] * tmp0;
27209371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
27219371c9d4SSatish Balay       row--;
2722019b515eSShri Abhyankar       sum4 -= v4[0] * tmp0;
27239371c9d4SSatish Balay       x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
27249371c9d4SSatish Balay       row--;
2725019b515eSShri Abhyankar       break;
2726019b515eSShri Abhyankar     case 5:
2727019b515eSShri Abhyankar       sum1 = tmp[row];
2728019b515eSShri Abhyankar       sum2 = tmp[row - 1];
2729019b515eSShri Abhyankar       sum3 = tmp[row - 2];
2730019b515eSShri Abhyankar       sum4 = tmp[row - 3];
2731019b515eSShri Abhyankar       sum5 = tmp[row - 4];
2732019b515eSShri Abhyankar       v2   = aa + ad[row] + 1;
2733019b515eSShri Abhyankar       v3   = aa + ad[row - 1] + 1;
2734019b515eSShri Abhyankar       v4   = aa + ad[row - 2] + 1;
2735019b515eSShri Abhyankar       v5   = aa + ad[row - 3] + 1;
2736019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2737019b515eSShri Abhyankar         i0   = vi[j];
2738019b515eSShri Abhyankar         i1   = vi[j + 1];
2739019b515eSShri Abhyankar         tmp0 = tmps[i0];
2740019b515eSShri Abhyankar         tmp1 = tmps[i1];
2741019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2742019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2743019b515eSShri Abhyankar         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2744019b515eSShri Abhyankar         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2745019b515eSShri Abhyankar         sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1;
2746019b515eSShri Abhyankar       }
2747019b515eSShri Abhyankar       if (j == nz - 1) {
2748019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2749019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2750019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0;
2751019b515eSShri Abhyankar         sum3 -= v3[j + 2] * tmp0;
2752019b515eSShri Abhyankar         sum4 -= v4[j + 3] * tmp0;
2753019b515eSShri Abhyankar         sum5 -= v5[j + 4] * tmp0;
2754019b515eSShri Abhyankar       }
2755019b515eSShri Abhyankar 
27569371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
27579371c9d4SSatish Balay       row--;
2758019b515eSShri Abhyankar       sum2 -= v2[0] * tmp0;
2759019b515eSShri Abhyankar       sum3 -= v3[1] * tmp0;
2760019b515eSShri Abhyankar       sum4 -= v4[2] * tmp0;
2761019b515eSShri Abhyankar       sum5 -= v5[3] * tmp0;
27629371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
27639371c9d4SSatish Balay       row--;
2764019b515eSShri Abhyankar       sum3 -= v3[0] * tmp0;
2765019b515eSShri Abhyankar       sum4 -= v4[1] * tmp0;
2766019b515eSShri Abhyankar       sum5 -= v5[2] * tmp0;
27679371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
27689371c9d4SSatish Balay       row--;
2769019b515eSShri Abhyankar       sum4 -= v4[0] * tmp0;
2770019b515eSShri Abhyankar       sum5 -= v5[1] * tmp0;
27719371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
27729371c9d4SSatish Balay       row--;
2773019b515eSShri Abhyankar       sum5 -= v5[0] * tmp0;
27749371c9d4SSatish Balay       x[c[row]] = tmp[row] = sum5 * v5[nz + 4];
27759371c9d4SSatish Balay       row--;
2776019b515eSShri Abhyankar       break;
2777d71ae5a4SJacob Faibussowitsch     default:
2778d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2779019b515eSShri Abhyankar     }
2780019b515eSShri Abhyankar   }
27819566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
27829566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
27839566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
27849566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(xx, &x));
27859566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
27863ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2787019b515eSShri Abhyankar }
2788019b515eSShri Abhyankar 
27894c1414c8SBarry Smith /*
27904c1414c8SBarry Smith      Makes a longer coloring[] array and calls the usual code with that
27914c1414c8SBarry Smith */
279266976f2fSJacob Faibussowitsch static PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring)
2793d71ae5a4SJacob Faibussowitsch {
27944c1414c8SBarry Smith   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)mat->data;
2795d0f46423SBarry Smith   PetscInt         n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size, row;
27964c1414c8SBarry Smith   PetscInt        *colorused, i;
27974c1414c8SBarry Smith   ISColoringValue *newcolor;
27984c1414c8SBarry Smith 
27994c1414c8SBarry Smith   PetscFunctionBegin;
280008401ef6SPierre Jolivet   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
28019566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(n + 1, &newcolor));
28024c1414c8SBarry Smith   /* loop over inodes, marking a color for each column*/
28034c1414c8SBarry Smith   row = 0;
28044c1414c8SBarry Smith   for (i = 0; i < m; i++) {
2805ad540459SPierre Jolivet     for (j = 0; j < ns[i]; j++) newcolor[row++] = coloring[i] + j * ncolors;
28064c1414c8SBarry Smith   }
28074c1414c8SBarry Smith 
28084c1414c8SBarry Smith   /* eliminate unneeded colors */
28099566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(5 * ncolors, &colorused));
2810ad540459SPierre Jolivet   for (i = 0; i < n; i++) colorused[newcolor[i]] = 1;
28114c1414c8SBarry Smith 
2812ad540459SPierre Jolivet   for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1];
28134c1414c8SBarry Smith   ncolors = colorused[5 * ncolors - 1];
2814ad540459SPierre Jolivet   for (i = 0; i < n; i++) newcolor[i] = colorused[newcolor[i]] - 1;
28159566063dSJacob Faibussowitsch   PetscCall(PetscFree(colorused));
28169566063dSJacob Faibussowitsch   PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring));
28179566063dSJacob Faibussowitsch   PetscCall(PetscFree(coloring));
28183ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
28194c1414c8SBarry Smith }
28204c1414c8SBarry Smith 
2821af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h>
28222af78befSBarry Smith 
2823d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
2824d71ae5a4SJacob Faibussowitsch {
28252af78befSBarry Smith   Mat_SeqAIJ        *a    = (Mat_SeqAIJ *)A->data;
28267aaeff0aSMatthew G. Knepley   PetscScalar        sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3;
28275850ef23SBarry Smith   MatScalar         *ibdiag, *bdiag, work[25], *t;
2828a8b09249SBarry Smith   PetscScalar       *x, tmp4, tmp5, x1, x2, x3, x4, x5;
28297aaeff0aSMatthew G. Knepley   const MatScalar   *v = a->a, *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL;
28305850ef23SBarry Smith   const PetscScalar *xb, *b;
28317b6c816cSBarry Smith   PetscReal          zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0;
28328758e1faSBarry Smith   PetscInt           n, m = a->inode.node_count, cnt = 0, i, j, row, i1, i2;
28338758e1faSBarry Smith   PetscInt           sz, k, ipvt[5];
28347b6c816cSBarry Smith   PetscBool          allowzeropivot, zeropivotdetected;
28358758e1faSBarry Smith   const PetscInt    *sizes = a->inode.size, *idx, *diag = a->diag, *ii = a->i;
28362af78befSBarry Smith 
28372af78befSBarry Smith   PetscFunctionBegin;
2838a455e926SHong Zhang   allowzeropivot = PetscNot(A->erroriffailure);
283908401ef6SPierre Jolivet   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
284008401ef6SPierre Jolivet   PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode");
284108401ef6SPierre Jolivet   PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode");
28422af78befSBarry Smith 
284371f1c65dSBarry Smith   if (!a->inode.ibdiagvalid) {
28442af78befSBarry Smith     if (!a->inode.ibdiag) {
28452af78befSBarry Smith       /* calculate space needed for diagonal blocks */
2846ad540459SPierre Jolivet       for (i = 0; i < m; i++) cnt += sizes[i] * sizes[i];
2847f0d39aaaSBarry Smith       a->inode.bdiagsize = cnt;
28482205254eSKarl Rupp 
28499566063dSJacob Faibussowitsch       PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work));
285071f1c65dSBarry Smith     }
285171f1c65dSBarry Smith 
285271f1c65dSBarry Smith     /* copy over the diagonal blocks and invert them */
28532af78befSBarry Smith     ibdiag = a->inode.ibdiag;
28542af78befSBarry Smith     bdiag  = a->inode.bdiag;
28552af78befSBarry Smith     cnt    = 0;
28562af78befSBarry Smith     for (i = 0, row = 0; i < m; i++) {
28572af78befSBarry Smith       for (j = 0; j < sizes[i]; j++) {
2858ad540459SPierre Jolivet         for (k = 0; k < sizes[i]; k++) bdiag[cnt + k * sizes[i] + j] = v[diag[row + j] - j + k];
28592af78befSBarry Smith       }
28609566063dSJacob Faibussowitsch       PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, sizes[i] * sizes[i]));
28612af78befSBarry Smith 
28622af78befSBarry Smith       switch (sizes[i]) {
28632af78befSBarry Smith       case 1:
28642af78befSBarry Smith         /* Create matrix data structure */
28658e0e2a9aSHong Zhang         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
28668e0e2a9aSHong Zhang           if (allowzeropivot) {
28677b6c816cSBarry Smith             A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
28687b6c816cSBarry Smith             A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
28697b6c816cSBarry Smith             A->factorerror_zeropivot_row   = row;
28709566063dSJacob Faibussowitsch             PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row));
287198921bdaSJacob Faibussowitsch           } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row);
28728e0e2a9aSHong Zhang         }
287364c62002SMatthew Knepley         ibdiag[cnt] = 1.0 / ibdiag[cnt];
28742af78befSBarry Smith         break;
28752af78befSBarry Smith       case 2:
28769566063dSJacob Faibussowitsch         PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
28777b6c816cSBarry Smith         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
28782af78befSBarry Smith         break;
28792af78befSBarry Smith       case 3:
28809566063dSJacob Faibussowitsch         PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
28817b6c816cSBarry Smith         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
28822af78befSBarry Smith         break;
28832af78befSBarry Smith       case 4:
28849566063dSJacob Faibussowitsch         PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
28857b6c816cSBarry Smith         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
28862af78befSBarry Smith         break;
28872af78befSBarry Smith       case 5:
28889566063dSJacob Faibussowitsch         PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
28897b6c816cSBarry Smith         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
28902af78befSBarry Smith         break;
2891d71ae5a4SJacob Faibussowitsch       default:
2892d71ae5a4SJacob Faibussowitsch         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
28932af78befSBarry Smith       }
28942af78befSBarry Smith       cnt += sizes[i] * sizes[i];
28952af78befSBarry Smith       row += sizes[i];
28962af78befSBarry Smith     }
289771f1c65dSBarry Smith     a->inode.ibdiagvalid = PETSC_TRUE;
28982af78befSBarry Smith   }
28992af78befSBarry Smith   ibdiag = a->inode.ibdiag;
29002af78befSBarry Smith   bdiag  = a->inode.bdiag;
29015850ef23SBarry Smith   t      = a->inode.ssor_work;
29022af78befSBarry Smith 
29039566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
29049566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
29055850ef23SBarry Smith   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
29065850ef23SBarry Smith   if (flag & SOR_ZERO_INITIAL_GUESS) {
29072af78befSBarry Smith     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
29088862d2efSBarry Smith       for (i = 0, row = 0; i < m; i++) {
29098862d2efSBarry Smith         sz  = diag[row] - ii[row];
29108862d2efSBarry Smith         v1  = a->a + ii[row];
29118862d2efSBarry Smith         idx = a->j + ii[row];
29128862d2efSBarry Smith 
29134108e4d5SBarry Smith         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
29148862d2efSBarry Smith         switch (sizes[i]) {
29158862d2efSBarry Smith         case 1:
29168862d2efSBarry Smith 
29178862d2efSBarry Smith           sum1 = b[row];
29188862d2efSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
29198862d2efSBarry Smith             i1 = idx[0];
29208862d2efSBarry Smith             i2 = idx[1];
29218862d2efSBarry Smith             idx += 2;
29228862d2efSBarry Smith             tmp0 = x[i1];
29238862d2efSBarry Smith             tmp1 = x[i2];
29249371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
29259371c9d4SSatish Balay             v1 += 2;
29268862d2efSBarry Smith           }
29278862d2efSBarry Smith 
29288862d2efSBarry Smith           if (n == sz - 1) {
2929f0d39aaaSBarry Smith             tmp0 = x[*idx];
2930f0d39aaaSBarry Smith             sum1 -= *v1 * tmp0;
29318862d2efSBarry Smith           }
29325850ef23SBarry Smith           t[row]   = sum1;
29338862d2efSBarry Smith           x[row++] = sum1 * (*ibdiag++);
29348862d2efSBarry Smith           break;
2935f0d39aaaSBarry Smith         case 2:
2936f0d39aaaSBarry Smith           v2   = a->a + ii[row + 1];
2937f0d39aaaSBarry Smith           sum1 = b[row];
2938f0d39aaaSBarry Smith           sum2 = b[row + 1];
2939f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
2940f0d39aaaSBarry Smith             i1 = idx[0];
2941f0d39aaaSBarry Smith             i2 = idx[1];
2942f0d39aaaSBarry Smith             idx += 2;
2943f0d39aaaSBarry Smith             tmp0 = x[i1];
2944f0d39aaaSBarry Smith             tmp1 = x[i2];
29459371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
29469371c9d4SSatish Balay             v1 += 2;
29479371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
29489371c9d4SSatish Balay             v2 += 2;
2949f0d39aaaSBarry Smith           }
2950f0d39aaaSBarry Smith 
2951f0d39aaaSBarry Smith           if (n == sz - 1) {
2952f0d39aaaSBarry Smith             tmp0 = x[*idx];
2953f0d39aaaSBarry Smith             sum1 -= v1[0] * tmp0;
2954f0d39aaaSBarry Smith             sum2 -= v2[0] * tmp0;
2955f0d39aaaSBarry Smith           }
29565850ef23SBarry Smith           t[row]     = sum1;
29575850ef23SBarry Smith           t[row + 1] = sum2;
2958f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[2];
2959f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[3];
2960f0d39aaaSBarry Smith           ibdiag += 4;
2961f0d39aaaSBarry Smith           break;
2962f0d39aaaSBarry Smith         case 3:
2963f0d39aaaSBarry Smith           v2   = a->a + ii[row + 1];
2964f0d39aaaSBarry Smith           v3   = a->a + ii[row + 2];
2965f0d39aaaSBarry Smith           sum1 = b[row];
2966f0d39aaaSBarry Smith           sum2 = b[row + 1];
2967f0d39aaaSBarry Smith           sum3 = b[row + 2];
2968f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
2969f0d39aaaSBarry Smith             i1 = idx[0];
2970f0d39aaaSBarry Smith             i2 = idx[1];
2971f0d39aaaSBarry Smith             idx += 2;
2972f0d39aaaSBarry Smith             tmp0 = x[i1];
2973f0d39aaaSBarry Smith             tmp1 = x[i2];
29749371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
29759371c9d4SSatish Balay             v1 += 2;
29769371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
29779371c9d4SSatish Balay             v2 += 2;
29789371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
29799371c9d4SSatish Balay             v3 += 2;
2980f0d39aaaSBarry Smith           }
2981f0d39aaaSBarry Smith 
2982f0d39aaaSBarry Smith           if (n == sz - 1) {
2983f0d39aaaSBarry Smith             tmp0 = x[*idx];
2984f0d39aaaSBarry Smith             sum1 -= v1[0] * tmp0;
2985f0d39aaaSBarry Smith             sum2 -= v2[0] * tmp0;
2986f0d39aaaSBarry Smith             sum3 -= v3[0] * tmp0;
2987f0d39aaaSBarry Smith           }
29885850ef23SBarry Smith           t[row]     = sum1;
29895850ef23SBarry Smith           t[row + 1] = sum2;
29905850ef23SBarry Smith           t[row + 2] = sum3;
2991f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
2992f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
2993f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
2994f0d39aaaSBarry Smith           ibdiag += 9;
2995f0d39aaaSBarry Smith           break;
2996f0d39aaaSBarry Smith         case 4:
2997f0d39aaaSBarry Smith           v2   = a->a + ii[row + 1];
2998f0d39aaaSBarry Smith           v3   = a->a + ii[row + 2];
2999f0d39aaaSBarry Smith           v4   = a->a + ii[row + 3];
3000f0d39aaaSBarry Smith           sum1 = b[row];
3001f0d39aaaSBarry Smith           sum2 = b[row + 1];
3002f0d39aaaSBarry Smith           sum3 = b[row + 2];
3003f0d39aaaSBarry Smith           sum4 = b[row + 3];
3004f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
3005f0d39aaaSBarry Smith             i1 = idx[0];
3006f0d39aaaSBarry Smith             i2 = idx[1];
3007f0d39aaaSBarry Smith             idx += 2;
3008f0d39aaaSBarry Smith             tmp0 = x[i1];
3009f0d39aaaSBarry Smith             tmp1 = x[i2];
30109371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
30119371c9d4SSatish Balay             v1 += 2;
30129371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
30139371c9d4SSatish Balay             v2 += 2;
30149371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
30159371c9d4SSatish Balay             v3 += 2;
30169371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
30179371c9d4SSatish Balay             v4 += 2;
3018f0d39aaaSBarry Smith           }
3019f0d39aaaSBarry Smith 
3020f0d39aaaSBarry Smith           if (n == sz - 1) {
3021f0d39aaaSBarry Smith             tmp0 = x[*idx];
3022f0d39aaaSBarry Smith             sum1 -= v1[0] * tmp0;
3023f0d39aaaSBarry Smith             sum2 -= v2[0] * tmp0;
3024f0d39aaaSBarry Smith             sum3 -= v3[0] * tmp0;
3025f0d39aaaSBarry Smith             sum4 -= v4[0] * tmp0;
3026f0d39aaaSBarry Smith           }
30275850ef23SBarry Smith           t[row]     = sum1;
30285850ef23SBarry Smith           t[row + 1] = sum2;
30295850ef23SBarry Smith           t[row + 2] = sum3;
30305850ef23SBarry Smith           t[row + 3] = sum4;
3031f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3032f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3033f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3034f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3035f0d39aaaSBarry Smith           ibdiag += 16;
3036f0d39aaaSBarry Smith           break;
3037f0d39aaaSBarry Smith         case 5:
3038f0d39aaaSBarry Smith           v2   = a->a + ii[row + 1];
3039f0d39aaaSBarry Smith           v3   = a->a + ii[row + 2];
3040f0d39aaaSBarry Smith           v4   = a->a + ii[row + 3];
3041f0d39aaaSBarry Smith           v5   = a->a + ii[row + 4];
3042f0d39aaaSBarry Smith           sum1 = b[row];
3043f0d39aaaSBarry Smith           sum2 = b[row + 1];
3044f0d39aaaSBarry Smith           sum3 = b[row + 2];
3045f0d39aaaSBarry Smith           sum4 = b[row + 3];
3046f0d39aaaSBarry Smith           sum5 = b[row + 4];
3047f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
3048f0d39aaaSBarry Smith             i1 = idx[0];
3049f0d39aaaSBarry Smith             i2 = idx[1];
3050f0d39aaaSBarry Smith             idx += 2;
3051f0d39aaaSBarry Smith             tmp0 = x[i1];
3052f0d39aaaSBarry Smith             tmp1 = x[i2];
30539371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
30549371c9d4SSatish Balay             v1 += 2;
30559371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
30569371c9d4SSatish Balay             v2 += 2;
30579371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
30589371c9d4SSatish Balay             v3 += 2;
30599371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
30609371c9d4SSatish Balay             v4 += 2;
30619371c9d4SSatish Balay             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
30629371c9d4SSatish Balay             v5 += 2;
3063f0d39aaaSBarry Smith           }
3064f0d39aaaSBarry Smith 
3065f0d39aaaSBarry Smith           if (n == sz - 1) {
3066f0d39aaaSBarry Smith             tmp0 = x[*idx];
3067f0d39aaaSBarry Smith             sum1 -= v1[0] * tmp0;
3068f0d39aaaSBarry Smith             sum2 -= v2[0] * tmp0;
3069f0d39aaaSBarry Smith             sum3 -= v3[0] * tmp0;
3070f0d39aaaSBarry Smith             sum4 -= v4[0] * tmp0;
3071f0d39aaaSBarry Smith             sum5 -= v5[0] * tmp0;
3072f0d39aaaSBarry Smith           }
30735850ef23SBarry Smith           t[row]     = sum1;
30745850ef23SBarry Smith           t[row + 1] = sum2;
30755850ef23SBarry Smith           t[row + 2] = sum3;
30765850ef23SBarry Smith           t[row + 3] = sum4;
30775850ef23SBarry Smith           t[row + 4] = sum5;
3078f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3079f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3080f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3081f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3082f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3083f0d39aaaSBarry Smith           ibdiag += 25;
3084f0d39aaaSBarry Smith           break;
3085d71ae5a4SJacob Faibussowitsch         default:
3086d71ae5a4SJacob Faibussowitsch           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
30878862d2efSBarry Smith         }
30882af78befSBarry Smith       }
30892af78befSBarry Smith 
30905850ef23SBarry Smith       xb = t;
30919566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(a->nz));
30922af78befSBarry Smith     } else xb = b;
30932af78befSBarry Smith     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3094f0d39aaaSBarry Smith       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3095d0f46423SBarry Smith       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3096f0d39aaaSBarry Smith         ibdiag -= sizes[i] * sizes[i];
30978862d2efSBarry Smith         sz  = ii[row + 1] - diag[row] - 1;
30988862d2efSBarry Smith         v1  = a->a + diag[row] + 1;
30998862d2efSBarry Smith         idx = a->j + diag[row] + 1;
31002af78befSBarry Smith 
31014108e4d5SBarry Smith         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
31028862d2efSBarry Smith         switch (sizes[i]) {
31038862d2efSBarry Smith         case 1:
31048862d2efSBarry Smith 
31058862d2efSBarry Smith           sum1 = xb[row];
31068862d2efSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
31078862d2efSBarry Smith             i1 = idx[0];
31088862d2efSBarry Smith             i2 = idx[1];
31098862d2efSBarry Smith             idx += 2;
31108862d2efSBarry Smith             tmp0 = x[i1];
31118862d2efSBarry Smith             tmp1 = x[i2];
31129371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
31139371c9d4SSatish Balay             v1 += 2;
31148862d2efSBarry Smith           }
31158862d2efSBarry Smith 
31168862d2efSBarry Smith           if (n == sz - 1) {
3117f0d39aaaSBarry Smith             tmp0 = x[*idx];
3118f0d39aaaSBarry Smith             sum1 -= *v1 * tmp0;
31198862d2efSBarry Smith           }
3120f0d39aaaSBarry Smith           x[row--] = sum1 * (*ibdiag);
3121f0d39aaaSBarry Smith           break;
3122f0d39aaaSBarry Smith 
3123f0d39aaaSBarry Smith         case 2:
3124f0d39aaaSBarry Smith 
3125f0d39aaaSBarry Smith           sum1 = xb[row];
3126f0d39aaaSBarry Smith           sum2 = xb[row - 1];
3127f0d39aaaSBarry Smith           /* note that sum1 is associated with the second of the two rows */
3128f0d39aaaSBarry Smith           v2 = a->a + diag[row - 1] + 2;
3129f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
3130f0d39aaaSBarry Smith             i1 = idx[0];
3131f0d39aaaSBarry Smith             i2 = idx[1];
3132f0d39aaaSBarry Smith             idx += 2;
3133f0d39aaaSBarry Smith             tmp0 = x[i1];
3134f0d39aaaSBarry Smith             tmp1 = x[i2];
31359371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
31369371c9d4SSatish Balay             v1 += 2;
31379371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
31389371c9d4SSatish Balay             v2 += 2;
3139f0d39aaaSBarry Smith           }
3140f0d39aaaSBarry Smith 
3141f0d39aaaSBarry Smith           if (n == sz - 1) {
3142f0d39aaaSBarry Smith             tmp0 = x[*idx];
3143f0d39aaaSBarry Smith             sum1 -= *v1 * tmp0;
3144f0d39aaaSBarry Smith             sum2 -= *v2 * tmp0;
3145f0d39aaaSBarry Smith           }
3146f0d39aaaSBarry Smith           x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3147f0d39aaaSBarry Smith           x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3148f0d39aaaSBarry Smith           break;
3149f0d39aaaSBarry Smith         case 3:
3150f0d39aaaSBarry Smith 
3151f0d39aaaSBarry Smith           sum1 = xb[row];
3152f0d39aaaSBarry Smith           sum2 = xb[row - 1];
3153f0d39aaaSBarry Smith           sum3 = xb[row - 2];
3154f0d39aaaSBarry Smith           v2   = a->a + diag[row - 1] + 2;
3155f0d39aaaSBarry Smith           v3   = a->a + diag[row - 2] + 3;
3156f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
3157f0d39aaaSBarry Smith             i1 = idx[0];
3158f0d39aaaSBarry Smith             i2 = idx[1];
3159f0d39aaaSBarry Smith             idx += 2;
3160f0d39aaaSBarry Smith             tmp0 = x[i1];
3161f0d39aaaSBarry Smith             tmp1 = x[i2];
31629371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
31639371c9d4SSatish Balay             v1 += 2;
31649371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
31659371c9d4SSatish Balay             v2 += 2;
31669371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
31679371c9d4SSatish Balay             v3 += 2;
3168f0d39aaaSBarry Smith           }
3169f0d39aaaSBarry Smith 
3170f0d39aaaSBarry Smith           if (n == sz - 1) {
3171f0d39aaaSBarry Smith             tmp0 = x[*idx];
3172f0d39aaaSBarry Smith             sum1 -= *v1 * tmp0;
3173f0d39aaaSBarry Smith             sum2 -= *v2 * tmp0;
3174f0d39aaaSBarry Smith             sum3 -= *v3 * tmp0;
3175f0d39aaaSBarry Smith           }
3176f0d39aaaSBarry Smith           x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3177f0d39aaaSBarry Smith           x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3178f0d39aaaSBarry Smith           x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3179f0d39aaaSBarry Smith           break;
3180f0d39aaaSBarry Smith         case 4:
3181f0d39aaaSBarry Smith 
3182f0d39aaaSBarry Smith           sum1 = xb[row];
3183f0d39aaaSBarry Smith           sum2 = xb[row - 1];
3184f0d39aaaSBarry Smith           sum3 = xb[row - 2];
3185f0d39aaaSBarry Smith           sum4 = xb[row - 3];
3186f0d39aaaSBarry Smith           v2   = a->a + diag[row - 1] + 2;
3187f0d39aaaSBarry Smith           v3   = a->a + diag[row - 2] + 3;
3188f0d39aaaSBarry Smith           v4   = a->a + diag[row - 3] + 4;
3189f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
3190f0d39aaaSBarry Smith             i1 = idx[0];
3191f0d39aaaSBarry Smith             i2 = idx[1];
3192f0d39aaaSBarry Smith             idx += 2;
3193f0d39aaaSBarry Smith             tmp0 = x[i1];
3194f0d39aaaSBarry Smith             tmp1 = x[i2];
31959371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
31969371c9d4SSatish Balay             v1 += 2;
31979371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
31989371c9d4SSatish Balay             v2 += 2;
31999371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
32009371c9d4SSatish Balay             v3 += 2;
32019371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
32029371c9d4SSatish Balay             v4 += 2;
3203f0d39aaaSBarry Smith           }
3204f0d39aaaSBarry Smith 
3205f0d39aaaSBarry Smith           if (n == sz - 1) {
3206f0d39aaaSBarry Smith             tmp0 = x[*idx];
3207f0d39aaaSBarry Smith             sum1 -= *v1 * tmp0;
3208f0d39aaaSBarry Smith             sum2 -= *v2 * tmp0;
3209f0d39aaaSBarry Smith             sum3 -= *v3 * tmp0;
3210f0d39aaaSBarry Smith             sum4 -= *v4 * tmp0;
3211f0d39aaaSBarry Smith           }
3212f0d39aaaSBarry Smith           x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3213f0d39aaaSBarry Smith           x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3214f0d39aaaSBarry Smith           x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3215f0d39aaaSBarry Smith           x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3216f0d39aaaSBarry Smith           break;
3217f0d39aaaSBarry Smith         case 5:
3218f0d39aaaSBarry Smith 
3219f0d39aaaSBarry Smith           sum1 = xb[row];
3220f0d39aaaSBarry Smith           sum2 = xb[row - 1];
3221f0d39aaaSBarry Smith           sum3 = xb[row - 2];
3222f0d39aaaSBarry Smith           sum4 = xb[row - 3];
3223f0d39aaaSBarry Smith           sum5 = xb[row - 4];
3224f0d39aaaSBarry Smith           v2   = a->a + diag[row - 1] + 2;
3225f0d39aaaSBarry Smith           v3   = a->a + diag[row - 2] + 3;
3226f0d39aaaSBarry Smith           v4   = a->a + diag[row - 3] + 4;
3227f0d39aaaSBarry Smith           v5   = a->a + diag[row - 4] + 5;
3228f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
3229f0d39aaaSBarry Smith             i1 = idx[0];
3230f0d39aaaSBarry Smith             i2 = idx[1];
3231f0d39aaaSBarry Smith             idx += 2;
3232f0d39aaaSBarry Smith             tmp0 = x[i1];
3233f0d39aaaSBarry Smith             tmp1 = x[i2];
32349371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
32359371c9d4SSatish Balay             v1 += 2;
32369371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
32379371c9d4SSatish Balay             v2 += 2;
32389371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
32399371c9d4SSatish Balay             v3 += 2;
32409371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
32419371c9d4SSatish Balay             v4 += 2;
32429371c9d4SSatish Balay             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
32439371c9d4SSatish Balay             v5 += 2;
3244f0d39aaaSBarry Smith           }
3245f0d39aaaSBarry Smith 
3246f0d39aaaSBarry Smith           if (n == sz - 1) {
3247f0d39aaaSBarry Smith             tmp0 = x[*idx];
3248f0d39aaaSBarry Smith             sum1 -= *v1 * tmp0;
3249f0d39aaaSBarry Smith             sum2 -= *v2 * tmp0;
3250f0d39aaaSBarry Smith             sum3 -= *v3 * tmp0;
3251f0d39aaaSBarry Smith             sum4 -= *v4 * tmp0;
3252f0d39aaaSBarry Smith             sum5 -= *v5 * tmp0;
3253f0d39aaaSBarry Smith           }
3254f0d39aaaSBarry Smith           x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3255f0d39aaaSBarry Smith           x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3256f0d39aaaSBarry Smith           x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3257f0d39aaaSBarry Smith           x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3258f0d39aaaSBarry Smith           x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
32598862d2efSBarry Smith           break;
3260d71ae5a4SJacob Faibussowitsch         default:
3261d71ae5a4SJacob Faibussowitsch           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
32628862d2efSBarry Smith         }
32632af78befSBarry Smith       }
32642af78befSBarry Smith 
32659566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(a->nz));
32662af78befSBarry Smith     }
32672af78befSBarry Smith     its--;
32685850ef23SBarry Smith   }
32695850ef23SBarry Smith   while (its--) {
32705850ef23SBarry Smith     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
32719371c9d4SSatish Balay       for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += sizes[i], ibdiag += sizes[i] * sizes[i], i++) {
3272d876e2b0SMark Adams         sz  = diag[row] - ii[row];
32735850ef23SBarry Smith         v1  = a->a + ii[row];
32745850ef23SBarry Smith         idx = a->j + ii[row];
32755850ef23SBarry Smith         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
32765850ef23SBarry Smith         switch (sizes[i]) {
32775850ef23SBarry Smith         case 1:
32785850ef23SBarry Smith           sum1 = b[row];
32795850ef23SBarry Smith           for (n = 0; n < sz - 1; n += 2) {
32805850ef23SBarry Smith             i1 = idx[0];
32815850ef23SBarry Smith             i2 = idx[1];
32825850ef23SBarry Smith             idx += 2;
32835850ef23SBarry Smith             tmp0 = x[i1];
32845850ef23SBarry Smith             tmp1 = x[i2];
32859371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
32869371c9d4SSatish Balay             v1 += 2;
32875850ef23SBarry Smith           }
32885850ef23SBarry Smith           if (n == sz - 1) {
3289d876e2b0SMark Adams             tmp0 = x[*idx++];
3290d876e2b0SMark Adams             sum1 -= *v1 * tmp0;
3291d876e2b0SMark Adams             v1++;
3292d876e2b0SMark Adams           }
3293d876e2b0SMark Adams           t[row] = sum1;
3294d876e2b0SMark Adams           sz     = ii[row + 1] - diag[row] - 1;
3295d876e2b0SMark Adams           idx    = a->j + diag[row] + 1;
3296d876e2b0SMark Adams           v1 += 1;
3297d876e2b0SMark Adams           for (n = 0; n < sz - 1; n += 2) {
3298d876e2b0SMark Adams             i1 = idx[0];
3299d876e2b0SMark Adams             i2 = idx[1];
3300d876e2b0SMark Adams             idx += 2;
3301d876e2b0SMark Adams             tmp0 = x[i1];
3302d876e2b0SMark Adams             tmp1 = x[i2];
33039371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
33049371c9d4SSatish Balay             v1 += 2;
3305d876e2b0SMark Adams           }
3306d876e2b0SMark Adams           if (n == sz - 1) {
3307d876e2b0SMark Adams             tmp0 = x[*idx++];
33085850ef23SBarry Smith             sum1 -= *v1 * tmp0;
33095850ef23SBarry Smith           }
33105850ef23SBarry Smith           /* in MatSOR_SeqAIJ this line would be
33115850ef23SBarry Smith            *
33125850ef23SBarry Smith            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
33135850ef23SBarry Smith            *
33145850ef23SBarry Smith            * but omega == 1, so this becomes
33155850ef23SBarry Smith            *
3316d876e2b0SMark Adams            * x[row] = sum1*(*ibdiag++);
33175850ef23SBarry Smith            *
33185850ef23SBarry Smith            */
3319d876e2b0SMark Adams           x[row] = sum1 * (*ibdiag);
33205850ef23SBarry Smith           break;
33215850ef23SBarry Smith         case 2:
33225850ef23SBarry Smith           v2   = a->a + ii[row + 1];
33235850ef23SBarry Smith           sum1 = b[row];
33245850ef23SBarry Smith           sum2 = b[row + 1];
33255850ef23SBarry Smith           for (n = 0; n < sz - 1; n += 2) {
33265850ef23SBarry Smith             i1 = idx[0];
33275850ef23SBarry Smith             i2 = idx[1];
33285850ef23SBarry Smith             idx += 2;
33295850ef23SBarry Smith             tmp0 = x[i1];
33305850ef23SBarry Smith             tmp1 = x[i2];
33319371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
33329371c9d4SSatish Balay             v1 += 2;
33339371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
33349371c9d4SSatish Balay             v2 += 2;
33355850ef23SBarry Smith           }
3336d876e2b0SMark Adams           if (n == sz - 1) {
3337d876e2b0SMark Adams             tmp0 = x[*idx++];
3338d876e2b0SMark Adams             sum1 -= v1[0] * tmp0;
3339d876e2b0SMark Adams             sum2 -= v2[0] * tmp0;
33409371c9d4SSatish Balay             v1++;
33419371c9d4SSatish Balay             v2++;
3342d876e2b0SMark Adams           }
3343d876e2b0SMark Adams           t[row]     = sum1;
3344d876e2b0SMark Adams           t[row + 1] = sum2;
3345d876e2b0SMark Adams           sz         = ii[row + 1] - diag[row] - 2;
3346d876e2b0SMark Adams           idx        = a->j + diag[row] + 2;
3347d876e2b0SMark Adams           v1 += 2;
3348d876e2b0SMark Adams           v2 += 2;
3349d876e2b0SMark Adams           for (n = 0; n < sz - 1; n += 2) {
3350d876e2b0SMark Adams             i1 = idx[0];
3351d876e2b0SMark Adams             i2 = idx[1];
3352d876e2b0SMark Adams             idx += 2;
3353d876e2b0SMark Adams             tmp0 = x[i1];
3354d876e2b0SMark Adams             tmp1 = x[i2];
33559371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
33569371c9d4SSatish Balay             v1 += 2;
33579371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
33589371c9d4SSatish Balay             v2 += 2;
3359d876e2b0SMark Adams           }
33605850ef23SBarry Smith           if (n == sz - 1) {
33615850ef23SBarry Smith             tmp0 = x[*idx];
33625850ef23SBarry Smith             sum1 -= v1[0] * tmp0;
33635850ef23SBarry Smith             sum2 -= v2[0] * tmp0;
33645850ef23SBarry Smith           }
3365d876e2b0SMark Adams           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[2];
3366d876e2b0SMark Adams           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
33675850ef23SBarry Smith           break;
33685850ef23SBarry Smith         case 3:
33695850ef23SBarry Smith           v2   = a->a + ii[row + 1];
33705850ef23SBarry Smith           v3   = a->a + ii[row + 2];
33715850ef23SBarry Smith           sum1 = b[row];
33725850ef23SBarry Smith           sum2 = b[row + 1];
33735850ef23SBarry Smith           sum3 = b[row + 2];
33745850ef23SBarry Smith           for (n = 0; n < sz - 1; n += 2) {
33755850ef23SBarry Smith             i1 = idx[0];
33765850ef23SBarry Smith             i2 = idx[1];
33775850ef23SBarry Smith             idx += 2;
33785850ef23SBarry Smith             tmp0 = x[i1];
33795850ef23SBarry Smith             tmp1 = x[i2];
33809371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
33819371c9d4SSatish Balay             v1 += 2;
33829371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
33839371c9d4SSatish Balay             v2 += 2;
33849371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
33859371c9d4SSatish Balay             v3 += 2;
33865850ef23SBarry Smith           }
3387d876e2b0SMark Adams           if (n == sz - 1) {
3388d876e2b0SMark Adams             tmp0 = x[*idx++];
3389d876e2b0SMark Adams             sum1 -= v1[0] * tmp0;
3390d876e2b0SMark Adams             sum2 -= v2[0] * tmp0;
3391d876e2b0SMark Adams             sum3 -= v3[0] * tmp0;
33929371c9d4SSatish Balay             v1++;
33939371c9d4SSatish Balay             v2++;
33949371c9d4SSatish Balay             v3++;
3395d876e2b0SMark Adams           }
3396d876e2b0SMark Adams           t[row]     = sum1;
3397d876e2b0SMark Adams           t[row + 1] = sum2;
3398d876e2b0SMark Adams           t[row + 2] = sum3;
3399d876e2b0SMark Adams           sz         = ii[row + 1] - diag[row] - 3;
3400d876e2b0SMark Adams           idx        = a->j + diag[row] + 3;
3401d876e2b0SMark Adams           v1 += 3;
3402d876e2b0SMark Adams           v2 += 3;
3403d876e2b0SMark Adams           v3 += 3;
3404d876e2b0SMark Adams           for (n = 0; n < sz - 1; n += 2) {
3405d876e2b0SMark Adams             i1 = idx[0];
3406d876e2b0SMark Adams             i2 = idx[1];
3407d876e2b0SMark Adams             idx += 2;
3408d876e2b0SMark Adams             tmp0 = x[i1];
3409d876e2b0SMark Adams             tmp1 = x[i2];
34109371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
34119371c9d4SSatish Balay             v1 += 2;
34129371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
34139371c9d4SSatish Balay             v2 += 2;
34149371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
34159371c9d4SSatish Balay             v3 += 2;
3416d876e2b0SMark Adams           }
34175850ef23SBarry Smith           if (n == sz - 1) {
34185850ef23SBarry Smith             tmp0 = x[*idx];
34195850ef23SBarry Smith             sum1 -= v1[0] * tmp0;
34205850ef23SBarry Smith             sum2 -= v2[0] * tmp0;
34215850ef23SBarry Smith             sum3 -= v3[0] * tmp0;
34225850ef23SBarry Smith           }
3423d876e2b0SMark Adams           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3424d876e2b0SMark Adams           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3425d876e2b0SMark Adams           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
34265850ef23SBarry Smith           break;
34275850ef23SBarry Smith         case 4:
34285850ef23SBarry Smith           v2   = a->a + ii[row + 1];
34295850ef23SBarry Smith           v3   = a->a + ii[row + 2];
34305850ef23SBarry Smith           v4   = a->a + ii[row + 3];
34315850ef23SBarry Smith           sum1 = b[row];
34325850ef23SBarry Smith           sum2 = b[row + 1];
34335850ef23SBarry Smith           sum3 = b[row + 2];
34345850ef23SBarry Smith           sum4 = b[row + 3];
34355850ef23SBarry Smith           for (n = 0; n < sz - 1; n += 2) {
34365850ef23SBarry Smith             i1 = idx[0];
34375850ef23SBarry Smith             i2 = idx[1];
34385850ef23SBarry Smith             idx += 2;
34395850ef23SBarry Smith             tmp0 = x[i1];
34405850ef23SBarry Smith             tmp1 = x[i2];
34419371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
34429371c9d4SSatish Balay             v1 += 2;
34439371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
34449371c9d4SSatish Balay             v2 += 2;
34459371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
34469371c9d4SSatish Balay             v3 += 2;
34479371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
34489371c9d4SSatish Balay             v4 += 2;
34495850ef23SBarry Smith           }
3450d876e2b0SMark Adams           if (n == sz - 1) {
3451d876e2b0SMark Adams             tmp0 = x[*idx++];
3452d876e2b0SMark Adams             sum1 -= v1[0] * tmp0;
3453d876e2b0SMark Adams             sum2 -= v2[0] * tmp0;
3454d876e2b0SMark Adams             sum3 -= v3[0] * tmp0;
3455d876e2b0SMark Adams             sum4 -= v4[0] * tmp0;
34569371c9d4SSatish Balay             v1++;
34579371c9d4SSatish Balay             v2++;
34589371c9d4SSatish Balay             v3++;
34599371c9d4SSatish Balay             v4++;
3460d876e2b0SMark Adams           }
3461d876e2b0SMark Adams           t[row]     = sum1;
3462d876e2b0SMark Adams           t[row + 1] = sum2;
3463d876e2b0SMark Adams           t[row + 2] = sum3;
3464d876e2b0SMark Adams           t[row + 3] = sum4;
3465d876e2b0SMark Adams           sz         = ii[row + 1] - diag[row] - 4;
3466d876e2b0SMark Adams           idx        = a->j + diag[row] + 4;
3467d876e2b0SMark Adams           v1 += 4;
3468d876e2b0SMark Adams           v2 += 4;
3469d876e2b0SMark Adams           v3 += 4;
3470d876e2b0SMark Adams           v4 += 4;
3471d876e2b0SMark Adams           for (n = 0; n < sz - 1; n += 2) {
3472d876e2b0SMark Adams             i1 = idx[0];
3473d876e2b0SMark Adams             i2 = idx[1];
3474d876e2b0SMark Adams             idx += 2;
3475d876e2b0SMark Adams             tmp0 = x[i1];
3476d876e2b0SMark Adams             tmp1 = x[i2];
34779371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
34789371c9d4SSatish Balay             v1 += 2;
34799371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
34809371c9d4SSatish Balay             v2 += 2;
34819371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
34829371c9d4SSatish Balay             v3 += 2;
34839371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
34849371c9d4SSatish Balay             v4 += 2;
3485d876e2b0SMark Adams           }
34865850ef23SBarry Smith           if (n == sz - 1) {
34875850ef23SBarry Smith             tmp0 = x[*idx];
34885850ef23SBarry Smith             sum1 -= v1[0] * tmp0;
34895850ef23SBarry Smith             sum2 -= v2[0] * tmp0;
34905850ef23SBarry Smith             sum3 -= v3[0] * tmp0;
34915850ef23SBarry Smith             sum4 -= v4[0] * tmp0;
34925850ef23SBarry Smith           }
3493d876e2b0SMark Adams           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3494d876e2b0SMark Adams           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3495d876e2b0SMark Adams           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3496d876e2b0SMark Adams           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
34975850ef23SBarry Smith           break;
34985850ef23SBarry Smith         case 5:
34995850ef23SBarry Smith           v2   = a->a + ii[row + 1];
35005850ef23SBarry Smith           v3   = a->a + ii[row + 2];
35015850ef23SBarry Smith           v4   = a->a + ii[row + 3];
35025850ef23SBarry Smith           v5   = a->a + ii[row + 4];
35035850ef23SBarry Smith           sum1 = b[row];
35045850ef23SBarry Smith           sum2 = b[row + 1];
35055850ef23SBarry Smith           sum3 = b[row + 2];
35065850ef23SBarry Smith           sum4 = b[row + 3];
35075850ef23SBarry Smith           sum5 = b[row + 4];
35085850ef23SBarry Smith           for (n = 0; n < sz - 1; n += 2) {
35095850ef23SBarry Smith             i1 = idx[0];
35105850ef23SBarry Smith             i2 = idx[1];
35115850ef23SBarry Smith             idx += 2;
35125850ef23SBarry Smith             tmp0 = x[i1];
35135850ef23SBarry Smith             tmp1 = x[i2];
35149371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
35159371c9d4SSatish Balay             v1 += 2;
35169371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
35179371c9d4SSatish Balay             v2 += 2;
35189371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
35199371c9d4SSatish Balay             v3 += 2;
35209371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
35219371c9d4SSatish Balay             v4 += 2;
35229371c9d4SSatish Balay             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
35239371c9d4SSatish Balay             v5 += 2;
35245850ef23SBarry Smith           }
35255850ef23SBarry Smith           if (n == sz - 1) {
3526d876e2b0SMark Adams             tmp0 = x[*idx++];
35275850ef23SBarry Smith             sum1 -= v1[0] * tmp0;
35285850ef23SBarry Smith             sum2 -= v2[0] * tmp0;
35295850ef23SBarry Smith             sum3 -= v3[0] * tmp0;
35305850ef23SBarry Smith             sum4 -= v4[0] * tmp0;
35315850ef23SBarry Smith             sum5 -= v5[0] * tmp0;
35329371c9d4SSatish Balay             v1++;
35339371c9d4SSatish Balay             v2++;
35349371c9d4SSatish Balay             v3++;
35359371c9d4SSatish Balay             v4++;
35369371c9d4SSatish Balay             v5++;
35375850ef23SBarry Smith           }
3538d876e2b0SMark Adams           t[row]     = sum1;
3539d876e2b0SMark Adams           t[row + 1] = sum2;
3540d876e2b0SMark Adams           t[row + 2] = sum3;
3541d876e2b0SMark Adams           t[row + 3] = sum4;
3542d876e2b0SMark Adams           t[row + 4] = sum5;
3543d876e2b0SMark Adams           sz         = ii[row + 1] - diag[row] - 5;
3544d876e2b0SMark Adams           idx        = a->j + diag[row] + 5;
3545d876e2b0SMark Adams           v1 += 5;
3546d876e2b0SMark Adams           v2 += 5;
3547d876e2b0SMark Adams           v3 += 5;
3548d876e2b0SMark Adams           v4 += 5;
3549d876e2b0SMark Adams           v5 += 5;
35505850ef23SBarry Smith           for (n = 0; n < sz - 1; n += 2) {
35515850ef23SBarry Smith             i1 = idx[0];
35525850ef23SBarry Smith             i2 = idx[1];
35535850ef23SBarry Smith             idx += 2;
35545850ef23SBarry Smith             tmp0 = x[i1];
35555850ef23SBarry Smith             tmp1 = x[i2];
35569371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
35579371c9d4SSatish Balay             v1 += 2;
35589371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
35599371c9d4SSatish Balay             v2 += 2;
35609371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
35619371c9d4SSatish Balay             v3 += 2;
35629371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
35639371c9d4SSatish Balay             v4 += 2;
35649371c9d4SSatish Balay             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
35659371c9d4SSatish Balay             v5 += 2;
35665850ef23SBarry Smith           }
35675850ef23SBarry Smith           if (n == sz - 1) {
35685850ef23SBarry Smith             tmp0 = x[*idx];
3569d876e2b0SMark Adams             sum1 -= v1[0] * tmp0;
3570d876e2b0SMark Adams             sum2 -= v2[0] * tmp0;
3571d876e2b0SMark Adams             sum3 -= v3[0] * tmp0;
3572d876e2b0SMark Adams             sum4 -= v4[0] * tmp0;
3573d876e2b0SMark Adams             sum5 -= v5[0] * tmp0;
35745850ef23SBarry Smith           }
3575d876e2b0SMark Adams           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3576d876e2b0SMark Adams           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3577d876e2b0SMark Adams           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3578d876e2b0SMark Adams           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3579d876e2b0SMark Adams           x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3580d876e2b0SMark Adams           break;
3581d71ae5a4SJacob Faibussowitsch         default:
3582d71ae5a4SJacob Faibussowitsch           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3583d876e2b0SMark Adams         }
3584d876e2b0SMark Adams       }
3585d876e2b0SMark Adams       xb = t;
35869566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */
3587d876e2b0SMark Adams     } else xb = b;
3588d876e2b0SMark Adams 
3589d876e2b0SMark Adams     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3590d876e2b0SMark Adams       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3591d876e2b0SMark Adams       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3592d876e2b0SMark Adams         ibdiag -= sizes[i] * sizes[i];
3593d876e2b0SMark Adams 
3594d876e2b0SMark Adams         /* set RHS */
3595d876e2b0SMark Adams         if (xb == b) {
3596d876e2b0SMark Adams           /* whole (old way) */
3597d876e2b0SMark Adams           sz  = ii[row + 1] - ii[row];
3598d876e2b0SMark Adams           idx = a->j + ii[row];
3599d876e2b0SMark Adams           switch (sizes[i]) {
3600d71ae5a4SJacob Faibussowitsch           case 5:
3601d71ae5a4SJacob Faibussowitsch             v5 = a->a + ii[row - 4]; /* fall through */
3602d71ae5a4SJacob Faibussowitsch           case 4:
3603d71ae5a4SJacob Faibussowitsch             v4 = a->a + ii[row - 3]; /* fall through */
3604d71ae5a4SJacob Faibussowitsch           case 3:
3605d71ae5a4SJacob Faibussowitsch             v3 = a->a + ii[row - 2]; /* fall through */
3606d71ae5a4SJacob Faibussowitsch           case 2:
3607d71ae5a4SJacob Faibussowitsch             v2 = a->a + ii[row - 1]; /* fall through */
3608d71ae5a4SJacob Faibussowitsch           case 1:
3609d71ae5a4SJacob Faibussowitsch             v1 = a->a + ii[row];
3610d71ae5a4SJacob Faibussowitsch             break;
3611d71ae5a4SJacob Faibussowitsch           default:
3612d71ae5a4SJacob Faibussowitsch             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3613d876e2b0SMark Adams           }
3614d876e2b0SMark Adams         } else {
3615d876e2b0SMark Adams           /* upper, no diag */
3616d876e2b0SMark Adams           sz  = ii[row + 1] - diag[row] - 1;
3617d876e2b0SMark Adams           idx = a->j + diag[row] + 1;
3618d876e2b0SMark Adams           switch (sizes[i]) {
3619d71ae5a4SJacob Faibussowitsch           case 5:
3620d71ae5a4SJacob Faibussowitsch             v5 = a->a + diag[row - 4] + 5; /* fall through */
3621d71ae5a4SJacob Faibussowitsch           case 4:
3622d71ae5a4SJacob Faibussowitsch             v4 = a->a + diag[row - 3] + 4; /* fall through */
3623d71ae5a4SJacob Faibussowitsch           case 3:
3624d71ae5a4SJacob Faibussowitsch             v3 = a->a + diag[row - 2] + 3; /* fall through */
3625d71ae5a4SJacob Faibussowitsch           case 2:
3626d71ae5a4SJacob Faibussowitsch             v2 = a->a + diag[row - 1] + 2; /* fall through */
3627d71ae5a4SJacob Faibussowitsch           case 1:
3628d71ae5a4SJacob Faibussowitsch             v1 = a->a + diag[row] + 1;
3629d876e2b0SMark Adams           }
3630d876e2b0SMark Adams         }
3631d876e2b0SMark Adams         /* set sum */
3632d876e2b0SMark Adams         switch (sizes[i]) {
3633d71ae5a4SJacob Faibussowitsch         case 5:
3634d71ae5a4SJacob Faibussowitsch           sum5 = xb[row - 4]; /* fall through */
3635d71ae5a4SJacob Faibussowitsch         case 4:
3636d71ae5a4SJacob Faibussowitsch           sum4 = xb[row - 3]; /* fall through */
3637d71ae5a4SJacob Faibussowitsch         case 3:
3638d71ae5a4SJacob Faibussowitsch           sum3 = xb[row - 2]; /* fall through */
3639d71ae5a4SJacob Faibussowitsch         case 2:
3640d71ae5a4SJacob Faibussowitsch           sum2 = xb[row - 1]; /* fall through */
3641d876e2b0SMark Adams         case 1:
3642d876e2b0SMark Adams           /* note that sum1 is associated with the last row */
3643d876e2b0SMark Adams           sum1 = xb[row];
3644d876e2b0SMark Adams         }
3645d876e2b0SMark Adams         /* do sums */
3646d876e2b0SMark Adams         for (n = 0; n < sz - 1; n += 2) {
3647d876e2b0SMark Adams           i1 = idx[0];
3648d876e2b0SMark Adams           i2 = idx[1];
3649d876e2b0SMark Adams           idx += 2;
3650d876e2b0SMark Adams           tmp0 = x[i1];
3651d876e2b0SMark Adams           tmp1 = x[i2];
3652d876e2b0SMark Adams           switch (sizes[i]) {
3653d71ae5a4SJacob Faibussowitsch           case 5:
3654d71ae5a4SJacob Faibussowitsch             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3655d71ae5a4SJacob Faibussowitsch             v5 += 2; /* fall through */
3656d71ae5a4SJacob Faibussowitsch           case 4:
3657d71ae5a4SJacob Faibussowitsch             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3658d71ae5a4SJacob Faibussowitsch             v4 += 2; /* fall through */
3659d71ae5a4SJacob Faibussowitsch           case 3:
3660d71ae5a4SJacob Faibussowitsch             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3661d71ae5a4SJacob Faibussowitsch             v3 += 2; /* fall through */
3662d71ae5a4SJacob Faibussowitsch           case 2:
3663d71ae5a4SJacob Faibussowitsch             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3664d71ae5a4SJacob Faibussowitsch             v2 += 2; /* fall through */
3665d71ae5a4SJacob Faibussowitsch           case 1:
3666d71ae5a4SJacob Faibussowitsch             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3667d71ae5a4SJacob Faibussowitsch             v1 += 2;
3668d876e2b0SMark Adams           }
3669d876e2b0SMark Adams         }
3670d876e2b0SMark Adams         /* ragged edge */
3671d876e2b0SMark Adams         if (n == sz - 1) {
3672d876e2b0SMark Adams           tmp0 = x[*idx];
3673d876e2b0SMark Adams           switch (sizes[i]) {
3674d71ae5a4SJacob Faibussowitsch           case 5:
3675d71ae5a4SJacob Faibussowitsch             sum5 -= *v5 * tmp0; /* fall through */
3676d71ae5a4SJacob Faibussowitsch           case 4:
3677d71ae5a4SJacob Faibussowitsch             sum4 -= *v4 * tmp0; /* fall through */
3678d71ae5a4SJacob Faibussowitsch           case 3:
3679d71ae5a4SJacob Faibussowitsch             sum3 -= *v3 * tmp0; /* fall through */
3680d71ae5a4SJacob Faibussowitsch           case 2:
3681d71ae5a4SJacob Faibussowitsch             sum2 -= *v2 * tmp0; /* fall through */
3682d71ae5a4SJacob Faibussowitsch           case 1:
3683d71ae5a4SJacob Faibussowitsch             sum1 -= *v1 * tmp0;
3684d876e2b0SMark Adams           }
3685d876e2b0SMark Adams         }
3686d876e2b0SMark Adams         /* update */
3687d876e2b0SMark Adams         if (xb == b) {
3688d876e2b0SMark Adams           /* whole (old way) w/ diag */
3689d876e2b0SMark Adams           switch (sizes[i]) {
3690d876e2b0SMark Adams           case 5:
36915850ef23SBarry Smith             x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
36925850ef23SBarry Smith             x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
36935850ef23SBarry Smith             x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
36945850ef23SBarry Smith             x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
36955850ef23SBarry Smith             x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
36965850ef23SBarry Smith             break;
3697d876e2b0SMark Adams           case 4:
3698d876e2b0SMark Adams             x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3699d876e2b0SMark Adams             x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3700d876e2b0SMark Adams             x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3701d876e2b0SMark Adams             x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3702d876e2b0SMark Adams             break;
3703d876e2b0SMark Adams           case 3:
3704d876e2b0SMark Adams             x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3705d876e2b0SMark Adams             x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3706d876e2b0SMark Adams             x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3707d876e2b0SMark Adams             break;
3708d876e2b0SMark Adams           case 2:
3709d876e2b0SMark Adams             x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3];
3710d876e2b0SMark Adams             x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2];
3711d876e2b0SMark Adams             break;
3712d71ae5a4SJacob Faibussowitsch           case 1:
3713d71ae5a4SJacob Faibussowitsch             x[row--] += sum1 * (*ibdiag);
3714d71ae5a4SJacob Faibussowitsch             break;
3715d876e2b0SMark Adams           }
3716d876e2b0SMark Adams         } else {
3717d876e2b0SMark Adams           /* no diag so set =  */
3718d876e2b0SMark Adams           switch (sizes[i]) {
3719d876e2b0SMark Adams           case 5:
3720d876e2b0SMark Adams             x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3721d876e2b0SMark Adams             x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3722d876e2b0SMark Adams             x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3723d876e2b0SMark Adams             x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3724d876e2b0SMark Adams             x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3725d876e2b0SMark Adams             break;
3726d876e2b0SMark Adams           case 4:
3727d876e2b0SMark Adams             x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3728d876e2b0SMark Adams             x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3729d876e2b0SMark Adams             x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3730d876e2b0SMark Adams             x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3731d876e2b0SMark Adams             break;
3732d876e2b0SMark Adams           case 3:
3733d876e2b0SMark Adams             x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3734d876e2b0SMark Adams             x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3735d876e2b0SMark Adams             x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3736d876e2b0SMark Adams             break;
3737d876e2b0SMark Adams           case 2:
3738d876e2b0SMark Adams             x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3739d876e2b0SMark Adams             x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3740d876e2b0SMark Adams             break;
3741d71ae5a4SJacob Faibussowitsch           case 1:
3742d71ae5a4SJacob Faibussowitsch             x[row--] = sum1 * (*ibdiag);
3743d71ae5a4SJacob Faibussowitsch             break;
37445850ef23SBarry Smith           }
37455850ef23SBarry Smith         }
3746d876e2b0SMark Adams       }
3747d876e2b0SMark Adams       if (xb == b) {
37489566063dSJacob Faibussowitsch         PetscCall(PetscLogFlops(2.0 * a->nz));
3749d876e2b0SMark Adams       } else {
37509566063dSJacob Faibussowitsch         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */
3751d876e2b0SMark Adams       }
37525850ef23SBarry Smith     }
37532af78befSBarry Smith   }
375489c6957cSBarry Smith   if (flag & SOR_EISENSTAT) {
375589c6957cSBarry Smith     /*
375689c6957cSBarry Smith           Apply  (U + D)^-1  where D is now the block diagonal
375789c6957cSBarry Smith     */
375889c6957cSBarry Smith     ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
375989c6957cSBarry Smith     for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
376089c6957cSBarry Smith       ibdiag -= sizes[i] * sizes[i];
376189c6957cSBarry Smith       sz  = ii[row + 1] - diag[row] - 1;
376289c6957cSBarry Smith       v1  = a->a + diag[row] + 1;
376389c6957cSBarry Smith       idx = a->j + diag[row] + 1;
37644108e4d5SBarry Smith       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
376589c6957cSBarry Smith       switch (sizes[i]) {
376689c6957cSBarry Smith       case 1:
376789c6957cSBarry Smith 
376889c6957cSBarry Smith         sum1 = b[row];
376989c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
377089c6957cSBarry Smith           i1 = idx[0];
377189c6957cSBarry Smith           i2 = idx[1];
377289c6957cSBarry Smith           idx += 2;
377389c6957cSBarry Smith           tmp0 = x[i1];
377489c6957cSBarry Smith           tmp1 = x[i2];
37759371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
37769371c9d4SSatish Balay           v1 += 2;
377789c6957cSBarry Smith         }
377889c6957cSBarry Smith 
377989c6957cSBarry Smith         if (n == sz - 1) {
378089c6957cSBarry Smith           tmp0 = x[*idx];
378189c6957cSBarry Smith           sum1 -= *v1 * tmp0;
378289c6957cSBarry Smith         }
37839371c9d4SSatish Balay         x[row] = sum1 * (*ibdiag);
37849371c9d4SSatish Balay         row--;
378589c6957cSBarry Smith         break;
378689c6957cSBarry Smith 
378789c6957cSBarry Smith       case 2:
378889c6957cSBarry Smith 
378989c6957cSBarry Smith         sum1 = b[row];
379089c6957cSBarry Smith         sum2 = b[row - 1];
379189c6957cSBarry Smith         /* note that sum1 is associated with the second of the two rows */
379289c6957cSBarry Smith         v2 = a->a + diag[row - 1] + 2;
379389c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
379489c6957cSBarry Smith           i1 = idx[0];
379589c6957cSBarry Smith           i2 = idx[1];
379689c6957cSBarry Smith           idx += 2;
379789c6957cSBarry Smith           tmp0 = x[i1];
379889c6957cSBarry Smith           tmp1 = x[i2];
37999371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
38009371c9d4SSatish Balay           v1 += 2;
38019371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
38029371c9d4SSatish Balay           v2 += 2;
380389c6957cSBarry Smith         }
380489c6957cSBarry Smith 
380589c6957cSBarry Smith         if (n == sz - 1) {
380689c6957cSBarry Smith           tmp0 = x[*idx];
380789c6957cSBarry Smith           sum1 -= *v1 * tmp0;
380889c6957cSBarry Smith           sum2 -= *v2 * tmp0;
380989c6957cSBarry Smith         }
3810938d4eb3SBarry Smith         x[row]     = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3811938d4eb3SBarry Smith         x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3812938d4eb3SBarry Smith         row -= 2;
381389c6957cSBarry Smith         break;
381489c6957cSBarry Smith       case 3:
381589c6957cSBarry Smith 
381689c6957cSBarry Smith         sum1 = b[row];
381789c6957cSBarry Smith         sum2 = b[row - 1];
381889c6957cSBarry Smith         sum3 = b[row - 2];
381989c6957cSBarry Smith         v2   = a->a + diag[row - 1] + 2;
382089c6957cSBarry Smith         v3   = a->a + diag[row - 2] + 3;
382189c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
382289c6957cSBarry Smith           i1 = idx[0];
382389c6957cSBarry Smith           i2 = idx[1];
382489c6957cSBarry Smith           idx += 2;
382589c6957cSBarry Smith           tmp0 = x[i1];
382689c6957cSBarry Smith           tmp1 = x[i2];
38279371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
38289371c9d4SSatish Balay           v1 += 2;
38299371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
38309371c9d4SSatish Balay           v2 += 2;
38319371c9d4SSatish Balay           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
38329371c9d4SSatish Balay           v3 += 2;
383389c6957cSBarry Smith         }
383489c6957cSBarry Smith 
383589c6957cSBarry Smith         if (n == sz - 1) {
383689c6957cSBarry Smith           tmp0 = x[*idx];
383789c6957cSBarry Smith           sum1 -= *v1 * tmp0;
383889c6957cSBarry Smith           sum2 -= *v2 * tmp0;
383989c6957cSBarry Smith           sum3 -= *v3 * tmp0;
384089c6957cSBarry Smith         }
3841938d4eb3SBarry Smith         x[row]     = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3842938d4eb3SBarry Smith         x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3843938d4eb3SBarry Smith         x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3844938d4eb3SBarry Smith         row -= 3;
384589c6957cSBarry Smith         break;
384689c6957cSBarry Smith       case 4:
384789c6957cSBarry Smith 
384889c6957cSBarry Smith         sum1 = b[row];
384989c6957cSBarry Smith         sum2 = b[row - 1];
385089c6957cSBarry Smith         sum3 = b[row - 2];
385189c6957cSBarry Smith         sum4 = b[row - 3];
385289c6957cSBarry Smith         v2   = a->a + diag[row - 1] + 2;
385389c6957cSBarry Smith         v3   = a->a + diag[row - 2] + 3;
385489c6957cSBarry Smith         v4   = a->a + diag[row - 3] + 4;
385589c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
385689c6957cSBarry Smith           i1 = idx[0];
385789c6957cSBarry Smith           i2 = idx[1];
385889c6957cSBarry Smith           idx += 2;
385989c6957cSBarry Smith           tmp0 = x[i1];
386089c6957cSBarry Smith           tmp1 = x[i2];
38619371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
38629371c9d4SSatish Balay           v1 += 2;
38639371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
38649371c9d4SSatish Balay           v2 += 2;
38659371c9d4SSatish Balay           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
38669371c9d4SSatish Balay           v3 += 2;
38679371c9d4SSatish Balay           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
38689371c9d4SSatish Balay           v4 += 2;
386989c6957cSBarry Smith         }
387089c6957cSBarry Smith 
387189c6957cSBarry Smith         if (n == sz - 1) {
387289c6957cSBarry Smith           tmp0 = x[*idx];
387389c6957cSBarry Smith           sum1 -= *v1 * tmp0;
387489c6957cSBarry Smith           sum2 -= *v2 * tmp0;
387589c6957cSBarry Smith           sum3 -= *v3 * tmp0;
387689c6957cSBarry Smith           sum4 -= *v4 * tmp0;
387789c6957cSBarry Smith         }
3878938d4eb3SBarry Smith         x[row]     = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3879938d4eb3SBarry Smith         x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3880938d4eb3SBarry Smith         x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3881938d4eb3SBarry Smith         x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3882938d4eb3SBarry Smith         row -= 4;
388389c6957cSBarry Smith         break;
388489c6957cSBarry Smith       case 5:
388589c6957cSBarry Smith 
388689c6957cSBarry Smith         sum1 = b[row];
388789c6957cSBarry Smith         sum2 = b[row - 1];
388889c6957cSBarry Smith         sum3 = b[row - 2];
388989c6957cSBarry Smith         sum4 = b[row - 3];
389089c6957cSBarry Smith         sum5 = b[row - 4];
389189c6957cSBarry Smith         v2   = a->a + diag[row - 1] + 2;
389289c6957cSBarry Smith         v3   = a->a + diag[row - 2] + 3;
389389c6957cSBarry Smith         v4   = a->a + diag[row - 3] + 4;
389489c6957cSBarry Smith         v5   = a->a + diag[row - 4] + 5;
389589c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
389689c6957cSBarry Smith           i1 = idx[0];
389789c6957cSBarry Smith           i2 = idx[1];
389889c6957cSBarry Smith           idx += 2;
389989c6957cSBarry Smith           tmp0 = x[i1];
390089c6957cSBarry Smith           tmp1 = x[i2];
39019371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
39029371c9d4SSatish Balay           v1 += 2;
39039371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
39049371c9d4SSatish Balay           v2 += 2;
39059371c9d4SSatish Balay           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
39069371c9d4SSatish Balay           v3 += 2;
39079371c9d4SSatish Balay           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
39089371c9d4SSatish Balay           v4 += 2;
39099371c9d4SSatish Balay           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
39109371c9d4SSatish Balay           v5 += 2;
391189c6957cSBarry Smith         }
391289c6957cSBarry Smith 
391389c6957cSBarry Smith         if (n == sz - 1) {
391489c6957cSBarry Smith           tmp0 = x[*idx];
391589c6957cSBarry Smith           sum1 -= *v1 * tmp0;
391689c6957cSBarry Smith           sum2 -= *v2 * tmp0;
391789c6957cSBarry Smith           sum3 -= *v3 * tmp0;
391889c6957cSBarry Smith           sum4 -= *v4 * tmp0;
391989c6957cSBarry Smith           sum5 -= *v5 * tmp0;
392089c6957cSBarry Smith         }
3921938d4eb3SBarry Smith         x[row]     = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3922938d4eb3SBarry Smith         x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3923938d4eb3SBarry Smith         x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3924938d4eb3SBarry Smith         x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3925938d4eb3SBarry Smith         x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3926938d4eb3SBarry Smith         row -= 5;
392789c6957cSBarry Smith         break;
3928d71ae5a4SJacob Faibussowitsch       default:
3929d71ae5a4SJacob Faibussowitsch         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
393089c6957cSBarry Smith       }
393189c6957cSBarry Smith     }
39329566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(a->nz));
393389c6957cSBarry Smith 
393489c6957cSBarry Smith     /*
393589c6957cSBarry Smith            t = b - D x    where D is the block diagonal
393689c6957cSBarry Smith     */
393789c6957cSBarry Smith     cnt = 0;
393889c6957cSBarry Smith     for (i = 0, row = 0; i < m; i++) {
393989c6957cSBarry Smith       switch (sizes[i]) {
394089c6957cSBarry Smith       case 1:
39419371c9d4SSatish Balay         t[row] = b[row] - bdiag[cnt++] * x[row];
39429371c9d4SSatish Balay         row++;
394389c6957cSBarry Smith         break;
394489c6957cSBarry Smith       case 2:
39459371c9d4SSatish Balay         x1         = x[row];
39469371c9d4SSatish Balay         x2         = x[row + 1];
394789c6957cSBarry Smith         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
394889c6957cSBarry Smith         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
394989c6957cSBarry Smith         t[row]     = b[row] - tmp1;
39509371c9d4SSatish Balay         t[row + 1] = b[row + 1] - tmp2;
39519371c9d4SSatish Balay         row += 2;
395289c6957cSBarry Smith         cnt += 4;
395389c6957cSBarry Smith         break;
395489c6957cSBarry Smith       case 3:
39559371c9d4SSatish Balay         x1         = x[row];
39569371c9d4SSatish Balay         x2         = x[row + 1];
39579371c9d4SSatish Balay         x3         = x[row + 2];
395889c6957cSBarry Smith         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
395989c6957cSBarry Smith         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
396089c6957cSBarry Smith         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
396189c6957cSBarry Smith         t[row]     = b[row] - tmp1;
396289c6957cSBarry Smith         t[row + 1] = b[row + 1] - tmp2;
39639371c9d4SSatish Balay         t[row + 2] = b[row + 2] - tmp3;
39649371c9d4SSatish Balay         row += 3;
396589c6957cSBarry Smith         cnt += 9;
396689c6957cSBarry Smith         break;
396789c6957cSBarry Smith       case 4:
39689371c9d4SSatish Balay         x1         = x[row];
39699371c9d4SSatish Balay         x2         = x[row + 1];
39709371c9d4SSatish Balay         x3         = x[row + 2];
39719371c9d4SSatish Balay         x4         = x[row + 3];
397289c6957cSBarry Smith         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
397389c6957cSBarry Smith         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
397489c6957cSBarry Smith         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
397589c6957cSBarry Smith         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
397689c6957cSBarry Smith         t[row]     = b[row] - tmp1;
397789c6957cSBarry Smith         t[row + 1] = b[row + 1] - tmp2;
397889c6957cSBarry Smith         t[row + 2] = b[row + 2] - tmp3;
39799371c9d4SSatish Balay         t[row + 3] = b[row + 3] - tmp4;
39809371c9d4SSatish Balay         row += 4;
398189c6957cSBarry Smith         cnt += 16;
398289c6957cSBarry Smith         break;
398389c6957cSBarry Smith       case 5:
39849371c9d4SSatish Balay         x1         = x[row];
39859371c9d4SSatish Balay         x2         = x[row + 1];
39869371c9d4SSatish Balay         x3         = x[row + 2];
39879371c9d4SSatish Balay         x4         = x[row + 3];
39889371c9d4SSatish Balay         x5         = x[row + 4];
398989c6957cSBarry Smith         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
399089c6957cSBarry Smith         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
399189c6957cSBarry Smith         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
399289c6957cSBarry Smith         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
399389c6957cSBarry Smith         tmp5       = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
399489c6957cSBarry Smith         t[row]     = b[row] - tmp1;
399589c6957cSBarry Smith         t[row + 1] = b[row + 1] - tmp2;
399689c6957cSBarry Smith         t[row + 2] = b[row + 2] - tmp3;
399789c6957cSBarry Smith         t[row + 3] = b[row + 3] - tmp4;
39989371c9d4SSatish Balay         t[row + 4] = b[row + 4] - tmp5;
39999371c9d4SSatish Balay         row += 5;
400089c6957cSBarry Smith         cnt += 25;
400189c6957cSBarry Smith         break;
4002d71ae5a4SJacob Faibussowitsch       default:
4003d71ae5a4SJacob Faibussowitsch         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
400489c6957cSBarry Smith       }
400589c6957cSBarry Smith     }
40069566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(m));
400789c6957cSBarry Smith 
400889c6957cSBarry Smith     /*
400989c6957cSBarry Smith           Apply (L + D)^-1 where D is the block diagonal
401089c6957cSBarry Smith     */
401189c6957cSBarry Smith     for (i = 0, row = 0; i < m; i++) {
401289c6957cSBarry Smith       sz  = diag[row] - ii[row];
401389c6957cSBarry Smith       v1  = a->a + ii[row];
401489c6957cSBarry Smith       idx = a->j + ii[row];
40154108e4d5SBarry Smith       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
401689c6957cSBarry Smith       switch (sizes[i]) {
401789c6957cSBarry Smith       case 1:
401889c6957cSBarry Smith 
401989c6957cSBarry Smith         sum1 = t[row];
402089c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
402189c6957cSBarry Smith           i1 = idx[0];
402289c6957cSBarry Smith           i2 = idx[1];
402389c6957cSBarry Smith           idx += 2;
402489c6957cSBarry Smith           tmp0 = t[i1];
402589c6957cSBarry Smith           tmp1 = t[i2];
40269371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
40279371c9d4SSatish Balay           v1 += 2;
402889c6957cSBarry Smith         }
402989c6957cSBarry Smith 
403089c6957cSBarry Smith         if (n == sz - 1) {
403189c6957cSBarry Smith           tmp0 = t[*idx];
403289c6957cSBarry Smith           sum1 -= *v1 * tmp0;
403389c6957cSBarry Smith         }
40349371c9d4SSatish Balay         x[row] += t[row] = sum1 * (*ibdiag++);
40359371c9d4SSatish Balay         row++;
403689c6957cSBarry Smith         break;
403789c6957cSBarry Smith       case 2:
403889c6957cSBarry Smith         v2   = a->a + ii[row + 1];
403989c6957cSBarry Smith         sum1 = t[row];
404089c6957cSBarry Smith         sum2 = t[row + 1];
404189c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
404289c6957cSBarry Smith           i1 = idx[0];
404389c6957cSBarry Smith           i2 = idx[1];
404489c6957cSBarry Smith           idx += 2;
404589c6957cSBarry Smith           tmp0 = t[i1];
404689c6957cSBarry Smith           tmp1 = t[i2];
40479371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
40489371c9d4SSatish Balay           v1 += 2;
40499371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
40509371c9d4SSatish Balay           v2 += 2;
405189c6957cSBarry Smith         }
405289c6957cSBarry Smith 
405389c6957cSBarry Smith         if (n == sz - 1) {
405489c6957cSBarry Smith           tmp0 = t[*idx];
405589c6957cSBarry Smith           sum1 -= v1[0] * tmp0;
405689c6957cSBarry Smith           sum2 -= v2[0] * tmp0;
405789c6957cSBarry Smith         }
405889c6957cSBarry Smith         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[2];
405989c6957cSBarry Smith         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
40609371c9d4SSatish Balay         ibdiag += 4;
40619371c9d4SSatish Balay         row += 2;
406289c6957cSBarry Smith         break;
406389c6957cSBarry Smith       case 3:
406489c6957cSBarry Smith         v2   = a->a + ii[row + 1];
406589c6957cSBarry Smith         v3   = a->a + ii[row + 2];
406689c6957cSBarry Smith         sum1 = t[row];
406789c6957cSBarry Smith         sum2 = t[row + 1];
406889c6957cSBarry Smith         sum3 = t[row + 2];
406989c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
407089c6957cSBarry Smith           i1 = idx[0];
407189c6957cSBarry Smith           i2 = idx[1];
407289c6957cSBarry Smith           idx += 2;
407389c6957cSBarry Smith           tmp0 = t[i1];
407489c6957cSBarry Smith           tmp1 = t[i2];
40759371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
40769371c9d4SSatish Balay           v1 += 2;
40779371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
40789371c9d4SSatish Balay           v2 += 2;
40799371c9d4SSatish Balay           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
40809371c9d4SSatish Balay           v3 += 2;
408189c6957cSBarry Smith         }
408289c6957cSBarry Smith 
408389c6957cSBarry Smith         if (n == sz - 1) {
408489c6957cSBarry Smith           tmp0 = t[*idx];
408589c6957cSBarry Smith           sum1 -= v1[0] * tmp0;
408689c6957cSBarry Smith           sum2 -= v2[0] * tmp0;
408789c6957cSBarry Smith           sum3 -= v3[0] * tmp0;
408889c6957cSBarry Smith         }
408989c6957cSBarry Smith         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
409089c6957cSBarry Smith         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
409189c6957cSBarry Smith         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
40929371c9d4SSatish Balay         ibdiag += 9;
40939371c9d4SSatish Balay         row += 3;
409489c6957cSBarry Smith         break;
409589c6957cSBarry Smith       case 4:
409689c6957cSBarry Smith         v2   = a->a + ii[row + 1];
409789c6957cSBarry Smith         v3   = a->a + ii[row + 2];
409889c6957cSBarry Smith         v4   = a->a + ii[row + 3];
409989c6957cSBarry Smith         sum1 = t[row];
410089c6957cSBarry Smith         sum2 = t[row + 1];
410189c6957cSBarry Smith         sum3 = t[row + 2];
410289c6957cSBarry Smith         sum4 = t[row + 3];
410389c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
410489c6957cSBarry Smith           i1 = idx[0];
410589c6957cSBarry Smith           i2 = idx[1];
410689c6957cSBarry Smith           idx += 2;
410789c6957cSBarry Smith           tmp0 = t[i1];
410889c6957cSBarry Smith           tmp1 = t[i2];
41099371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
41109371c9d4SSatish Balay           v1 += 2;
41119371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
41129371c9d4SSatish Balay           v2 += 2;
41139371c9d4SSatish Balay           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
41149371c9d4SSatish Balay           v3 += 2;
41159371c9d4SSatish Balay           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
41169371c9d4SSatish Balay           v4 += 2;
411789c6957cSBarry Smith         }
411889c6957cSBarry Smith 
411989c6957cSBarry Smith         if (n == sz - 1) {
412089c6957cSBarry Smith           tmp0 = t[*idx];
412189c6957cSBarry Smith           sum1 -= v1[0] * tmp0;
412289c6957cSBarry Smith           sum2 -= v2[0] * tmp0;
412389c6957cSBarry Smith           sum3 -= v3[0] * tmp0;
412489c6957cSBarry Smith           sum4 -= v4[0] * tmp0;
412589c6957cSBarry Smith         }
412689c6957cSBarry Smith         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
412789c6957cSBarry Smith         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
412889c6957cSBarry Smith         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
412989c6957cSBarry Smith         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
41309371c9d4SSatish Balay         ibdiag += 16;
41319371c9d4SSatish Balay         row += 4;
413289c6957cSBarry Smith         break;
413389c6957cSBarry Smith       case 5:
413489c6957cSBarry Smith         v2   = a->a + ii[row + 1];
413589c6957cSBarry Smith         v3   = a->a + ii[row + 2];
413689c6957cSBarry Smith         v4   = a->a + ii[row + 3];
413789c6957cSBarry Smith         v5   = a->a + ii[row + 4];
413889c6957cSBarry Smith         sum1 = t[row];
413989c6957cSBarry Smith         sum2 = t[row + 1];
414089c6957cSBarry Smith         sum3 = t[row + 2];
414189c6957cSBarry Smith         sum4 = t[row + 3];
414289c6957cSBarry Smith         sum5 = t[row + 4];
414389c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
414489c6957cSBarry Smith           i1 = idx[0];
414589c6957cSBarry Smith           i2 = idx[1];
414689c6957cSBarry Smith           idx += 2;
414789c6957cSBarry Smith           tmp0 = t[i1];
414889c6957cSBarry Smith           tmp1 = t[i2];
41499371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
41509371c9d4SSatish Balay           v1 += 2;
41519371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
41529371c9d4SSatish Balay           v2 += 2;
41539371c9d4SSatish Balay           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
41549371c9d4SSatish Balay           v3 += 2;
41559371c9d4SSatish Balay           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
41569371c9d4SSatish Balay           v4 += 2;
41579371c9d4SSatish Balay           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
41589371c9d4SSatish Balay           v5 += 2;
415989c6957cSBarry Smith         }
416089c6957cSBarry Smith 
416189c6957cSBarry Smith         if (n == sz - 1) {
416289c6957cSBarry Smith           tmp0 = t[*idx];
416389c6957cSBarry Smith           sum1 -= v1[0] * tmp0;
416489c6957cSBarry Smith           sum2 -= v2[0] * tmp0;
416589c6957cSBarry Smith           sum3 -= v3[0] * tmp0;
416689c6957cSBarry Smith           sum4 -= v4[0] * tmp0;
416789c6957cSBarry Smith           sum5 -= v5[0] * tmp0;
416889c6957cSBarry Smith         }
416989c6957cSBarry Smith         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
417089c6957cSBarry Smith         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
417189c6957cSBarry Smith         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
417289c6957cSBarry Smith         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
417389c6957cSBarry Smith         x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
41749371c9d4SSatish Balay         ibdiag += 25;
41759371c9d4SSatish Balay         row += 5;
417689c6957cSBarry Smith         break;
4177d71ae5a4SJacob Faibussowitsch       default:
4178d71ae5a4SJacob Faibussowitsch         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
417989c6957cSBarry Smith       }
418089c6957cSBarry Smith     }
41819566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(a->nz));
41825850ef23SBarry Smith   }
41839566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
41849566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
41853ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
41862af78befSBarry Smith }
41872af78befSBarry Smith 
4188ff6a9541SJacob Faibussowitsch static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
4189d71ae5a4SJacob Faibussowitsch {
419089c6957cSBarry Smith   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
419189c6957cSBarry Smith   PetscScalar       *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5;
419289c6957cSBarry Smith   const MatScalar   *bdiag = a->inode.bdiag;
419389c6957cSBarry Smith   const PetscScalar *b;
419489c6957cSBarry Smith   PetscInt           m = a->inode.node_count, cnt = 0, i, row;
419589c6957cSBarry Smith   const PetscInt    *sizes = a->inode.size;
41962af78befSBarry Smith 
419789c6957cSBarry Smith   PetscFunctionBegin;
419808401ef6SPierre Jolivet   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
41999566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
42009566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
420189c6957cSBarry Smith   cnt = 0;
420289c6957cSBarry Smith   for (i = 0, row = 0; i < m; i++) {
420389c6957cSBarry Smith     switch (sizes[i]) {
420489c6957cSBarry Smith     case 1:
42059371c9d4SSatish Balay       x[row] = b[row] * bdiag[cnt++];
42069371c9d4SSatish Balay       row++;
420789c6957cSBarry Smith       break;
420889c6957cSBarry Smith     case 2:
42099371c9d4SSatish Balay       x1       = b[row];
42109371c9d4SSatish Balay       x2       = b[row + 1];
421189c6957cSBarry Smith       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
421289c6957cSBarry Smith       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
421389c6957cSBarry Smith       x[row++] = tmp1;
421489c6957cSBarry Smith       x[row++] = tmp2;
421589c6957cSBarry Smith       cnt += 4;
421689c6957cSBarry Smith       break;
421789c6957cSBarry Smith     case 3:
42189371c9d4SSatish Balay       x1       = b[row];
42199371c9d4SSatish Balay       x2       = b[row + 1];
42209371c9d4SSatish Balay       x3       = b[row + 2];
422189c6957cSBarry Smith       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
422289c6957cSBarry Smith       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
422389c6957cSBarry Smith       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
422489c6957cSBarry Smith       x[row++] = tmp1;
422589c6957cSBarry Smith       x[row++] = tmp2;
422689c6957cSBarry Smith       x[row++] = tmp3;
422789c6957cSBarry Smith       cnt += 9;
422889c6957cSBarry Smith       break;
422989c6957cSBarry Smith     case 4:
42309371c9d4SSatish Balay       x1       = b[row];
42319371c9d4SSatish Balay       x2       = b[row + 1];
42329371c9d4SSatish Balay       x3       = b[row + 2];
42339371c9d4SSatish Balay       x4       = b[row + 3];
423489c6957cSBarry Smith       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
423589c6957cSBarry Smith       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
423689c6957cSBarry Smith       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
423789c6957cSBarry Smith       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
423889c6957cSBarry Smith       x[row++] = tmp1;
423989c6957cSBarry Smith       x[row++] = tmp2;
424089c6957cSBarry Smith       x[row++] = tmp3;
424189c6957cSBarry Smith       x[row++] = tmp4;
424289c6957cSBarry Smith       cnt += 16;
424389c6957cSBarry Smith       break;
424489c6957cSBarry Smith     case 5:
42459371c9d4SSatish Balay       x1       = b[row];
42469371c9d4SSatish Balay       x2       = b[row + 1];
42479371c9d4SSatish Balay       x3       = b[row + 2];
42489371c9d4SSatish Balay       x4       = b[row + 3];
42499371c9d4SSatish Balay       x5       = b[row + 4];
425089c6957cSBarry Smith       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
425189c6957cSBarry Smith       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
425289c6957cSBarry Smith       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
425389c6957cSBarry Smith       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
425489c6957cSBarry Smith       tmp5     = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
425589c6957cSBarry Smith       x[row++] = tmp1;
425689c6957cSBarry Smith       x[row++] = tmp2;
425789c6957cSBarry Smith       x[row++] = tmp3;
425889c6957cSBarry Smith       x[row++] = tmp4;
425989c6957cSBarry Smith       x[row++] = tmp5;
426089c6957cSBarry Smith       cnt += 25;
426189c6957cSBarry Smith       break;
4262d71ae5a4SJacob Faibussowitsch     default:
4263d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
426489c6957cSBarry Smith     }
426589c6957cSBarry Smith   }
42669566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * cnt));
42679566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
42689566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
42693ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
427089c6957cSBarry Smith }
427189c6957cSBarry Smith 
4272d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
4273d71ae5a4SJacob Faibussowitsch {
4274b215bc84SStefano Zampini   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4275b215bc84SStefano Zampini 
4276b215bc84SStefano Zampini   PetscFunctionBegin;
4277b215bc84SStefano Zampini   a->inode.node_count       = 0;
4278b215bc84SStefano Zampini   a->inode.use              = PETSC_FALSE;
4279b215bc84SStefano Zampini   a->inode.checked          = PETSC_FALSE;
4280b215bc84SStefano Zampini   a->inode.mat_nonzerostate = -1;
4281b215bc84SStefano Zampini   A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4282b215bc84SStefano Zampini   A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4283b215bc84SStefano Zampini   A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4284b215bc84SStefano Zampini   A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4285b215bc84SStefano Zampini   A->ops->coloringpatch     = NULL;
4286b215bc84SStefano Zampini   A->ops->multdiagonalblock = NULL;
4287ad540459SPierre Jolivet   if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace;
42883ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4289b215bc84SStefano Zampini }
4290b215bc84SStefano Zampini 
42914c1414c8SBarry Smith /*
42924c1414c8SBarry Smith     samestructure indicates that the matrix has not changed its nonzero structure so we
42934c1414c8SBarry Smith     do not need to recompute the inodes
42944c1414c8SBarry Smith */
4295d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCheckInode(Mat A)
4296d71ae5a4SJacob Faibussowitsch {
42974c1414c8SBarry Smith   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
42988758e1faSBarry Smith   PetscInt        i, j, m, nzx, nzy, *ns, node_count, blk_size;
4299ace3abfcSBarry Smith   PetscBool       flag;
43008758e1faSBarry Smith   const PetscInt *idx, *idy, *ii;
43014c1414c8SBarry Smith 
43024c1414c8SBarry Smith   PetscFunctionBegin;
4303b215bc84SStefano Zampini   if (!a->inode.use) {
43049566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
43059566063dSJacob Faibussowitsch     PetscCall(PetscFree(a->inode.size));
43063ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
4307b215bc84SStefano Zampini   }
43083ba16761SJacob Faibussowitsch   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS);
43094c1414c8SBarry Smith 
4310d0f46423SBarry Smith   m = A->rmap->n;
43119566063dSJacob Faibussowitsch   if (!a->inode.size) PetscCall(PetscMalloc1(m + 1, &a->inode.size));
4312b215bc84SStefano Zampini   ns = a->inode.size;
43134c1414c8SBarry Smith 
43144c1414c8SBarry Smith   i          = 0;
43154c1414c8SBarry Smith   node_count = 0;
43164c1414c8SBarry Smith   idx        = a->j;
43174c1414c8SBarry Smith   ii         = a->i;
43186f2c871aSStefano Zampini   if (idx) {
43194c1414c8SBarry Smith     while (i < m) {            /* For each row */
43204c1414c8SBarry Smith       nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */
43214c1414c8SBarry Smith       /* Limits the number of elements in a node to 'a->inode.limit' */
43224c1414c8SBarry Smith       for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
43234c1414c8SBarry Smith         nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */
43244c1414c8SBarry Smith         if (nzy != nzx) break;
43254c1414c8SBarry Smith         idy += nzx; /* Same nonzero pattern */
43269566063dSJacob Faibussowitsch         PetscCall(PetscArraycmp(idx, idy, nzx, &flag));
43274c1414c8SBarry Smith         if (!flag) break;
43284c1414c8SBarry Smith       }
43294c1414c8SBarry Smith       ns[node_count++] = blk_size;
43304c1414c8SBarry Smith       idx += blk_size * nzx;
43314c1414c8SBarry Smith       i = j;
43324c1414c8SBarry Smith     }
43336f2c871aSStefano Zampini   }
43344c1414c8SBarry Smith   /* If not enough inodes found,, do not use inode version of the routines */
43356f2c871aSStefano Zampini   if (!m || !idx || node_count > .8 * m) {
43369566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
43379566063dSJacob Faibussowitsch     PetscCall(PetscFree(a->inode.size));
43389566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
43394c1414c8SBarry Smith   } else {
4340d5f3da31SBarry Smith     if (!A->factortype) {
4341375a6242SBarry Smith       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4342375a6242SBarry Smith       if (A->rmap->n == A->cmap->n) {
43434108e4d5SBarry Smith         A->ops->getrowij        = MatGetRowIJ_SeqAIJ_Inode;
43444108e4d5SBarry Smith         A->ops->restorerowij    = MatRestoreRowIJ_SeqAIJ_Inode;
43454108e4d5SBarry Smith         A->ops->getcolumnij     = MatGetColumnIJ_SeqAIJ_Inode;
43464108e4d5SBarry Smith         A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
43474108e4d5SBarry Smith         A->ops->coloringpatch   = MatColoringPatch_SeqAIJ_Inode;
4348375a6242SBarry Smith       }
4349d3ac4fa3SBarry Smith     } else {
4350d3ac4fa3SBarry Smith       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4351d3ac4fa3SBarry Smith     }
43524c1414c8SBarry Smith     a->inode.node_count = node_count;
43539566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
43544c1414c8SBarry Smith   }
4355be6adb11SBarry Smith   a->inode.checked          = PETSC_TRUE;
4356a02bda8eSBarry Smith   a->inode.mat_nonzerostate = A->nonzerostate;
43573ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
43584c1414c8SBarry Smith }
43594c1414c8SBarry Smith 
4360d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C)
4361d71ae5a4SJacob Faibussowitsch {
4362150f0143SBarry Smith   Mat         B = *C;
4363150f0143SBarry Smith   Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data;
4364150f0143SBarry Smith   PetscInt    m = A->rmap->n;
4365150f0143SBarry Smith 
4366150f0143SBarry Smith   PetscFunctionBegin;
4367150f0143SBarry Smith   c->inode.use              = a->inode.use;
4368150f0143SBarry Smith   c->inode.limit            = a->inode.limit;
4369150f0143SBarry Smith   c->inode.max_limit        = a->inode.max_limit;
4370ec710b6aSStefano Zampini   c->inode.checked          = PETSC_FALSE;
4371ec710b6aSStefano Zampini   c->inode.size             = NULL;
4372ec710b6aSStefano Zampini   c->inode.node_count       = 0;
4373ec710b6aSStefano Zampini   c->inode.ibdiagvalid      = PETSC_FALSE;
4374ec710b6aSStefano Zampini   c->inode.ibdiag           = NULL;
4375ec710b6aSStefano Zampini   c->inode.bdiag            = NULL;
4376ec710b6aSStefano Zampini   c->inode.mat_nonzerostate = -1;
4377b215bc84SStefano Zampini   if (a->inode.use) {
4378ec710b6aSStefano Zampini     if (a->inode.checked && a->inode.size) {
43799566063dSJacob Faibussowitsch       PetscCall(PetscMalloc1(m + 1, &c->inode.size));
43809566063dSJacob Faibussowitsch       PetscCall(PetscArraycpy(c->inode.size, a->inode.size, m + 1));
4381ec710b6aSStefano Zampini 
4382ec710b6aSStefano Zampini       c->inode.checked          = PETSC_TRUE;
4383ec710b6aSStefano Zampini       c->inode.node_count       = a->inode.node_count;
4384ec710b6aSStefano Zampini       c->inode.mat_nonzerostate = (*C)->nonzerostate;
4385ec710b6aSStefano Zampini     }
4386a02bda8eSBarry Smith     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
43872c451681SBarry Smith     if (!B->factortype) {
43882c451681SBarry Smith       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
43892c451681SBarry Smith       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
43902c451681SBarry Smith       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
43912c451681SBarry Smith       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
43922c451681SBarry Smith       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
43932c451681SBarry Smith       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4394150f0143SBarry Smith     } else {
43952c451681SBarry Smith       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4396150f0143SBarry Smith     }
4397150f0143SBarry Smith   }
43983ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4399150f0143SBarry Smith }
4400150f0143SBarry Smith 
4401d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row)
4402d71ae5a4SJacob Faibussowitsch {
44038758e1faSBarry Smith   PetscInt        k;
44048758e1faSBarry Smith   const PetscInt *vi;
44056e111a19SKarl Rupp 
440617454e89SShri Abhyankar   PetscFunctionBegin;
440717454e89SShri Abhyankar   vi = aj + ai[row];
440817454e89SShri Abhyankar   for (k = 0; k < nzl; k++) cols[k] = vi[k];
440917454e89SShri Abhyankar   vi        = aj + adiag[row];
441017454e89SShri Abhyankar   cols[nzl] = vi[0];
441117454e89SShri Abhyankar   vi        = aj + adiag[row + 1] + 1;
441217454e89SShri Abhyankar   for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k];
44133ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
441417454e89SShri Abhyankar }
44156936b636SHong Zhang /*
4416a02bda8eSBarry Smith    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4417a02bda8eSBarry Smith    Modified from MatSeqAIJCheckInode().
44186936b636SHong Zhang 
44196936b636SHong Zhang    Input Parameters:
4420abb87a52SBarry Smith .  Mat A - ILU or LU matrix factor
4421abb87a52SBarry Smith 
44226936b636SHong Zhang */
4423d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4424d71ae5a4SJacob Faibussowitsch {
4425019b515eSShri Abhyankar   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4426019b515eSShri Abhyankar   PetscInt        i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size;
44278758e1faSBarry Smith   PetscInt       *cols1, *cols2, *ns;
44288758e1faSBarry Smith   const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag;
4429ace3abfcSBarry Smith   PetscBool       flag;
4430019b515eSShri Abhyankar 
4431019b515eSShri Abhyankar   PetscFunctionBegin;
44323ba16761SJacob Faibussowitsch   if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS);
44333ba16761SJacob Faibussowitsch   if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS);
4434019b515eSShri Abhyankar 
4435019b515eSShri Abhyankar   m = A->rmap->n;
44362205254eSKarl Rupp   if (a->inode.size) ns = a->inode.size;
443748a46eb9SPierre Jolivet   else PetscCall(PetscMalloc1(m + 1, &ns));
4438019b515eSShri Abhyankar 
4439019b515eSShri Abhyankar   i          = 0;
4440019b515eSShri Abhyankar   node_count = 0;
44419566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(m, &cols1, m, &cols2));
4442019b515eSShri Abhyankar   while (i < m) {                       /* For each row */
4443019b515eSShri Abhyankar     nzl1 = ai[i + 1] - ai[i];           /* Number of nonzeros in L */
4444019b515eSShri Abhyankar     nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/
4445019b515eSShri Abhyankar     nzx  = nzl1 + nzu1 + 1;
44463ba16761SJacob Faibussowitsch     PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i));
4447019b515eSShri Abhyankar 
4448019b515eSShri Abhyankar     /* Limits the number of elements in a node to 'a->inode.limit' */
4449019b515eSShri Abhyankar     for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4450019b515eSShri Abhyankar       nzl2 = ai[j + 1] - ai[j];
4451019b515eSShri Abhyankar       nzu2 = adiag[j] - adiag[j + 1] - 1;
4452019b515eSShri Abhyankar       nzy  = nzl2 + nzu2 + 1;
4453019b515eSShri Abhyankar       if (nzy != nzx) break;
44549566063dSJacob Faibussowitsch       PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j));
44559566063dSJacob Faibussowitsch       PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag));
44568758e1faSBarry Smith       if (!flag) break;
4457019b515eSShri Abhyankar     }
4458019b515eSShri Abhyankar     ns[node_count++] = blk_size;
4459019b515eSShri Abhyankar     i                = j;
4460019b515eSShri Abhyankar   }
44619566063dSJacob Faibussowitsch   PetscCall(PetscFree2(cols1, cols2));
4462019b515eSShri Abhyankar   /* If not enough inodes found,, do not use inode version of the routines */
4463be6adb11SBarry Smith   if (!m || node_count > .8 * m) {
44649566063dSJacob Faibussowitsch     PetscCall(PetscFree(ns));
44652205254eSKarl Rupp 
4466019b515eSShri Abhyankar     a->inode.node_count = 0;
44670298fd71SBarry Smith     a->inode.size       = NULL;
4468019b515eSShri Abhyankar     a->inode.use        = PETSC_FALSE;
44692205254eSKarl Rupp 
44709566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4471019b515eSShri Abhyankar   } else {
4472f4259b30SLisandro Dalcin     A->ops->mult              = NULL;
4473f4259b30SLisandro Dalcin     A->ops->sor               = NULL;
4474f4259b30SLisandro Dalcin     A->ops->multadd           = NULL;
4475f4259b30SLisandro Dalcin     A->ops->getrowij          = NULL;
4476f4259b30SLisandro Dalcin     A->ops->restorerowij      = NULL;
4477f4259b30SLisandro Dalcin     A->ops->getcolumnij       = NULL;
4478f4259b30SLisandro Dalcin     A->ops->restorecolumnij   = NULL;
4479f4259b30SLisandro Dalcin     A->ops->coloringpatch     = NULL;
4480f4259b30SLisandro Dalcin     A->ops->multdiagonalblock = NULL;
4481019b515eSShri Abhyankar     a->inode.node_count       = node_count;
4482019b515eSShri Abhyankar     a->inode.size             = ns;
44832205254eSKarl Rupp 
44849566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4485019b515eSShri Abhyankar   }
4486be6adb11SBarry Smith   a->inode.checked = PETSC_TRUE;
44873ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4488019b515eSShri Abhyankar }
4489019b515eSShri Abhyankar 
4490d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4491d71ae5a4SJacob Faibussowitsch {
4492acf2f550SJed Brown   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4493acf2f550SJed Brown 
4494acf2f550SJed Brown   PetscFunctionBegin;
4495acf2f550SJed Brown   a->inode.ibdiagvalid = PETSC_FALSE;
44963ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4497acf2f550SJed Brown }
4498acf2f550SJed Brown 
44994c1414c8SBarry Smith /*
45004c1414c8SBarry Smith      This is really ugly. if inodes are used this replaces the
45014c1414c8SBarry Smith   permutations with ones that correspond to rows/cols of the matrix
4502*467446fbSPierre Jolivet   rather than inode blocks
45034c1414c8SBarry Smith */
4504d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm)
4505d71ae5a4SJacob Faibussowitsch {
45064c1414c8SBarry Smith   PetscFunctionBegin;
4507cac4c232SBarry Smith   PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm));
45083ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
45094c1414c8SBarry Smith }
45104c1414c8SBarry Smith 
4511d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm)
4512d71ae5a4SJacob Faibussowitsch {
45134c1414c8SBarry Smith   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
45145d0c19d7SBarry Smith   PetscInt        m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count;
45155d0c19d7SBarry Smith   const PetscInt *ridx, *cidx;
45164c1414c8SBarry Smith   PetscInt        row, col, *permr, *permc, *ns_row = a->inode.size, *tns, start_val, end_val, indx;
45174c1414c8SBarry Smith   PetscInt        nslim_col, *ns_col;
45184c1414c8SBarry Smith   IS              ris = *rperm, cis = *cperm;
45194c1414c8SBarry Smith 
45204c1414c8SBarry Smith   PetscFunctionBegin;
45213ba16761SJacob Faibussowitsch   if (!a->inode.size) PetscFunctionReturn(PETSC_SUCCESS);           /* no inodes so return */
45223ba16761SJacob Faibussowitsch   if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */
45234c1414c8SBarry Smith 
45249566063dSJacob Faibussowitsch   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
45259566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(((nslim_row > nslim_col) ? nslim_row : nslim_col) + 1, &tns));
45269566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(m, &permr, n, &permc));
45274c1414c8SBarry Smith 
45289566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(ris, &ridx));
45299566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(cis, &cidx));
45304c1414c8SBarry Smith 
4531baca6076SPierre Jolivet   /* Form the inode structure for the rows of permuted matrix using inv perm*/
45324c1414c8SBarry Smith   for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + ns_row[i];
45334c1414c8SBarry Smith 
45344c1414c8SBarry Smith   /* Construct the permutations for rows*/
45354c1414c8SBarry Smith   for (i = 0, row = 0; i < nslim_row; ++i) {
45364c1414c8SBarry Smith     indx      = ridx[i];
45374c1414c8SBarry Smith     start_val = tns[indx];
45384c1414c8SBarry Smith     end_val   = tns[indx + 1];
45394c1414c8SBarry Smith     for (j = start_val; j < end_val; ++j, ++row) permr[row] = j;
45404c1414c8SBarry Smith   }
45414c1414c8SBarry Smith 
45424c1414c8SBarry Smith   /* Form the inode structure for the columns of permuted matrix using inv perm*/
45434c1414c8SBarry Smith   for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + ns_col[i];
45444c1414c8SBarry Smith 
45454c1414c8SBarry Smith   /* Construct permutations for columns */
45464c1414c8SBarry Smith   for (i = 0, col = 0; i < nslim_col; ++i) {
45474c1414c8SBarry Smith     indx      = cidx[i];
45484c1414c8SBarry Smith     start_val = tns[indx];
45494c1414c8SBarry Smith     end_val   = tns[indx + 1];
45504c1414c8SBarry Smith     for (j = start_val; j < end_val; ++j, ++col) permc[col] = j;
45514c1414c8SBarry Smith   }
45524c1414c8SBarry Smith 
45539566063dSJacob Faibussowitsch   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm));
45549566063dSJacob Faibussowitsch   PetscCall(ISSetPermutation(*rperm));
45559566063dSJacob Faibussowitsch   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm));
45569566063dSJacob Faibussowitsch   PetscCall(ISSetPermutation(*cperm));
45574c1414c8SBarry Smith 
45589566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(ris, &ridx));
45599566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(cis, &cidx));
45604c1414c8SBarry Smith 
45619566063dSJacob Faibussowitsch   PetscCall(PetscFree(ns_col));
45629566063dSJacob Faibussowitsch   PetscCall(PetscFree2(permr, permc));
45639566063dSJacob Faibussowitsch   PetscCall(ISDestroy(&cis));
45649566063dSJacob Faibussowitsch   PetscCall(ISDestroy(&ris));
45659566063dSJacob Faibussowitsch   PetscCall(PetscFree(tns));
45663ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
45674c1414c8SBarry Smith }
45684c1414c8SBarry Smith 
45694c1414c8SBarry Smith /*@C
457011a5261eSBarry Smith   MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes
45714c1414c8SBarry Smith 
45723f9fe445SBarry Smith   Not Collective
45734c1414c8SBarry Smith 
45744c1414c8SBarry Smith   Input Parameter:
457511a5261eSBarry Smith . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ`
45764c1414c8SBarry Smith 
4577d8d19677SJose E. Roman   Output Parameters:
45784c1414c8SBarry Smith + node_count - no of inodes present in the matrix.
45792ef1f0ffSBarry Smith . sizes      - an array of size `node_count`, with the sizes of each inode.
45804c1414c8SBarry Smith - limit      - the max size used to generate the inodes.
45814c1414c8SBarry Smith 
45824c1414c8SBarry Smith   Level: advanced
45834c1414c8SBarry Smith 
458411a5261eSBarry Smith   Note:
45854c1414c8SBarry Smith   It should be called after the matrix is assembled.
45864c1414c8SBarry Smith   The contents of the sizes[] array should not be changed.
45872ef1f0ffSBarry Smith   `NULL` may be passed for information not needed
45884c1414c8SBarry Smith 
45891cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatGetInfo()`
45904c1414c8SBarry Smith @*/
4591d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4592d71ae5a4SJacob Faibussowitsch {
45935f80ce2aSJacob Faibussowitsch   PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *);
45944c1414c8SBarry Smith 
45954c1414c8SBarry Smith   PetscFunctionBegin;
45965f80ce2aSJacob Faibussowitsch   PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
45979566063dSJacob Faibussowitsch   PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f));
45989566063dSJacob Faibussowitsch   if (f) PetscCall((*f)(A, node_count, sizes, limit));
45993ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
46004c1414c8SBarry Smith }
46014c1414c8SBarry Smith 
4602d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4603d71ae5a4SJacob Faibussowitsch {
46044c1414c8SBarry Smith   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
46054c1414c8SBarry Smith 
46064c1414c8SBarry Smith   PetscFunctionBegin;
46074c1414c8SBarry Smith   if (node_count) *node_count = a->inode.node_count;
46084c1414c8SBarry Smith   if (sizes) *sizes = a->inode.size;
46094c1414c8SBarry Smith   if (limit) *limit = a->inode.limit;
46103ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
46114c1414c8SBarry Smith }
4612