xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 4d12350b4d1d562f24a95bd06d554a68a71c004f)
/*
  This file provides high performance routines for the Inode format (compressed sparse row)
  by taking advantage of rows with identical nonzero structure (I-nodes).
*/
5c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h>
6fb56d528SJed Brown #if defined(PETSC_HAVE_XMMINTRIN_H)
7fb56d528SJed Brown   #include <xmmintrin.h>
8fb56d528SJed Brown #endif
94c1414c8SBarry Smith 
10d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns)
11d71ae5a4SJacob Faibussowitsch {
124c1414c8SBarry Smith   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
134c1414c8SBarry Smith   PetscInt    i, count, m, n, min_mn, *ns_row, *ns_col;
144c1414c8SBarry Smith 
154c1414c8SBarry Smith   PetscFunctionBegin;
16d0f46423SBarry Smith   n = A->cmap->n;
17d0f46423SBarry Smith   m = A->rmap->n;
18*4d12350bSJunchao Zhang   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
19*4d12350bSJunchao Zhang   ns_row = a->inode.size_csr;
204c1414c8SBarry Smith 
214c1414c8SBarry Smith   min_mn = (m < n) ? m : n;
224c1414c8SBarry Smith   if (!ns) {
23*4d12350bSJunchao Zhang     for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++);
24fbccb6d4SPierre Jolivet     for (; count + 1 < n; count++, i++);
25ad540459SPierre Jolivet     if (count < n) i++;
264c1414c8SBarry Smith     *size = i;
273ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
284c1414c8SBarry Smith   }
299566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(n + 1, &ns_col));
30*4d12350bSJunchao Zhang   ns_col[0] = 0;
314c1414c8SBarry Smith 
324c1414c8SBarry Smith   /* Use the same row structure wherever feasible. */
33*4d12350bSJunchao Zhang   for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++) ns_col[i + 1] = ns_row[i + 1];
344c1414c8SBarry Smith 
354c1414c8SBarry Smith   /* if m < n; pad up the remainder with inode_limit */
36*4d12350bSJunchao Zhang   for (; count + 1 < n; count++, i++) ns_col[i + 1] = ns_col[i] + 1;
37aaa8cc7dSPierre Jolivet   /* The last node is the odd ball. pad it up with the remaining rows; */
384c1414c8SBarry Smith   if (count < n) {
39*4d12350bSJunchao Zhang     ns_col[i + 1] = ns_col[i] + (n - count);
404c1414c8SBarry Smith     i++;
414c1414c8SBarry Smith   } else if (count > n) {
424c1414c8SBarry Smith     /* Adjust for the over estimation */
43*4d12350bSJunchao Zhang     ns_col[i] += n - count;
444c1414c8SBarry Smith   }
454c1414c8SBarry Smith   *size = i;
464c1414c8SBarry Smith   *ns   = ns_col;
473ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
484c1414c8SBarry Smith }
494c1414c8SBarry Smith 
504c1414c8SBarry Smith /*
514c1414c8SBarry Smith       This builds symmetric version of nonzero structure,
524c1414c8SBarry Smith */
53d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
54d71ae5a4SJacob Faibussowitsch {
554c1414c8SBarry Smith   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
568758e1faSBarry Smith   PetscInt       *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n;
57*4d12350bSJunchao Zhang   PetscInt       *tns, *tvc, *ns_row = a->inode.size_csr, *ns_col, nsz, i1, i2;
588758e1faSBarry Smith   const PetscInt *j, *jmax, *ai = a->i, *aj = a->j;
594c1414c8SBarry Smith 
604c1414c8SBarry Smith   PetscFunctionBegin;
614c1414c8SBarry Smith   nslim_row = a->inode.node_count;
62d0f46423SBarry Smith   m         = A->rmap->n;
63d0f46423SBarry Smith   n         = A->cmap->n;
6408401ef6SPierre Jolivet   PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
65*4d12350bSJunchao Zhang   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
664c1414c8SBarry Smith 
674c1414c8SBarry Smith   /* Use the row_inode as column_inode */
684c1414c8SBarry Smith   nslim_col = nslim_row;
694c1414c8SBarry Smith   ns_col    = ns_row;
704c1414c8SBarry Smith 
7135cb6cd3SPierre Jolivet   /* allocate space for reformatted inode structure */
729566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
73*4d12350bSJunchao Zhang   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_row[i1 + 1] - ns_row[i1]);
744c1414c8SBarry Smith 
754c1414c8SBarry Smith   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
76*4d12350bSJunchao Zhang     nsz = ns_col[i1 + 1] - ns_col[i1];
772205254eSKarl Rupp     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
784c1414c8SBarry Smith   }
794c1414c8SBarry Smith   /* allocate space for row pointers */
809566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
814c1414c8SBarry Smith   *iia = ia;
829566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nslim_row + 1, &work));
834c1414c8SBarry Smith 
844c1414c8SBarry Smith   /* determine the number of columns in each row */
854c1414c8SBarry Smith   ia[0] = oshift;
86*4d12350bSJunchao Zhang   for (i1 = 0; i1 < nslim_row; i1++) {
87*4d12350bSJunchao Zhang     row  = ns_row[i1];
884c1414c8SBarry Smith     j    = aj + ai[row] + ishift;
894c1414c8SBarry Smith     jmax = aj + ai[row + 1] + ishift;
9083fed2edSSatish Balay     if (j == jmax) continue; /* empty row */
914c1414c8SBarry Smith     col = *j++ + ishift;
924c1414c8SBarry Smith     i2  = tvc[col];
936aad120cSJose E. Roman     while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */
944c1414c8SBarry Smith       ia[i1 + 1]++;
954c1414c8SBarry Smith       ia[i2 + 1]++;
964c1414c8SBarry Smith       i2++; /* Start col of next node */
9790d2dec7SBarry Smith       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j;
984c1414c8SBarry Smith       i2 = tvc[col];
994c1414c8SBarry Smith     }
1004c1414c8SBarry Smith     if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */
1014c1414c8SBarry Smith   }
1024c1414c8SBarry Smith 
1034c1414c8SBarry Smith   /* shift ia[i] to point to next row */
1044c1414c8SBarry Smith   for (i1 = 1; i1 < nslim_row + 1; i1++) {
1054c1414c8SBarry Smith     row = ia[i1 - 1];
1064c1414c8SBarry Smith     ia[i1] += row;
1074c1414c8SBarry Smith     work[i1 - 1] = row - oshift;
1084c1414c8SBarry Smith   }
1094c1414c8SBarry Smith 
1104c1414c8SBarry Smith   /* allocate space for column pointers */
1114c1414c8SBarry Smith   nz = ia[nslim_row] + (!ishift);
1129566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nz, &ja));
1134c1414c8SBarry Smith   *jja = ja;
1144c1414c8SBarry Smith 
1154c1414c8SBarry Smith   /* loop over lower triangular part putting into ja */
116*4d12350bSJunchao Zhang   for (i1 = 0; i1 < nslim_row; i1++) {
117*4d12350bSJunchao Zhang     row  = ns_row[i1];
1184c1414c8SBarry Smith     j    = aj + ai[row] + ishift;
1194c1414c8SBarry Smith     jmax = aj + ai[row + 1] + ishift;
12083fed2edSSatish Balay     if (j == jmax) continue; /* empty row */
1214c1414c8SBarry Smith     col = *j++ + ishift;
1224c1414c8SBarry Smith     i2  = tvc[col];
1234c1414c8SBarry Smith     while (i2 < i1 && j < jmax) {
1244c1414c8SBarry Smith       ja[work[i2]++] = i1 + oshift;
1254c1414c8SBarry Smith       ja[work[i1]++] = i2 + oshift;
1264c1414c8SBarry Smith       ++i2;
12790d2dec7SBarry Smith       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */
1284c1414c8SBarry Smith       i2 = tvc[col];
1294c1414c8SBarry Smith     }
1304c1414c8SBarry Smith     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
1314c1414c8SBarry Smith   }
1329566063dSJacob Faibussowitsch   PetscCall(PetscFree(work));
1339566063dSJacob Faibussowitsch   PetscCall(PetscFree2(tns, tvc));
1343ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
1354c1414c8SBarry Smith }
1364c1414c8SBarry Smith 
1374c1414c8SBarry Smith /*
1384c1414c8SBarry Smith       This builds nonsymmetric version of nonzero structure,
1394c1414c8SBarry Smith */
140d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
141d71ae5a4SJacob Faibussowitsch {
1424c1414c8SBarry Smith   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
1438758e1faSBarry Smith   PetscInt       *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col;
1448758e1faSBarry Smith   PetscInt       *tns, *tvc, nsz, i1, i2;
145*4d12350bSJunchao Zhang   const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size_csr;
1464c1414c8SBarry Smith 
1474c1414c8SBarry Smith   PetscFunctionBegin;
148*4d12350bSJunchao Zhang   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
1494c1414c8SBarry Smith   nslim_row = a->inode.node_count;
150d0f46423SBarry Smith   n         = A->cmap->n;
1514c1414c8SBarry Smith 
1524c1414c8SBarry Smith   /* Create The column_inode for this matrix */
1539566063dSJacob Faibussowitsch   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
1544c1414c8SBarry Smith 
15535cb6cd3SPierre Jolivet   /* allocate space for reformatted column_inode structure */
1569566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
157*4d12350bSJunchao Zhang   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]);
1584c1414c8SBarry Smith 
1594c1414c8SBarry Smith   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
160*4d12350bSJunchao Zhang     nsz = ns_col[i1 + 1] - ns_col[i1];
1612205254eSKarl Rupp     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
1624c1414c8SBarry Smith   }
1634c1414c8SBarry Smith   /* allocate space for row pointers */
1649566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
1654c1414c8SBarry Smith   *iia = ia;
1669566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nslim_row + 1, &work));
1674c1414c8SBarry Smith 
1684c1414c8SBarry Smith   /* determine the number of columns in each row */
1694c1414c8SBarry Smith   ia[0] = oshift;
170*4d12350bSJunchao Zhang   for (i1 = 0; i1 < nslim_row; i1++) {
171*4d12350bSJunchao Zhang     row = ns_row[i1];
1724c1414c8SBarry Smith     j   = aj + ai[row] + ishift;
17383fed2edSSatish Balay     nz  = ai[row + 1] - ai[row];
17483fed2edSSatish Balay     if (!nz) continue; /* empty row */
1754c1414c8SBarry Smith     col = *j++ + ishift;
1764c1414c8SBarry Smith     i2  = tvc[col];
1776aad120cSJose E. Roman     while (nz-- > 0) { /* off-diagonal elements */
1784c1414c8SBarry Smith       ia[i1 + 1]++;
1794c1414c8SBarry Smith       i2++; /* Start col of next node */
180a8e3a797SJed Brown       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
1814c1414c8SBarry Smith       if (nz > 0) i2 = tvc[col];
1824c1414c8SBarry Smith     }
1834c1414c8SBarry Smith   }
1844c1414c8SBarry Smith 
1854c1414c8SBarry Smith   /* shift ia[i] to point to next row */
1864c1414c8SBarry Smith   for (i1 = 1; i1 < nslim_row + 1; i1++) {
1874c1414c8SBarry Smith     row = ia[i1 - 1];
1884c1414c8SBarry Smith     ia[i1] += row;
1894c1414c8SBarry Smith     work[i1 - 1] = row - oshift;
1904c1414c8SBarry Smith   }
1914c1414c8SBarry Smith 
1924c1414c8SBarry Smith   /* allocate space for column pointers */
1934c1414c8SBarry Smith   nz = ia[nslim_row] + (!ishift);
1949566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nz, &ja));
1954c1414c8SBarry Smith   *jja = ja;
1964c1414c8SBarry Smith 
1974c1414c8SBarry Smith   /* loop over matrix putting into ja */
198*4d12350bSJunchao Zhang   for (i1 = 0; i1 < nslim_row; i1++) {
199*4d12350bSJunchao Zhang     row = ns_row[i1];
2004c1414c8SBarry Smith     j   = aj + ai[row] + ishift;
20183fed2edSSatish Balay     nz  = ai[row + 1] - ai[row];
20283fed2edSSatish Balay     if (!nz) continue; /* empty row */
2034c1414c8SBarry Smith     col = *j++ + ishift;
2044c1414c8SBarry Smith     i2  = tvc[col];
2054c1414c8SBarry Smith     while (nz-- > 0) {
2064c1414c8SBarry Smith       ja[work[i1]++] = i2 + oshift;
2074c1414c8SBarry Smith       ++i2;
208a8e3a797SJed Brown       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
2094c1414c8SBarry Smith       if (nz > 0) i2 = tvc[col];
2104c1414c8SBarry Smith     }
2114c1414c8SBarry Smith   }
2129566063dSJacob Faibussowitsch   PetscCall(PetscFree(ns_col));
2139566063dSJacob Faibussowitsch   PetscCall(PetscFree(work));
2149566063dSJacob Faibussowitsch   PetscCall(PetscFree2(tns, tvc));
2153ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2164c1414c8SBarry Smith }
2174c1414c8SBarry Smith 
218d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
219d71ae5a4SJacob Faibussowitsch {
2204c1414c8SBarry Smith   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2214c1414c8SBarry Smith 
2224c1414c8SBarry Smith   PetscFunctionBegin;
22350ba90b4SBarry Smith   if (n) *n = a->inode.node_count;
2243ba16761SJacob Faibussowitsch   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
2258f7157efSSatish Balay   if (!blockcompressed) {
2269566063dSJacob Faibussowitsch     PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
2278f7157efSSatish Balay   } else if (symmetric) {
2289566063dSJacob Faibussowitsch     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
2294c1414c8SBarry Smith   } else {
2309566063dSJacob Faibussowitsch     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
2314c1414c8SBarry Smith   }
2323ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2334c1414c8SBarry Smith }
2344c1414c8SBarry Smith 
235d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
236d71ae5a4SJacob Faibussowitsch {
2374c1414c8SBarry Smith   PetscFunctionBegin;
2383ba16761SJacob Faibussowitsch   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
2398f7157efSSatish Balay 
2408f7157efSSatish Balay   if (!blockcompressed) {
2419566063dSJacob Faibussowitsch     PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
2428f7157efSSatish Balay   } else {
2439566063dSJacob Faibussowitsch     PetscCall(PetscFree(*ia));
2449566063dSJacob Faibussowitsch     PetscCall(PetscFree(*ja));
2458f7157efSSatish Balay   }
2463ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2474c1414c8SBarry Smith }
2484c1414c8SBarry Smith 
249d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
250d71ae5a4SJacob Faibussowitsch {
2514c1414c8SBarry Smith   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2524c1414c8SBarry Smith   PetscInt   *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col;
253*4d12350bSJunchao Zhang   PetscInt   *tns, *tvc, *ns_row = a->inode.size_csr, nsz, i1, i2, *ai = a->i, *aj = a->j;
2544c1414c8SBarry Smith 
2554c1414c8SBarry Smith   PetscFunctionBegin;
256*4d12350bSJunchao Zhang   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2574c1414c8SBarry Smith   nslim_row = a->inode.node_count;
258d0f46423SBarry Smith   n         = A->cmap->n;
2594c1414c8SBarry Smith 
2604c1414c8SBarry Smith   /* Create The column_inode for this matrix */
2619566063dSJacob Faibussowitsch   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
2624c1414c8SBarry Smith 
26335cb6cd3SPierre Jolivet   /* allocate space for reformatted column_inode structure */
2649566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
265*4d12350bSJunchao Zhang   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]);
2664c1414c8SBarry Smith 
2674c1414c8SBarry Smith   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
268*4d12350bSJunchao Zhang     nsz = ns_col[i1 + 1] - ns_col[i1];
2692205254eSKarl Rupp     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
2704c1414c8SBarry Smith   }
2714c1414c8SBarry Smith   /* allocate space for column pointers */
2729566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(nslim_col + 1, &ia));
2734c1414c8SBarry Smith   *iia = ia;
2749566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nslim_col + 1, &work));
2754c1414c8SBarry Smith 
2764c1414c8SBarry Smith   /* determine the number of columns in each row */
2774c1414c8SBarry Smith   ia[0] = oshift;
278*4d12350bSJunchao Zhang   for (i1 = 0; i1 < nslim_row; i1++) {
279*4d12350bSJunchao Zhang     row = ns_row[i1];
2804c1414c8SBarry Smith     j   = aj + ai[row] + ishift;
2814c1414c8SBarry Smith     col = *j++ + ishift;
2824c1414c8SBarry Smith     i2  = tvc[col];
2834c1414c8SBarry Smith     nz  = ai[row + 1] - ai[row];
2846aad120cSJose E. Roman     while (nz-- > 0) { /* off-diagonal elements */
2854c1414c8SBarry Smith       /* ia[i1+1]++; */
2864c1414c8SBarry Smith       ia[i2 + 1]++;
2874c1414c8SBarry Smith       i2++;
288a8e3a797SJed Brown       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
2894c1414c8SBarry Smith       if (nz > 0) i2 = tvc[col];
2904c1414c8SBarry Smith     }
2914c1414c8SBarry Smith   }
2924c1414c8SBarry Smith 
2934c1414c8SBarry Smith   /* shift ia[i] to point to next col */
2944c1414c8SBarry Smith   for (i1 = 1; i1 < nslim_col + 1; i1++) {
2954c1414c8SBarry Smith     col = ia[i1 - 1];
2964c1414c8SBarry Smith     ia[i1] += col;
2974c1414c8SBarry Smith     work[i1 - 1] = col - oshift;
2984c1414c8SBarry Smith   }
2994c1414c8SBarry Smith 
3004c1414c8SBarry Smith   /* allocate space for column pointers */
3014c1414c8SBarry Smith   nz = ia[nslim_col] + (!ishift);
3029566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nz, &ja));
3034c1414c8SBarry Smith   *jja = ja;
3044c1414c8SBarry Smith 
3054c1414c8SBarry Smith   /* loop over matrix putting into ja */
306*4d12350bSJunchao Zhang   for (i1 = 0; i1 < nslim_row; i1++) {
307*4d12350bSJunchao Zhang     row = ns_row[i1];
3084c1414c8SBarry Smith     j   = aj + ai[row] + ishift;
3094c1414c8SBarry Smith     col = *j++ + ishift;
3104c1414c8SBarry Smith     i2  = tvc[col];
3114c1414c8SBarry Smith     nz  = ai[row + 1] - ai[row];
3124c1414c8SBarry Smith     while (nz-- > 0) {
3134c1414c8SBarry Smith       /* ja[work[i1]++] = i2 + oshift; */
3144c1414c8SBarry Smith       ja[work[i2]++] = i1 + oshift;
3154c1414c8SBarry Smith       i2++;
316a8e3a797SJed Brown       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
3174c1414c8SBarry Smith       if (nz > 0) i2 = tvc[col];
3184c1414c8SBarry Smith     }
3194c1414c8SBarry Smith   }
3209566063dSJacob Faibussowitsch   PetscCall(PetscFree(ns_col));
3219566063dSJacob Faibussowitsch   PetscCall(PetscFree(work));
3229566063dSJacob Faibussowitsch   PetscCall(PetscFree2(tns, tvc));
3233ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3244c1414c8SBarry Smith }
3254c1414c8SBarry Smith 
326d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
327d71ae5a4SJacob Faibussowitsch {
3284c1414c8SBarry Smith   PetscFunctionBegin;
3299566063dSJacob Faibussowitsch   PetscCall(MatCreateColInode_Private(A, n, NULL));
3303ba16761SJacob Faibussowitsch   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
3314c1414c8SBarry Smith 
3328f7157efSSatish Balay   if (!blockcompressed) {
3339566063dSJacob Faibussowitsch     PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
3348f7157efSSatish Balay   } else if (symmetric) {
335a5b23f4aSJose E. Roman     /* Since the indices are symmetric it doesn't matter */
3369566063dSJacob Faibussowitsch     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
3374c1414c8SBarry Smith   } else {
3389566063dSJacob Faibussowitsch     PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
3394c1414c8SBarry Smith   }
3403ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3414c1414c8SBarry Smith }
3424c1414c8SBarry Smith 
343d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
344d71ae5a4SJacob Faibussowitsch {
3454c1414c8SBarry Smith   PetscFunctionBegin;
3463ba16761SJacob Faibussowitsch   if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
3478f7157efSSatish Balay   if (!blockcompressed) {
3489566063dSJacob Faibussowitsch     PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
3498f7157efSSatish Balay   } else {
3509566063dSJacob Faibussowitsch     PetscCall(PetscFree(*ia));
3519566063dSJacob Faibussowitsch     PetscCall(PetscFree(*ja));
3528f7157efSSatish Balay   }
3533ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3544c1414c8SBarry Smith }
3554c1414c8SBarry Smith 
356d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy)
357d71ae5a4SJacob Faibussowitsch {
3584c1414c8SBarry Smith   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
3594c1414c8SBarry Smith   PetscScalar        sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
360d9fead3dSBarry Smith   PetscScalar       *y;
361dd6ea824SBarry Smith   const PetscScalar *x;
362dd6ea824SBarry Smith   const MatScalar   *v1, *v2, *v3, *v4, *v5;
3638758e1faSBarry Smith   PetscInt           i1, i2, n, i, row, node_max, nsz, sz, nonzerorow = 0;
3648758e1faSBarry Smith   const PetscInt    *idx, *ns, *ii;
3654c1414c8SBarry Smith 
3664c1414c8SBarry Smith #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
3674c1414c8SBarry Smith   #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5)
3684c1414c8SBarry Smith #endif
3694c1414c8SBarry Smith 
3704c1414c8SBarry Smith   PetscFunctionBegin;
371*4d12350bSJunchao Zhang   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
3724c1414c8SBarry Smith   node_max = a->inode.node_count;
373*4d12350bSJunchao Zhang   ns       = a->inode.size_csr; /* Node Size array */
3749566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
3759566063dSJacob Faibussowitsch   PetscCall(VecGetArray(yy, &y));
3764c1414c8SBarry Smith   idx = a->j;
3774c1414c8SBarry Smith   v1  = a->a;
3784c1414c8SBarry Smith   ii  = a->i;
3794c1414c8SBarry Smith 
3804c1414c8SBarry Smith   for (i = 0, row = 0; i < node_max; ++i) {
381*4d12350bSJunchao Zhang     nsz = ns[i + 1] - ns[i];
3824c1414c8SBarry Smith     n   = ii[1] - ii[0];
38398c9bda7SSatish Balay     nonzerorow += (n > 0) * nsz;
3844c1414c8SBarry Smith     ii += nsz;
38550d8bf02SJed Brown     PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA);      /* Prefetch the indices for the block row after the current one */
38650d8bf02SJed Brown     PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
3874c1414c8SBarry Smith     sz = n;                                                                /* No of non zeros in this row */
3884c1414c8SBarry Smith                                                                            /* Switch on the size of Node */
3894c1414c8SBarry Smith     switch (nsz) {                                                         /* Each loop in 'case' is unrolled */
3904c1414c8SBarry Smith     case 1:
39175567043SBarry Smith       sum1 = 0.;
3924c1414c8SBarry Smith 
3934c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
3944c1414c8SBarry Smith         i1 = idx[0]; /* The instructions are ordered to */
3954c1414c8SBarry Smith         i2 = idx[1]; /* make the compiler's job easy */
3964c1414c8SBarry Smith         idx += 2;
3974c1414c8SBarry Smith         tmp0 = x[i1];
3984c1414c8SBarry Smith         tmp1 = x[i2];
3999371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
4009371c9d4SSatish Balay         v1 += 2;
4014c1414c8SBarry Smith       }
4024c1414c8SBarry Smith 
4034c1414c8SBarry Smith       if (n == sz - 1) { /* Take care of the last nonzero  */
4044c1414c8SBarry Smith         tmp0 = x[*idx++];
4054c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
4064c1414c8SBarry Smith       }
4074c1414c8SBarry Smith       y[row++] = sum1;
4084c1414c8SBarry Smith       break;
4094c1414c8SBarry Smith     case 2:
41075567043SBarry Smith       sum1 = 0.;
41175567043SBarry Smith       sum2 = 0.;
4124c1414c8SBarry Smith       v2   = v1 + n;
4134c1414c8SBarry Smith 
4144c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
4154c1414c8SBarry Smith         i1 = idx[0];
4164c1414c8SBarry Smith         i2 = idx[1];
4174c1414c8SBarry Smith         idx += 2;
4184c1414c8SBarry Smith         tmp0 = x[i1];
4194c1414c8SBarry Smith         tmp1 = x[i2];
4209371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
4219371c9d4SSatish Balay         v1 += 2;
4229371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
4239371c9d4SSatish Balay         v2 += 2;
4244c1414c8SBarry Smith       }
4254c1414c8SBarry Smith       if (n == sz - 1) {
4264c1414c8SBarry Smith         tmp0 = x[*idx++];
4274c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
4284c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
4294c1414c8SBarry Smith       }
4304c1414c8SBarry Smith       y[row++] = sum1;
4314c1414c8SBarry Smith       y[row++] = sum2;
4324c1414c8SBarry Smith       v1       = v2; /* Since the next block to be processed starts there*/
4334c1414c8SBarry Smith       idx += sz;
4344c1414c8SBarry Smith       break;
4354c1414c8SBarry Smith     case 3:
43675567043SBarry Smith       sum1 = 0.;
43775567043SBarry Smith       sum2 = 0.;
43875567043SBarry Smith       sum3 = 0.;
4394c1414c8SBarry Smith       v2   = v1 + n;
4404c1414c8SBarry Smith       v3   = v2 + n;
4414c1414c8SBarry Smith 
4424c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
4434c1414c8SBarry Smith         i1 = idx[0];
4444c1414c8SBarry Smith         i2 = idx[1];
4454c1414c8SBarry Smith         idx += 2;
4464c1414c8SBarry Smith         tmp0 = x[i1];
4474c1414c8SBarry Smith         tmp1 = x[i2];
4489371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
4499371c9d4SSatish Balay         v1 += 2;
4509371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
4519371c9d4SSatish Balay         v2 += 2;
4529371c9d4SSatish Balay         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
4539371c9d4SSatish Balay         v3 += 2;
4544c1414c8SBarry Smith       }
4554c1414c8SBarry Smith       if (n == sz - 1) {
4564c1414c8SBarry Smith         tmp0 = x[*idx++];
4574c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
4584c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
4594c1414c8SBarry Smith         sum3 += *v3++ * tmp0;
4604c1414c8SBarry Smith       }
4614c1414c8SBarry Smith       y[row++] = sum1;
4624c1414c8SBarry Smith       y[row++] = sum2;
4634c1414c8SBarry Smith       y[row++] = sum3;
4644c1414c8SBarry Smith       v1       = v3; /* Since the next block to be processed starts there*/
4654c1414c8SBarry Smith       idx += 2 * sz;
4664c1414c8SBarry Smith       break;
4674c1414c8SBarry Smith     case 4:
46875567043SBarry Smith       sum1 = 0.;
46975567043SBarry Smith       sum2 = 0.;
47075567043SBarry Smith       sum3 = 0.;
47175567043SBarry Smith       sum4 = 0.;
4724c1414c8SBarry Smith       v2   = v1 + n;
4734c1414c8SBarry Smith       v3   = v2 + n;
4744c1414c8SBarry Smith       v4   = v3 + n;
4754c1414c8SBarry Smith 
4764c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
4774c1414c8SBarry Smith         i1 = idx[0];
4784c1414c8SBarry Smith         i2 = idx[1];
4794c1414c8SBarry Smith         idx += 2;
4804c1414c8SBarry Smith         tmp0 = x[i1];
4814c1414c8SBarry Smith         tmp1 = x[i2];
4829371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
4839371c9d4SSatish Balay         v1 += 2;
4849371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
4859371c9d4SSatish Balay         v2 += 2;
4869371c9d4SSatish Balay         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
4879371c9d4SSatish Balay         v3 += 2;
4889371c9d4SSatish Balay         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
4899371c9d4SSatish Balay         v4 += 2;
4904c1414c8SBarry Smith       }
4914c1414c8SBarry Smith       if (n == sz - 1) {
4924c1414c8SBarry Smith         tmp0 = x[*idx++];
4934c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
4944c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
4954c1414c8SBarry Smith         sum3 += *v3++ * tmp0;
4964c1414c8SBarry Smith         sum4 += *v4++ * tmp0;
4974c1414c8SBarry Smith       }
4984c1414c8SBarry Smith       y[row++] = sum1;
4994c1414c8SBarry Smith       y[row++] = sum2;
5004c1414c8SBarry Smith       y[row++] = sum3;
5014c1414c8SBarry Smith       y[row++] = sum4;
5024c1414c8SBarry Smith       v1       = v4; /* Since the next block to be processed starts there*/
5034c1414c8SBarry Smith       idx += 3 * sz;
5044c1414c8SBarry Smith       break;
5054c1414c8SBarry Smith     case 5:
50675567043SBarry Smith       sum1 = 0.;
50775567043SBarry Smith       sum2 = 0.;
50875567043SBarry Smith       sum3 = 0.;
50975567043SBarry Smith       sum4 = 0.;
51075567043SBarry Smith       sum5 = 0.;
5114c1414c8SBarry Smith       v2   = v1 + n;
5124c1414c8SBarry Smith       v3   = v2 + n;
5134c1414c8SBarry Smith       v4   = v3 + n;
5144c1414c8SBarry Smith       v5   = v4 + n;
5154c1414c8SBarry Smith 
5164c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
5174c1414c8SBarry Smith         i1 = idx[0];
5184c1414c8SBarry Smith         i2 = idx[1];
5194c1414c8SBarry Smith         idx += 2;
5204c1414c8SBarry Smith         tmp0 = x[i1];
5214c1414c8SBarry Smith         tmp1 = x[i2];
5229371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
5239371c9d4SSatish Balay         v1 += 2;
5249371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
5259371c9d4SSatish Balay         v2 += 2;
5269371c9d4SSatish Balay         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
5279371c9d4SSatish Balay         v3 += 2;
5289371c9d4SSatish Balay         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
5299371c9d4SSatish Balay         v4 += 2;
5309371c9d4SSatish Balay         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
5319371c9d4SSatish Balay         v5 += 2;
5324c1414c8SBarry Smith       }
5334c1414c8SBarry Smith       if (n == sz - 1) {
5344c1414c8SBarry Smith         tmp0 = x[*idx++];
5354c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
5364c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
5374c1414c8SBarry Smith         sum3 += *v3++ * tmp0;
5384c1414c8SBarry Smith         sum4 += *v4++ * tmp0;
5394c1414c8SBarry Smith         sum5 += *v5++ * tmp0;
5404c1414c8SBarry Smith       }
5414c1414c8SBarry Smith       y[row++] = sum1;
5424c1414c8SBarry Smith       y[row++] = sum2;
5434c1414c8SBarry Smith       y[row++] = sum3;
5444c1414c8SBarry Smith       y[row++] = sum4;
5454c1414c8SBarry Smith       y[row++] = sum5;
5464c1414c8SBarry Smith       v1       = v5; /* Since the next block to be processed starts there */
5474c1414c8SBarry Smith       idx += 4 * sz;
5484c1414c8SBarry Smith       break;
549d71ae5a4SJacob Faibussowitsch     default:
5500c335700SBarry Smith       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nsz);
5514c1414c8SBarry Smith     }
5524c1414c8SBarry Smith   }
5539566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
5549566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(yy, &y));
5559566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow));
5563ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5574c1414c8SBarry Smith }
5582ef1f0ffSBarry Smith 
5594108e4d5SBarry Smith /* Almost same code as the MatMult_SeqAIJ_Inode() */
560d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy)
561d71ae5a4SJacob Faibussowitsch {
5624c1414c8SBarry Smith   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
5634c1414c8SBarry Smith   PetscScalar        sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
5648758e1faSBarry Smith   const MatScalar   *v1, *v2, *v3, *v4, *v5;
5658758e1faSBarry Smith   const PetscScalar *x;
5668758e1faSBarry Smith   PetscScalar       *y, *z, *zt;
5678758e1faSBarry Smith   PetscInt           i1, i2, n, i, row, node_max, nsz, sz;
5688758e1faSBarry Smith   const PetscInt    *idx, *ns, *ii;
5694c1414c8SBarry Smith 
5704c1414c8SBarry Smith   PetscFunctionBegin;
571*4d12350bSJunchao Zhang   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
5724c1414c8SBarry Smith   node_max = a->inode.node_count;
573*4d12350bSJunchao Zhang   ns       = a->inode.size_csr; /* Node Size array */
5742205254eSKarl Rupp 
5759566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
5769566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(zz, yy, &z, &y));
5774c1414c8SBarry Smith   zt = z;
5784c1414c8SBarry Smith 
5794c1414c8SBarry Smith   idx = a->j;
5804c1414c8SBarry Smith   v1  = a->a;
5814c1414c8SBarry Smith   ii  = a->i;
5824c1414c8SBarry Smith 
583*4d12350bSJunchao Zhang   for (i = 0; i < node_max; ++i) {
584*4d12350bSJunchao Zhang     row = ns[i];
585*4d12350bSJunchao Zhang     nsz = ns[i + 1] - ns[i];
5864c1414c8SBarry Smith     n   = ii[1] - ii[0];
5874c1414c8SBarry Smith     ii += nsz;
5884c1414c8SBarry Smith     sz = n;        /* No of non zeros in this row */
5894c1414c8SBarry Smith                    /* Switch on the size of Node */
5904c1414c8SBarry Smith     switch (nsz) { /* Each loop in 'case' is unrolled */
5914c1414c8SBarry Smith     case 1:
5924c1414c8SBarry Smith       sum1 = *zt++;
5934c1414c8SBarry Smith 
5944c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
5954c1414c8SBarry Smith         i1 = idx[0]; /* The instructions are ordered to */
5964c1414c8SBarry Smith         i2 = idx[1]; /* make the compiler's job easy */
5974c1414c8SBarry Smith         idx += 2;
5984c1414c8SBarry Smith         tmp0 = x[i1];
5994c1414c8SBarry Smith         tmp1 = x[i2];
6009371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
6019371c9d4SSatish Balay         v1 += 2;
6024c1414c8SBarry Smith       }
6034c1414c8SBarry Smith 
6044c1414c8SBarry Smith       if (n == sz - 1) { /* Take care of the last nonzero  */
6054c1414c8SBarry Smith         tmp0 = x[*idx++];
6064c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
6074c1414c8SBarry Smith       }
6084c1414c8SBarry Smith       y[row++] = sum1;
6094c1414c8SBarry Smith       break;
6104c1414c8SBarry Smith     case 2:
6114c1414c8SBarry Smith       sum1 = *zt++;
6124c1414c8SBarry Smith       sum2 = *zt++;
6134c1414c8SBarry Smith       v2   = v1 + n;
6144c1414c8SBarry Smith 
6154c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
6164c1414c8SBarry Smith         i1 = idx[0];
6174c1414c8SBarry Smith         i2 = idx[1];
6184c1414c8SBarry Smith         idx += 2;
6194c1414c8SBarry Smith         tmp0 = x[i1];
6204c1414c8SBarry Smith         tmp1 = x[i2];
6219371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
6229371c9d4SSatish Balay         v1 += 2;
6239371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
6249371c9d4SSatish Balay         v2 += 2;
6254c1414c8SBarry Smith       }
6264c1414c8SBarry Smith       if (n == sz - 1) {
6274c1414c8SBarry Smith         tmp0 = x[*idx++];
6284c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
6294c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
6304c1414c8SBarry Smith       }
6314c1414c8SBarry Smith       y[row++] = sum1;
6324c1414c8SBarry Smith       y[row++] = sum2;
6334c1414c8SBarry Smith       v1       = v2; /* Since the next block to be processed starts there*/
6344c1414c8SBarry Smith       idx += sz;
6354c1414c8SBarry Smith       break;
6364c1414c8SBarry Smith     case 3:
6374c1414c8SBarry Smith       sum1 = *zt++;
6384c1414c8SBarry Smith       sum2 = *zt++;
6394c1414c8SBarry Smith       sum3 = *zt++;
6404c1414c8SBarry Smith       v2   = v1 + n;
6414c1414c8SBarry Smith       v3   = v2 + n;
6424c1414c8SBarry Smith 
6434c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
6444c1414c8SBarry Smith         i1 = idx[0];
6454c1414c8SBarry Smith         i2 = idx[1];
6464c1414c8SBarry Smith         idx += 2;
6474c1414c8SBarry Smith         tmp0 = x[i1];
6484c1414c8SBarry Smith         tmp1 = x[i2];
6499371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
6509371c9d4SSatish Balay         v1 += 2;
6519371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
6529371c9d4SSatish Balay         v2 += 2;
6539371c9d4SSatish Balay         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
6549371c9d4SSatish Balay         v3 += 2;
6554c1414c8SBarry Smith       }
6564c1414c8SBarry Smith       if (n == sz - 1) {
6574c1414c8SBarry Smith         tmp0 = x[*idx++];
6584c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
6594c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
6604c1414c8SBarry Smith         sum3 += *v3++ * tmp0;
6614c1414c8SBarry Smith       }
6624c1414c8SBarry Smith       y[row++] = sum1;
6634c1414c8SBarry Smith       y[row++] = sum2;
6644c1414c8SBarry Smith       y[row++] = sum3;
6654c1414c8SBarry Smith       v1       = v3; /* Since the next block to be processed starts there*/
6664c1414c8SBarry Smith       idx += 2 * sz;
6674c1414c8SBarry Smith       break;
6684c1414c8SBarry Smith     case 4:
6694c1414c8SBarry Smith       sum1 = *zt++;
6704c1414c8SBarry Smith       sum2 = *zt++;
6714c1414c8SBarry Smith       sum3 = *zt++;
6724c1414c8SBarry Smith       sum4 = *zt++;
6734c1414c8SBarry Smith       v2   = v1 + n;
6744c1414c8SBarry Smith       v3   = v2 + n;
6754c1414c8SBarry Smith       v4   = v3 + n;
6764c1414c8SBarry Smith 
6774c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
6784c1414c8SBarry Smith         i1 = idx[0];
6794c1414c8SBarry Smith         i2 = idx[1];
6804c1414c8SBarry Smith         idx += 2;
6814c1414c8SBarry Smith         tmp0 = x[i1];
6824c1414c8SBarry Smith         tmp1 = x[i2];
6839371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
6849371c9d4SSatish Balay         v1 += 2;
6859371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
6869371c9d4SSatish Balay         v2 += 2;
6879371c9d4SSatish Balay         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
6889371c9d4SSatish Balay         v3 += 2;
6899371c9d4SSatish Balay         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
6909371c9d4SSatish Balay         v4 += 2;
6914c1414c8SBarry Smith       }
6924c1414c8SBarry Smith       if (n == sz - 1) {
6934c1414c8SBarry Smith         tmp0 = x[*idx++];
6944c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
6954c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
6964c1414c8SBarry Smith         sum3 += *v3++ * tmp0;
6974c1414c8SBarry Smith         sum4 += *v4++ * tmp0;
6984c1414c8SBarry Smith       }
6994c1414c8SBarry Smith       y[row++] = sum1;
7004c1414c8SBarry Smith       y[row++] = sum2;
7014c1414c8SBarry Smith       y[row++] = sum3;
7024c1414c8SBarry Smith       y[row++] = sum4;
7034c1414c8SBarry Smith       v1       = v4; /* Since the next block to be processed starts there*/
7044c1414c8SBarry Smith       idx += 3 * sz;
7054c1414c8SBarry Smith       break;
7064c1414c8SBarry Smith     case 5:
7074c1414c8SBarry Smith       sum1 = *zt++;
7084c1414c8SBarry Smith       sum2 = *zt++;
7094c1414c8SBarry Smith       sum3 = *zt++;
7104c1414c8SBarry Smith       sum4 = *zt++;
7114c1414c8SBarry Smith       sum5 = *zt++;
7124c1414c8SBarry Smith       v2   = v1 + n;
7134c1414c8SBarry Smith       v3   = v2 + n;
7144c1414c8SBarry Smith       v4   = v3 + n;
7154c1414c8SBarry Smith       v5   = v4 + n;
7164c1414c8SBarry Smith 
7174c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
7184c1414c8SBarry Smith         i1 = idx[0];
7194c1414c8SBarry Smith         i2 = idx[1];
7204c1414c8SBarry Smith         idx += 2;
7214c1414c8SBarry Smith         tmp0 = x[i1];
7224c1414c8SBarry Smith         tmp1 = x[i2];
7239371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
7249371c9d4SSatish Balay         v1 += 2;
7259371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
7269371c9d4SSatish Balay         v2 += 2;
7279371c9d4SSatish Balay         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
7289371c9d4SSatish Balay         v3 += 2;
7299371c9d4SSatish Balay         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
7309371c9d4SSatish Balay         v4 += 2;
7319371c9d4SSatish Balay         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
7329371c9d4SSatish Balay         v5 += 2;
7334c1414c8SBarry Smith       }
7344c1414c8SBarry Smith       if (n == sz - 1) {
7354c1414c8SBarry Smith         tmp0 = x[*idx++];
7364c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
7374c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
7384c1414c8SBarry Smith         sum3 += *v3++ * tmp0;
7394c1414c8SBarry Smith         sum4 += *v4++ * tmp0;
7404c1414c8SBarry Smith         sum5 += *v5++ * tmp0;
7414c1414c8SBarry Smith       }
7424c1414c8SBarry Smith       y[row++] = sum1;
7434c1414c8SBarry Smith       y[row++] = sum2;
7444c1414c8SBarry Smith       y[row++] = sum3;
7454c1414c8SBarry Smith       y[row++] = sum4;
7464c1414c8SBarry Smith       y[row++] = sum5;
7474c1414c8SBarry Smith       v1       = v5; /* Since the next block to be processed starts there */
7484c1414c8SBarry Smith       idx += 4 * sz;
7494c1414c8SBarry Smith       break;
750d71ae5a4SJacob Faibussowitsch     default:
751d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
7524c1414c8SBarry Smith     }
7534c1414c8SBarry Smith   }
7549566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
7559566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(zz, yy, &z, &y));
7569566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz));
7573ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
7584c1414c8SBarry Smith }
7594c1414c8SBarry Smith 
760ff6a9541SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx)
761d71ae5a4SJacob Faibussowitsch {
7624c1414c8SBarry Smith   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
7634c1414c8SBarry Smith   IS                 iscol = a->col, isrow = a->row;
7645d0c19d7SBarry Smith   const PetscInt    *r, *c, *rout, *cout;
7658758e1faSBarry Smith   PetscInt           i, j, n = A->rmap->n, nz;
7668758e1faSBarry Smith   PetscInt           node_max, *ns, row, nsz, aii, i0, i1;
7678758e1faSBarry Smith   const PetscInt    *ai = a->i, *a_j = a->j, *vi, *ad, *aj;
768d9fead3dSBarry Smith   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
769d9fead3dSBarry Smith   PetscScalar        sum1, sum2, sum3, sum4, sum5;
770dd6ea824SBarry Smith   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
771dd6ea824SBarry Smith   const PetscScalar *b;
7724c1414c8SBarry Smith 
7734c1414c8SBarry Smith   PetscFunctionBegin;
774*4d12350bSJunchao Zhang   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
7754c1414c8SBarry Smith   node_max = a->inode.node_count;
776*4d12350bSJunchao Zhang   ns       = a->inode.size_csr; /* Node Size array */
7774c1414c8SBarry Smith 
7789566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
7799566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(xx, &x));
7804c1414c8SBarry Smith   tmp = a->solve_work;
7814c1414c8SBarry Smith 
7829371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
7839371c9d4SSatish Balay   r = rout;
7849371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
7859371c9d4SSatish Balay   c = cout + (n - 1);
7864c1414c8SBarry Smith 
7874c1414c8SBarry Smith   /* forward solve the lower triangular */
7884c1414c8SBarry Smith   tmps = tmp;
7894c1414c8SBarry Smith   aa   = a_a;
7904c1414c8SBarry Smith   aj   = a_j;
7914c1414c8SBarry Smith   ad   = a->diag;
7924c1414c8SBarry Smith 
7934c1414c8SBarry Smith   for (i = 0, row = 0; i < node_max; ++i) {
794*4d12350bSJunchao Zhang     row = ns[i];
795*4d12350bSJunchao Zhang     nsz = ns[i + 1] - ns[i];
7964c1414c8SBarry Smith     aii = ai[row];
7974c1414c8SBarry Smith     v1  = aa + aii;
7984c1414c8SBarry Smith     vi  = aj + aii;
7994c1414c8SBarry Smith     nz  = ad[row] - aii;
80026549573SJed Brown     if (i < node_max - 1) {
80126549573SJed Brown       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
80291c35059SPierre Jolivet       * but our indexing to determine its size could. */
80350d8bf02SJed Brown       PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
80426549573SJed Brown       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
805*4d12350bSJunchao Zhang       PetscPrefetchBlock(aa + ai[row + nsz], ad[ns[i + 2] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
80626549573SJed Brown       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
80726549573SJed Brown     }
8084c1414c8SBarry Smith 
8094c1414c8SBarry Smith     switch (nsz) { /* Each loop in 'case' is unrolled */
8104c1414c8SBarry Smith     case 1:
8114c1414c8SBarry Smith       sum1 = b[*r++];
8124c1414c8SBarry Smith       for (j = 0; j < nz - 1; j += 2) {
8134c1414c8SBarry Smith         i0 = vi[0];
8144c1414c8SBarry Smith         i1 = vi[1];
8154c1414c8SBarry Smith         vi += 2;
8164c1414c8SBarry Smith         tmp0 = tmps[i0];
8174c1414c8SBarry Smith         tmp1 = tmps[i1];
8189371c9d4SSatish Balay         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
8199371c9d4SSatish Balay         v1 += 2;
8204c1414c8SBarry Smith       }
8214c1414c8SBarry Smith       if (j == nz - 1) {
8224c1414c8SBarry Smith         tmp0 = tmps[*vi++];
8234c1414c8SBarry Smith         sum1 -= *v1++ * tmp0;
8244c1414c8SBarry Smith       }
8254c1414c8SBarry Smith       tmp[row++] = sum1;
8264c1414c8SBarry Smith       break;
8274c1414c8SBarry Smith     case 2:
8284c1414c8SBarry Smith       sum1 = b[*r++];
8294c1414c8SBarry Smith       sum2 = b[*r++];
8304c1414c8SBarry Smith       v2   = aa + ai[row + 1];
8314c1414c8SBarry Smith 
8324c1414c8SBarry Smith       for (j = 0; j < nz - 1; j += 2) {
8334c1414c8SBarry Smith         i0 = vi[0];
8344c1414c8SBarry Smith         i1 = vi[1];
8354c1414c8SBarry Smith         vi += 2;
8364c1414c8SBarry Smith         tmp0 = tmps[i0];
8374c1414c8SBarry Smith         tmp1 = tmps[i1];
8389371c9d4SSatish Balay         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
8399371c9d4SSatish Balay         v1 += 2;
8409371c9d4SSatish Balay         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
8419371c9d4SSatish Balay         v2 += 2;
8424c1414c8SBarry Smith       }
8434c1414c8SBarry Smith       if (j == nz - 1) {
8444c1414c8SBarry Smith         tmp0 = tmps[*vi++];
8454c1414c8SBarry Smith         sum1 -= *v1++ * tmp0;
8464c1414c8SBarry Smith         sum2 -= *v2++ * tmp0;
8474c1414c8SBarry Smith       }
8484c1414c8SBarry Smith       sum2 -= *v2++ * sum1;
8494c1414c8SBarry Smith       tmp[row++] = sum1;
8504c1414c8SBarry Smith       tmp[row++] = sum2;
8514c1414c8SBarry Smith       break;
8524c1414c8SBarry Smith     case 3:
8534c1414c8SBarry Smith       sum1 = b[*r++];
8544c1414c8SBarry Smith       sum2 = b[*r++];
8554c1414c8SBarry Smith       sum3 = b[*r++];
8564c1414c8SBarry Smith       v2   = aa + ai[row + 1];
8574c1414c8SBarry Smith       v3   = aa + ai[row + 2];
8584c1414c8SBarry Smith 
8594c1414c8SBarry Smith       for (j = 0; j < nz - 1; j += 2) {
8604c1414c8SBarry Smith         i0 = vi[0];
8614c1414c8SBarry Smith         i1 = vi[1];
8624c1414c8SBarry Smith         vi += 2;
8634c1414c8SBarry Smith         tmp0 = tmps[i0];
8644c1414c8SBarry Smith         tmp1 = tmps[i1];
8659371c9d4SSatish Balay         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
8669371c9d4SSatish Balay         v1 += 2;
8679371c9d4SSatish Balay         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
8689371c9d4SSatish Balay         v2 += 2;
8699371c9d4SSatish Balay         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
8709371c9d4SSatish Balay         v3 += 2;
8714c1414c8SBarry Smith       }
8724c1414c8SBarry Smith       if (j == nz - 1) {
8734c1414c8SBarry Smith         tmp0 = tmps[*vi++];
8744c1414c8SBarry Smith         sum1 -= *v1++ * tmp0;
8754c1414c8SBarry Smith         sum2 -= *v2++ * tmp0;
8764c1414c8SBarry Smith         sum3 -= *v3++ * tmp0;
8774c1414c8SBarry Smith       }
8784c1414c8SBarry Smith       sum2 -= *v2++ * sum1;
8794c1414c8SBarry Smith       sum3 -= *v3++ * sum1;
8804c1414c8SBarry Smith       sum3 -= *v3++ * sum2;
8812205254eSKarl Rupp 
8824c1414c8SBarry Smith       tmp[row++] = sum1;
8834c1414c8SBarry Smith       tmp[row++] = sum2;
8844c1414c8SBarry Smith       tmp[row++] = sum3;
8854c1414c8SBarry Smith       break;
8864c1414c8SBarry Smith 
8874c1414c8SBarry Smith     case 4:
8884c1414c8SBarry Smith       sum1 = b[*r++];
8894c1414c8SBarry Smith       sum2 = b[*r++];
8904c1414c8SBarry Smith       sum3 = b[*r++];
8914c1414c8SBarry Smith       sum4 = b[*r++];
8924c1414c8SBarry Smith       v2   = aa + ai[row + 1];
8934c1414c8SBarry Smith       v3   = aa + ai[row + 2];
8944c1414c8SBarry Smith       v4   = aa + ai[row + 3];
8954c1414c8SBarry Smith 
8964c1414c8SBarry Smith       for (j = 0; j < nz - 1; j += 2) {
8974c1414c8SBarry Smith         i0 = vi[0];
8984c1414c8SBarry Smith         i1 = vi[1];
8994c1414c8SBarry Smith         vi += 2;
9004c1414c8SBarry Smith         tmp0 = tmps[i0];
9014c1414c8SBarry Smith         tmp1 = tmps[i1];
9029371c9d4SSatish Balay         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
9039371c9d4SSatish Balay         v1 += 2;
9049371c9d4SSatish Balay         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
9059371c9d4SSatish Balay         v2 += 2;
9069371c9d4SSatish Balay         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
9079371c9d4SSatish Balay         v3 += 2;
9089371c9d4SSatish Balay         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
9099371c9d4SSatish Balay         v4 += 2;
9104c1414c8SBarry Smith       }
9114c1414c8SBarry Smith       if (j == nz - 1) {
9124c1414c8SBarry Smith         tmp0 = tmps[*vi++];
9134c1414c8SBarry Smith         sum1 -= *v1++ * tmp0;
9144c1414c8SBarry Smith         sum2 -= *v2++ * tmp0;
9154c1414c8SBarry Smith         sum3 -= *v3++ * tmp0;
9164c1414c8SBarry Smith         sum4 -= *v4++ * tmp0;
9174c1414c8SBarry Smith       }
9184c1414c8SBarry Smith       sum2 -= *v2++ * sum1;
9194c1414c8SBarry Smith       sum3 -= *v3++ * sum1;
9204c1414c8SBarry Smith       sum4 -= *v4++ * sum1;
9214c1414c8SBarry Smith       sum3 -= *v3++ * sum2;
9224c1414c8SBarry Smith       sum4 -= *v4++ * sum2;
9234c1414c8SBarry Smith       sum4 -= *v4++ * sum3;
9244c1414c8SBarry Smith 
9254c1414c8SBarry Smith       tmp[row++] = sum1;
9264c1414c8SBarry Smith       tmp[row++] = sum2;
9274c1414c8SBarry Smith       tmp[row++] = sum3;
9284c1414c8SBarry Smith       tmp[row++] = sum4;
9294c1414c8SBarry Smith       break;
9304c1414c8SBarry Smith     case 5:
9314c1414c8SBarry Smith       sum1 = b[*r++];
9324c1414c8SBarry Smith       sum2 = b[*r++];
9334c1414c8SBarry Smith       sum3 = b[*r++];
9344c1414c8SBarry Smith       sum4 = b[*r++];
9354c1414c8SBarry Smith       sum5 = b[*r++];
9364c1414c8SBarry Smith       v2   = aa + ai[row + 1];
9374c1414c8SBarry Smith       v3   = aa + ai[row + 2];
9384c1414c8SBarry Smith       v4   = aa + ai[row + 3];
9394c1414c8SBarry Smith       v5   = aa + ai[row + 4];
9404c1414c8SBarry Smith 
9414c1414c8SBarry Smith       for (j = 0; j < nz - 1; j += 2) {
9424c1414c8SBarry Smith         i0 = vi[0];
9434c1414c8SBarry Smith         i1 = vi[1];
9444c1414c8SBarry Smith         vi += 2;
9454c1414c8SBarry Smith         tmp0 = tmps[i0];
9464c1414c8SBarry Smith         tmp1 = tmps[i1];
9479371c9d4SSatish Balay         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
9489371c9d4SSatish Balay         v1 += 2;
9499371c9d4SSatish Balay         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
9509371c9d4SSatish Balay         v2 += 2;
9519371c9d4SSatish Balay         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
9529371c9d4SSatish Balay         v3 += 2;
9539371c9d4SSatish Balay         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
9549371c9d4SSatish Balay         v4 += 2;
9559371c9d4SSatish Balay         sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
9569371c9d4SSatish Balay         v5 += 2;
9574c1414c8SBarry Smith       }
9584c1414c8SBarry Smith       if (j == nz - 1) {
9594c1414c8SBarry Smith         tmp0 = tmps[*vi++];
9604c1414c8SBarry Smith         sum1 -= *v1++ * tmp0;
9614c1414c8SBarry Smith         sum2 -= *v2++ * tmp0;
9624c1414c8SBarry Smith         sum3 -= *v3++ * tmp0;
9634c1414c8SBarry Smith         sum4 -= *v4++ * tmp0;
9644c1414c8SBarry Smith         sum5 -= *v5++ * tmp0;
9654c1414c8SBarry Smith       }
9664c1414c8SBarry Smith 
9674c1414c8SBarry Smith       sum2 -= *v2++ * sum1;
9684c1414c8SBarry Smith       sum3 -= *v3++ * sum1;
9694c1414c8SBarry Smith       sum4 -= *v4++ * sum1;
9704c1414c8SBarry Smith       sum5 -= *v5++ * sum1;
9714c1414c8SBarry Smith       sum3 -= *v3++ * sum2;
9724c1414c8SBarry Smith       sum4 -= *v4++ * sum2;
9734c1414c8SBarry Smith       sum5 -= *v5++ * sum2;
9744c1414c8SBarry Smith       sum4 -= *v4++ * sum3;
9754c1414c8SBarry Smith       sum5 -= *v5++ * sum3;
9764c1414c8SBarry Smith       sum5 -= *v5++ * sum4;
9774c1414c8SBarry Smith 
9784c1414c8SBarry Smith       tmp[row++] = sum1;
9794c1414c8SBarry Smith       tmp[row++] = sum2;
9804c1414c8SBarry Smith       tmp[row++] = sum3;
9814c1414c8SBarry Smith       tmp[row++] = sum4;
9824c1414c8SBarry Smith       tmp[row++] = sum5;
9834c1414c8SBarry Smith       break;
984d71ae5a4SJacob Faibussowitsch     default:
985d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
9864c1414c8SBarry Smith     }
9874c1414c8SBarry Smith   }
9884c1414c8SBarry Smith   /* backward solve the upper triangular */
989*4d12350bSJunchao Zhang   for (i = node_max - 1; i >= 0; i--) {
990*4d12350bSJunchao Zhang     row = ns[i + 1];
991*4d12350bSJunchao Zhang     nsz = ns[i + 1] - ns[i];
9924c1414c8SBarry Smith     aii = ai[row + 1] - 1;
9934c1414c8SBarry Smith     v1  = aa + aii;
9944c1414c8SBarry Smith     vi  = aj + aii;
9954c1414c8SBarry Smith     nz  = aii - ad[row];
9964c1414c8SBarry Smith     switch (nsz) { /* Each loop in 'case' is unrolled */
9974c1414c8SBarry Smith     case 1:
9984c1414c8SBarry Smith       sum1 = tmp[row];
9994c1414c8SBarry Smith 
10004c1414c8SBarry Smith       for (j = nz; j > 1; j -= 2) {
10014c1414c8SBarry Smith         vi -= 2;
10024c1414c8SBarry Smith         i0   = vi[2];
10034c1414c8SBarry Smith         i1   = vi[1];
10044c1414c8SBarry Smith         tmp0 = tmps[i0];
10054c1414c8SBarry Smith         tmp1 = tmps[i1];
10064c1414c8SBarry Smith         v1 -= 2;
10074c1414c8SBarry Smith         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
10084c1414c8SBarry Smith       }
10094c1414c8SBarry Smith       if (j == 1) {
10104c1414c8SBarry Smith         tmp0 = tmps[*vi--];
10114c1414c8SBarry Smith         sum1 -= *v1-- * tmp0;
10124c1414c8SBarry Smith       }
10139371c9d4SSatish Balay       x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
10149371c9d4SSatish Balay       row--;
10154c1414c8SBarry Smith       break;
10164c1414c8SBarry Smith     case 2:
10174c1414c8SBarry Smith       sum1 = tmp[row];
10184c1414c8SBarry Smith       sum2 = tmp[row - 1];
10194c1414c8SBarry Smith       v2   = aa + ai[row] - 1;
10204c1414c8SBarry Smith       for (j = nz; j > 1; j -= 2) {
10214c1414c8SBarry Smith         vi -= 2;
10224c1414c8SBarry Smith         i0   = vi[2];
10234c1414c8SBarry Smith         i1   = vi[1];
10244c1414c8SBarry Smith         tmp0 = tmps[i0];
10254c1414c8SBarry Smith         tmp1 = tmps[i1];
10264c1414c8SBarry Smith         v1 -= 2;
10274c1414c8SBarry Smith         v2 -= 2;
10284c1414c8SBarry Smith         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
10294c1414c8SBarry Smith         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
10304c1414c8SBarry Smith       }
10314c1414c8SBarry Smith       if (j == 1) {
10324c1414c8SBarry Smith         tmp0 = tmps[*vi--];
10334c1414c8SBarry Smith         sum1 -= *v1-- * tmp0;
10344c1414c8SBarry Smith         sum2 -= *v2-- * tmp0;
10354c1414c8SBarry Smith       }
10364c1414c8SBarry Smith 
10379371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
10389371c9d4SSatish Balay       row--;
10394c1414c8SBarry Smith       sum2 -= *v2-- * tmp0;
10409371c9d4SSatish Balay       x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
10419371c9d4SSatish Balay       row--;
10424c1414c8SBarry Smith       break;
10434c1414c8SBarry Smith     case 3:
10444c1414c8SBarry Smith       sum1 = tmp[row];
10454c1414c8SBarry Smith       sum2 = tmp[row - 1];
10464c1414c8SBarry Smith       sum3 = tmp[row - 2];
10474c1414c8SBarry Smith       v2   = aa + ai[row] - 1;
10484c1414c8SBarry Smith       v3   = aa + ai[row - 1] - 1;
10494c1414c8SBarry Smith       for (j = nz; j > 1; j -= 2) {
10504c1414c8SBarry Smith         vi -= 2;
10514c1414c8SBarry Smith         i0   = vi[2];
10524c1414c8SBarry Smith         i1   = vi[1];
10534c1414c8SBarry Smith         tmp0 = tmps[i0];
10544c1414c8SBarry Smith         tmp1 = tmps[i1];
10554c1414c8SBarry Smith         v1 -= 2;
10564c1414c8SBarry Smith         v2 -= 2;
10574c1414c8SBarry Smith         v3 -= 2;
10584c1414c8SBarry Smith         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
10594c1414c8SBarry Smith         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
10604c1414c8SBarry Smith         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
10614c1414c8SBarry Smith       }
10624c1414c8SBarry Smith       if (j == 1) {
10634c1414c8SBarry Smith         tmp0 = tmps[*vi--];
10644c1414c8SBarry Smith         sum1 -= *v1-- * tmp0;
10654c1414c8SBarry Smith         sum2 -= *v2-- * tmp0;
10664c1414c8SBarry Smith         sum3 -= *v3-- * tmp0;
10674c1414c8SBarry Smith       }
10689371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
10699371c9d4SSatish Balay       row--;
10704c1414c8SBarry Smith       sum2 -= *v2-- * tmp0;
10714c1414c8SBarry Smith       sum3 -= *v3-- * tmp0;
10729371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
10739371c9d4SSatish Balay       row--;
10744c1414c8SBarry Smith       sum3 -= *v3-- * tmp0;
10759371c9d4SSatish Balay       x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
10769371c9d4SSatish Balay       row--;
10774c1414c8SBarry Smith 
10784c1414c8SBarry Smith       break;
10794c1414c8SBarry Smith     case 4:
10804c1414c8SBarry Smith       sum1 = tmp[row];
10814c1414c8SBarry Smith       sum2 = tmp[row - 1];
10824c1414c8SBarry Smith       sum3 = tmp[row - 2];
10834c1414c8SBarry Smith       sum4 = tmp[row - 3];
10844c1414c8SBarry Smith       v2   = aa + ai[row] - 1;
10854c1414c8SBarry Smith       v3   = aa + ai[row - 1] - 1;
10864c1414c8SBarry Smith       v4   = aa + ai[row - 2] - 1;
10874c1414c8SBarry Smith 
10884c1414c8SBarry Smith       for (j = nz; j > 1; j -= 2) {
10894c1414c8SBarry Smith         vi -= 2;
10904c1414c8SBarry Smith         i0   = vi[2];
10914c1414c8SBarry Smith         i1   = vi[1];
10924c1414c8SBarry Smith         tmp0 = tmps[i0];
10934c1414c8SBarry Smith         tmp1 = tmps[i1];
10944c1414c8SBarry Smith         v1 -= 2;
10954c1414c8SBarry Smith         v2 -= 2;
10964c1414c8SBarry Smith         v3 -= 2;
10974c1414c8SBarry Smith         v4 -= 2;
10984c1414c8SBarry Smith         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
10994c1414c8SBarry Smith         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
11004c1414c8SBarry Smith         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
11014c1414c8SBarry Smith         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
11024c1414c8SBarry Smith       }
11034c1414c8SBarry Smith       if (j == 1) {
11044c1414c8SBarry Smith         tmp0 = tmps[*vi--];
11054c1414c8SBarry Smith         sum1 -= *v1-- * tmp0;
11064c1414c8SBarry Smith         sum2 -= *v2-- * tmp0;
11074c1414c8SBarry Smith         sum3 -= *v3-- * tmp0;
11084c1414c8SBarry Smith         sum4 -= *v4-- * tmp0;
11094c1414c8SBarry Smith       }
11104c1414c8SBarry Smith 
11119371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
11129371c9d4SSatish Balay       row--;
11134c1414c8SBarry Smith       sum2 -= *v2-- * tmp0;
11144c1414c8SBarry Smith       sum3 -= *v3-- * tmp0;
11154c1414c8SBarry Smith       sum4 -= *v4-- * tmp0;
11169371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
11179371c9d4SSatish Balay       row--;
11184c1414c8SBarry Smith       sum3 -= *v3-- * tmp0;
11194c1414c8SBarry Smith       sum4 -= *v4-- * tmp0;
11209371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
11219371c9d4SSatish Balay       row--;
11224c1414c8SBarry Smith       sum4 -= *v4-- * tmp0;
11239371c9d4SSatish Balay       x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
11249371c9d4SSatish Balay       row--;
11254c1414c8SBarry Smith       break;
11264c1414c8SBarry Smith     case 5:
11274c1414c8SBarry Smith       sum1 = tmp[row];
11284c1414c8SBarry Smith       sum2 = tmp[row - 1];
11294c1414c8SBarry Smith       sum3 = tmp[row - 2];
11304c1414c8SBarry Smith       sum4 = tmp[row - 3];
11314c1414c8SBarry Smith       sum5 = tmp[row - 4];
11324c1414c8SBarry Smith       v2   = aa + ai[row] - 1;
11334c1414c8SBarry Smith       v3   = aa + ai[row - 1] - 1;
11344c1414c8SBarry Smith       v4   = aa + ai[row - 2] - 1;
11354c1414c8SBarry Smith       v5   = aa + ai[row - 3] - 1;
11364c1414c8SBarry Smith       for (j = nz; j > 1; j -= 2) {
11374c1414c8SBarry Smith         vi -= 2;
11384c1414c8SBarry Smith         i0   = vi[2];
11394c1414c8SBarry Smith         i1   = vi[1];
11404c1414c8SBarry Smith         tmp0 = tmps[i0];
11414c1414c8SBarry Smith         tmp1 = tmps[i1];
11424c1414c8SBarry Smith         v1 -= 2;
11434c1414c8SBarry Smith         v2 -= 2;
11444c1414c8SBarry Smith         v3 -= 2;
11454c1414c8SBarry Smith         v4 -= 2;
11464c1414c8SBarry Smith         v5 -= 2;
11474c1414c8SBarry Smith         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
11484c1414c8SBarry Smith         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
11494c1414c8SBarry Smith         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
11504c1414c8SBarry Smith         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
11514c1414c8SBarry Smith         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
11524c1414c8SBarry Smith       }
11534c1414c8SBarry Smith       if (j == 1) {
11544c1414c8SBarry Smith         tmp0 = tmps[*vi--];
11554c1414c8SBarry Smith         sum1 -= *v1-- * tmp0;
11564c1414c8SBarry Smith         sum2 -= *v2-- * tmp0;
11574c1414c8SBarry Smith         sum3 -= *v3-- * tmp0;
11584c1414c8SBarry Smith         sum4 -= *v4-- * tmp0;
11594c1414c8SBarry Smith         sum5 -= *v5-- * tmp0;
11604c1414c8SBarry Smith       }
11614c1414c8SBarry Smith 
11629371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
11639371c9d4SSatish Balay       row--;
11644c1414c8SBarry Smith       sum2 -= *v2-- * tmp0;
11654c1414c8SBarry Smith       sum3 -= *v3-- * tmp0;
11664c1414c8SBarry Smith       sum4 -= *v4-- * tmp0;
11674c1414c8SBarry Smith       sum5 -= *v5-- * tmp0;
11689371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
11699371c9d4SSatish Balay       row--;
11704c1414c8SBarry Smith       sum3 -= *v3-- * tmp0;
11714c1414c8SBarry Smith       sum4 -= *v4-- * tmp0;
11724c1414c8SBarry Smith       sum5 -= *v5-- * tmp0;
11739371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
11749371c9d4SSatish Balay       row--;
11754c1414c8SBarry Smith       sum4 -= *v4-- * tmp0;
11764c1414c8SBarry Smith       sum5 -= *v5-- * tmp0;
11779371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
11789371c9d4SSatish Balay       row--;
11794c1414c8SBarry Smith       sum5 -= *v5-- * tmp0;
11809371c9d4SSatish Balay       x[*c--] = tmp[row] = sum5 * a_a[ad[row]];
11819371c9d4SSatish Balay       row--;
11824c1414c8SBarry Smith       break;
1183d71ae5a4SJacob Faibussowitsch     default:
1184d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
11854c1414c8SBarry Smith     }
11864c1414c8SBarry Smith   }
11879566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
11889566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
11899566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
11909566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(xx, &x));
11919566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
11923ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
11934c1414c8SBarry Smith }
11944c1414c8SBarry Smith 
1195d71ae5a4SJacob Faibussowitsch PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info)
1196d71ae5a4SJacob Faibussowitsch {
119728f1b45aSHong Zhang   Mat              C = B;
119828f1b45aSHong Zhang   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
119928f1b45aSHong Zhang   IS               isrow = b->row, isicol = b->icol;
120028f1b45aSHong Zhang   const PetscInt  *r, *ic, *ics;
120128f1b45aSHong Zhang   const PetscInt   n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
120228f1b45aSHong Zhang   PetscInt         i, j, k, nz, nzL, row, *pj;
120328f1b45aSHong Zhang   const PetscInt  *ajtmp, *bjtmp;
12049877982aSShri Abhyankar   MatScalar       *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4;
12059877982aSShri Abhyankar   const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4;
120628f1b45aSHong Zhang   FactorShiftCtx   sctx;
12074f81c4b7SBarry Smith   const PetscInt  *ddiag;
120828f1b45aSHong Zhang   PetscReal        rs;
120928f1b45aSHong Zhang   MatScalar        d;
12104f81c4b7SBarry Smith   PetscInt         inod, nodesz, node_max, col;
12114f81c4b7SBarry Smith   const PetscInt  *ns;
121207b50cabSHong Zhang   PetscInt        *tmp_vec1, *tmp_vec2, *nsmap;
12130e95ead3SHong Zhang 
121428f1b45aSHong Zhang   PetscFunctionBegin;
121528f1b45aSHong Zhang   /* MatPivotSetUp(): initialize shift context sctx */
12169566063dSJacob Faibussowitsch   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
121728f1b45aSHong Zhang 
1218f4db908eSBarry Smith   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
121928f1b45aSHong Zhang     ddiag          = a->diag;
122028f1b45aSHong Zhang     sctx.shift_top = info->zeropivot;
122128f1b45aSHong Zhang     for (i = 0; i < n; i++) {
122228f1b45aSHong Zhang       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
122328f1b45aSHong Zhang       d  = (aa)[ddiag[i]];
122428f1b45aSHong Zhang       rs = -PetscAbsScalar(d) - PetscRealPart(d);
122528f1b45aSHong Zhang       v  = aa + ai[i];
122628f1b45aSHong Zhang       nz = ai[i + 1] - ai[i];
12272205254eSKarl Rupp       for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
122828f1b45aSHong Zhang       if (rs > sctx.shift_top) sctx.shift_top = rs;
122928f1b45aSHong Zhang     }
123028f1b45aSHong Zhang     sctx.shift_top *= 1.1;
123128f1b45aSHong Zhang     sctx.nshift_max = 5;
123228f1b45aSHong Zhang     sctx.shift_lo   = 0.;
123328f1b45aSHong Zhang     sctx.shift_hi   = 1.;
123428f1b45aSHong Zhang   }
123528f1b45aSHong Zhang 
12369566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow, &r));
12379566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isicol, &ic));
123868785679SHong Zhang 
12399566063dSJacob Faibussowitsch   PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4));
124028f1b45aSHong Zhang   ics = ic;
124128f1b45aSHong Zhang 
124228f1b45aSHong Zhang   node_max = a->inode.node_count;
1243*4d12350bSJunchao Zhang   ns       = a->inode.size_csr;
124428b400f6SJacob Faibussowitsch   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
124528f1b45aSHong Zhang 
12469877982aSShri Abhyankar   /* If max inode size > 4, split it into two inodes.*/
124768785679SHong Zhang   /* also map the inode sizes according to the ordering */
12489566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
124968785679SHong Zhang   for (i = 0, j = 0; i < node_max; ++i, ++j) {
1250*4d12350bSJunchao Zhang     nodesz = ns[i + 1] - ns[i];
1251*4d12350bSJunchao Zhang     if (nodesz > 4) {
1252048b5e81SShri Abhyankar       tmp_vec1[j] = 4;
125368785679SHong Zhang       ++j;
1254*4d12350bSJunchao Zhang       tmp_vec1[j] = nodesz - tmp_vec1[j - 1];
125568785679SHong Zhang     } else {
1256*4d12350bSJunchao Zhang       tmp_vec1[j] = nodesz;
125768785679SHong Zhang     }
125868785679SHong Zhang   }
125968785679SHong Zhang   /* Use the correct node_max */
126068785679SHong Zhang   node_max = j;
126168785679SHong Zhang 
126268785679SHong Zhang   /* Now reorder the inode info based on mat re-ordering info */
126368785679SHong Zhang   /* First create a row -> inode_size_array_index map */
12649566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(n + 1, &nsmap));
12659566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2));
1266*4d12350bSJunchao Zhang   tmp_vec2[0] = 0;
126768785679SHong Zhang   for (i = 0, row = 0; i < node_max; i++) {
126868785679SHong Zhang     nodesz = tmp_vec1[i];
1269ad540459SPierre Jolivet     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
127068785679SHong Zhang   }
127168785679SHong Zhang   /* Using nsmap, create a reordered ns structure */
127268785679SHong Zhang   for (i = 0, j = 0; i < node_max; i++) {
127368785679SHong Zhang     nodesz          = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
1274*4d12350bSJunchao Zhang     tmp_vec2[i + 1] = tmp_vec2[i] + nodesz;
127568785679SHong Zhang     j += nodesz;
127668785679SHong Zhang   }
12779566063dSJacob Faibussowitsch   PetscCall(PetscFree(nsmap));
12789566063dSJacob Faibussowitsch   PetscCall(PetscFree(tmp_vec1));
1279b89f182dSHong Zhang 
128068785679SHong Zhang   /* Now use the correct ns */
128168785679SHong Zhang   ns = tmp_vec2;
128268785679SHong Zhang 
128328f1b45aSHong Zhang   do {
128407b50cabSHong Zhang     sctx.newshift = PETSC_FALSE;
128528f1b45aSHong Zhang     /* Now loop over each block-row, and do the factorization */
128628f1b45aSHong Zhang     for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */
1287*4d12350bSJunchao Zhang       nodesz = ns[inod + 1] - ns[inod];
128828f1b45aSHong Zhang 
128928f1b45aSHong Zhang       switch (nodesz) {
129028f1b45aSHong Zhang       case 1:
1291b89f182dSHong Zhang         /* zero rtmp1 */
129228f1b45aSHong Zhang         /* L part */
129328f1b45aSHong Zhang         nz    = bi[i + 1] - bi[i];
129428f1b45aSHong Zhang         bjtmp = bj + bi[i];
1295b89f182dSHong Zhang         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
129628f1b45aSHong Zhang 
129728f1b45aSHong Zhang         /* U part */
129828f1b45aSHong Zhang         nz    = bdiag[i] - bdiag[i + 1];
129928f1b45aSHong Zhang         bjtmp = bj + bdiag[i + 1] + 1;
1300b89f182dSHong Zhang         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
130128f1b45aSHong Zhang 
130228f1b45aSHong Zhang         /* load in initial (unfactored row) */
130328f1b45aSHong Zhang         nz    = ai[r[i] + 1] - ai[r[i]];
130428f1b45aSHong Zhang         ajtmp = aj + ai[r[i]];
130528f1b45aSHong Zhang         v     = aa + ai[r[i]];
13062205254eSKarl Rupp         for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
13072205254eSKarl Rupp 
130828f1b45aSHong Zhang         /* ZeropivotApply() */
1309b89f182dSHong Zhang         rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
131028f1b45aSHong Zhang 
131128f1b45aSHong Zhang         /* elimination */
131228f1b45aSHong Zhang         bjtmp = bj + bi[i];
131328f1b45aSHong Zhang         row   = *bjtmp++;
131428f1b45aSHong Zhang         nzL   = bi[i + 1] - bi[i];
131528f1b45aSHong Zhang         for (k = 0; k < nzL; k++) {
1316b89f182dSHong Zhang           pc = rtmp1 + row;
131728f1b45aSHong Zhang           if (*pc != 0.0) {
131828f1b45aSHong Zhang             pv   = b->a + bdiag[row];
1319b89f182dSHong Zhang             mul1 = *pc * (*pv);
1320b89f182dSHong Zhang             *pc  = mul1;
132128f1b45aSHong Zhang             pj   = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
132228f1b45aSHong Zhang             pv   = b->a + bdiag[row + 1] + 1;
132328f1b45aSHong Zhang             nz   = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1324b89f182dSHong Zhang             for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
13259566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(1 + 2.0 * nz));
132628f1b45aSHong Zhang           }
132728f1b45aSHong Zhang           row = *bjtmp++;
132828f1b45aSHong Zhang         }
132928f1b45aSHong Zhang 
133028f1b45aSHong Zhang         /* finished row so stick it into b->a */
133128f1b45aSHong Zhang         rs = 0.0;
133228f1b45aSHong Zhang         /* L part */
133328f1b45aSHong Zhang         pv = b->a + bi[i];
133428f1b45aSHong Zhang         pj = b->j + bi[i];
133528f1b45aSHong Zhang         nz = bi[i + 1] - bi[i];
133628f1b45aSHong Zhang         for (j = 0; j < nz; j++) {
13379371c9d4SSatish Balay           pv[j] = rtmp1[pj[j]];
13389371c9d4SSatish Balay           rs += PetscAbsScalar(pv[j]);
133928f1b45aSHong Zhang         }
134028f1b45aSHong Zhang 
134128f1b45aSHong Zhang         /* U part */
134228f1b45aSHong Zhang         pv = b->a + bdiag[i + 1] + 1;
134328f1b45aSHong Zhang         pj = b->j + bdiag[i + 1] + 1;
134428f1b45aSHong Zhang         nz = bdiag[i] - bdiag[i + 1] - 1;
134528f1b45aSHong Zhang         for (j = 0; j < nz; j++) {
13469371c9d4SSatish Balay           pv[j] = rtmp1[pj[j]];
13479371c9d4SSatish Balay           rs += PetscAbsScalar(pv[j]);
134828f1b45aSHong Zhang         }
134928f1b45aSHong Zhang 
1350b89f182dSHong Zhang         /* Check zero pivot */
135128f1b45aSHong Zhang         sctx.rs = rs;
1352b89f182dSHong Zhang         sctx.pv = rtmp1[i];
13539566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
135407b50cabSHong Zhang         if (sctx.newshift) break;
135528f1b45aSHong Zhang 
1356a5b23f4aSJose E. Roman         /* Mark diagonal and invert diagonal for simpler triangular solves */
135728f1b45aSHong Zhang         pv  = b->a + bdiag[i];
1358b89f182dSHong Zhang         *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
135928f1b45aSHong Zhang         break;
136028f1b45aSHong Zhang 
136128f1b45aSHong Zhang       case 2:
1362b89f182dSHong Zhang         /* zero rtmp1 and rtmp2 */
136328f1b45aSHong Zhang         /* L part */
136428f1b45aSHong Zhang         nz    = bi[i + 1] - bi[i];
136528f1b45aSHong Zhang         bjtmp = bj + bi[i];
136628f1b45aSHong Zhang         for (j = 0; j < nz; j++) {
136768785679SHong Zhang           col        = bjtmp[j];
13689371c9d4SSatish Balay           rtmp1[col] = 0.0;
13699371c9d4SSatish Balay           rtmp2[col] = 0.0;
137028f1b45aSHong Zhang         }
137128f1b45aSHong Zhang 
137228f1b45aSHong Zhang         /* U part */
137328f1b45aSHong Zhang         nz    = bdiag[i] - bdiag[i + 1];
137428f1b45aSHong Zhang         bjtmp = bj + bdiag[i + 1] + 1;
137528f1b45aSHong Zhang         for (j = 0; j < nz; j++) {
137668785679SHong Zhang           col        = bjtmp[j];
13779371c9d4SSatish Balay           rtmp1[col] = 0.0;
13789371c9d4SSatish Balay           rtmp2[col] = 0.0;
137928f1b45aSHong Zhang         }
138028f1b45aSHong Zhang 
138128f1b45aSHong Zhang         /* load in initial (unfactored row) */
138228f1b45aSHong Zhang         nz    = ai[r[i] + 1] - ai[r[i]];
138328f1b45aSHong Zhang         ajtmp = aj + ai[r[i]];
13849371c9d4SSatish Balay         v1    = aa + ai[r[i]];
13859371c9d4SSatish Balay         v2    = aa + ai[r[i] + 1];
138628f1b45aSHong Zhang         for (j = 0; j < nz; j++) {
138768785679SHong Zhang           col        = ics[ajtmp[j]];
13889371c9d4SSatish Balay           rtmp1[col] = v1[j];
13899371c9d4SSatish Balay           rtmp2[col] = v2[j];
139028f1b45aSHong Zhang         }
139128f1b45aSHong Zhang         /* ZeropivotApply(): shift the diagonal of the matrix  */
13929371c9d4SSatish Balay         rtmp1[i] += sctx.shift_amount;
13939371c9d4SSatish Balay         rtmp2[i + 1] += sctx.shift_amount;
139428f1b45aSHong Zhang 
139528f1b45aSHong Zhang         /* elimination */
139628f1b45aSHong Zhang         bjtmp = bj + bi[i];
139728f1b45aSHong Zhang         row   = *bjtmp++; /* pivot row */
139828f1b45aSHong Zhang         nzL   = bi[i + 1] - bi[i];
139928f1b45aSHong Zhang         for (k = 0; k < nzL; k++) {
1400b89f182dSHong Zhang           pc1 = rtmp1 + row;
1401b89f182dSHong Zhang           pc2 = rtmp2 + row;
140228f1b45aSHong Zhang           if (*pc1 != 0.0 || *pc2 != 0.0) {
140328f1b45aSHong Zhang             pv   = b->a + bdiag[row];
14049371c9d4SSatish Balay             mul1 = *pc1 * (*pv);
14059371c9d4SSatish Balay             mul2 = *pc2 * (*pv);
14069371c9d4SSatish Balay             *pc1 = mul1;
14079371c9d4SSatish Balay             *pc2 = mul2;
140828f1b45aSHong Zhang 
140928f1b45aSHong Zhang             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
141028f1b45aSHong Zhang             pv = b->a + bdiag[row + 1] + 1;
141128f1b45aSHong Zhang             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
141228f1b45aSHong Zhang             for (j = 0; j < nz; j++) {
141368785679SHong Zhang               col = pj[j];
1414b89f182dSHong Zhang               rtmp1[col] -= mul1 * pv[j];
1415b89f182dSHong Zhang               rtmp2[col] -= mul2 * pv[j];
141628f1b45aSHong Zhang             }
14179566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(2 + 4.0 * nz));
141828f1b45aSHong Zhang           }
141928f1b45aSHong Zhang           row = *bjtmp++;
142028f1b45aSHong Zhang         }
142128f1b45aSHong Zhang 
1422b89f182dSHong Zhang         /* finished row i; check zero pivot, then stick row i into b->a */
142328f1b45aSHong Zhang         rs = 0.0;
142428f1b45aSHong Zhang         /* L part */
1425b89f182dSHong Zhang         pc1 = b->a + bi[i];
142628f1b45aSHong Zhang         pj  = b->j + bi[i];
142728f1b45aSHong Zhang         nz  = bi[i + 1] - bi[i];
142828f1b45aSHong Zhang         for (j = 0; j < nz; j++) {
142968785679SHong Zhang           col    = pj[j];
14309371c9d4SSatish Balay           pc1[j] = rtmp1[col];
14319371c9d4SSatish Balay           rs += PetscAbsScalar(pc1[j]);
143228f1b45aSHong Zhang         }
143328f1b45aSHong Zhang         /* U part */
1434b89f182dSHong Zhang         pc1 = b->a + bdiag[i + 1] + 1;
143528f1b45aSHong Zhang         pj  = b->j + bdiag[i + 1] + 1;
14360e7a5c2bSHong Zhang         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
143728f1b45aSHong Zhang         for (j = 0; j < nz; j++) {
143868785679SHong Zhang           col    = pj[j];
14399371c9d4SSatish Balay           pc1[j] = rtmp1[col];
14409371c9d4SSatish Balay           rs += PetscAbsScalar(pc1[j]);
144128f1b45aSHong Zhang         }
144228f1b45aSHong Zhang 
144328f1b45aSHong Zhang         sctx.rs = rs;
1444b89f182dSHong Zhang         sctx.pv = rtmp1[i];
14459566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
144607b50cabSHong Zhang         if (sctx.newshift) break;
1447b89f182dSHong Zhang         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1448b89f182dSHong Zhang         *pc1 = 1.0 / sctx.pv;
1449b89f182dSHong Zhang 
1450b89f182dSHong Zhang         /* Now take care of diagonal 2x2 block. */
1451b89f182dSHong Zhang         pc2 = rtmp2 + i;
1452b89f182dSHong Zhang         if (*pc2 != 0.0) {
1453b89f182dSHong Zhang           mul1 = (*pc2) * (*pc1);             /* *pc1=diag[i] is inverted! */
1454b89f182dSHong Zhang           *pc2 = mul1;                        /* insert L entry */
1455b89f182dSHong Zhang           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1456b89f182dSHong Zhang           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1457b89f182dSHong Zhang           for (j = 0; j < nz; j++) {
14589371c9d4SSatish Balay             col = pj[j];
14599371c9d4SSatish Balay             rtmp2[col] -= mul1 * rtmp1[col];
146028f1b45aSHong Zhang           }
14619566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1462b89f182dSHong Zhang         }
1463b89f182dSHong Zhang 
1464b89f182dSHong Zhang         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1465b89f182dSHong Zhang         rs = 0.0;
1466b89f182dSHong Zhang         /* L part */
1467b89f182dSHong Zhang         pc2 = b->a + bi[i + 1];
1468b89f182dSHong Zhang         pj  = b->j + bi[i + 1];
1469b89f182dSHong Zhang         nz  = bi[i + 2] - bi[i + 1];
1470b89f182dSHong Zhang         for (j = 0; j < nz; j++) {
1471b89f182dSHong Zhang           col    = pj[j];
14729371c9d4SSatish Balay           pc2[j] = rtmp2[col];
14739371c9d4SSatish Balay           rs += PetscAbsScalar(pc2[j]);
1474b89f182dSHong Zhang         }
1475b89f182dSHong Zhang         /* U part */
1476b89f182dSHong Zhang         pc2 = b->a + bdiag[i + 2] + 1;
14770e7a5c2bSHong Zhang         pj  = b->j + bdiag[i + 2] + 1;
14780e7a5c2bSHong Zhang         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1479b89f182dSHong Zhang         for (j = 0; j < nz; j++) {
1480b89f182dSHong Zhang           col    = pj[j];
14819371c9d4SSatish Balay           pc2[j] = rtmp2[col];
14829371c9d4SSatish Balay           rs += PetscAbsScalar(pc2[j]);
1483b89f182dSHong Zhang         }
1484b89f182dSHong Zhang 
148528f1b45aSHong Zhang         sctx.rs = rs;
1486b89f182dSHong Zhang         sctx.pv = rtmp2[i + 1];
14879566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
148807b50cabSHong Zhang         if (sctx.newshift) break;
148928f1b45aSHong Zhang         pc2  = b->a + bdiag[i + 1];
1490b89f182dSHong Zhang         *pc2 = 1.0 / sctx.pv;
149128f1b45aSHong Zhang         break;
1492b89f182dSHong Zhang 
149368785679SHong Zhang       case 3:
149468785679SHong Zhang         /* zero rtmp */
149568785679SHong Zhang         /* L part */
149668785679SHong Zhang         nz    = bi[i + 1] - bi[i];
149768785679SHong Zhang         bjtmp = bj + bi[i];
149868785679SHong Zhang         for (j = 0; j < nz; j++) {
149968785679SHong Zhang           col        = bjtmp[j];
15009371c9d4SSatish Balay           rtmp1[col] = 0.0;
15019371c9d4SSatish Balay           rtmp2[col] = 0.0;
15029371c9d4SSatish Balay           rtmp3[col] = 0.0;
150368785679SHong Zhang         }
150468785679SHong Zhang 
150568785679SHong Zhang         /* U part */
150668785679SHong Zhang         nz    = bdiag[i] - bdiag[i + 1];
150768785679SHong Zhang         bjtmp = bj + bdiag[i + 1] + 1;
150868785679SHong Zhang         for (j = 0; j < nz; j++) {
150968785679SHong Zhang           col        = bjtmp[j];
15109371c9d4SSatish Balay           rtmp1[col] = 0.0;
15119371c9d4SSatish Balay           rtmp2[col] = 0.0;
15129371c9d4SSatish Balay           rtmp3[col] = 0.0;
151368785679SHong Zhang         }
151468785679SHong Zhang 
151568785679SHong Zhang         /* load in initial (unfactored row) */
151668785679SHong Zhang         nz    = ai[r[i] + 1] - ai[r[i]];
151768785679SHong Zhang         ajtmp = aj + ai[r[i]];
15189371c9d4SSatish Balay         v1    = aa + ai[r[i]];
15199371c9d4SSatish Balay         v2    = aa + ai[r[i] + 1];
15209371c9d4SSatish Balay         v3    = aa + ai[r[i] + 2];
152168785679SHong Zhang         for (j = 0; j < nz; j++) {
152268785679SHong Zhang           col        = ics[ajtmp[j]];
15239371c9d4SSatish Balay           rtmp1[col] = v1[j];
15249371c9d4SSatish Balay           rtmp2[col] = v2[j];
15259371c9d4SSatish Balay           rtmp3[col] = v3[j];
152668785679SHong Zhang         }
152768785679SHong Zhang         /* ZeropivotApply(): shift the diagonal of the matrix  */
15289371c9d4SSatish Balay         rtmp1[i] += sctx.shift_amount;
15299371c9d4SSatish Balay         rtmp2[i + 1] += sctx.shift_amount;
15309371c9d4SSatish Balay         rtmp3[i + 2] += sctx.shift_amount;
153168785679SHong Zhang 
153268785679SHong Zhang         /* elimination */
153368785679SHong Zhang         bjtmp = bj + bi[i];
153468785679SHong Zhang         row   = *bjtmp++; /* pivot row */
153568785679SHong Zhang         nzL   = bi[i + 1] - bi[i];
153668785679SHong Zhang         for (k = 0; k < nzL; k++) {
1537b89f182dSHong Zhang           pc1 = rtmp1 + row;
1538b89f182dSHong Zhang           pc2 = rtmp2 + row;
1539b89f182dSHong Zhang           pc3 = rtmp3 + row;
154068785679SHong Zhang           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
154168785679SHong Zhang             pv   = b->a + bdiag[row];
15429371c9d4SSatish Balay             mul1 = *pc1 * (*pv);
15439371c9d4SSatish Balay             mul2 = *pc2 * (*pv);
15449371c9d4SSatish Balay             mul3 = *pc3 * (*pv);
15459371c9d4SSatish Balay             *pc1 = mul1;
15469371c9d4SSatish Balay             *pc2 = mul2;
15479371c9d4SSatish Balay             *pc3 = mul3;
154868785679SHong Zhang 
154968785679SHong Zhang             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
155068785679SHong Zhang             pv = b->a + bdiag[row + 1] + 1;
155168785679SHong Zhang             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
155268785679SHong Zhang             for (j = 0; j < nz; j++) {
155368785679SHong Zhang               col = pj[j];
1554b89f182dSHong Zhang               rtmp1[col] -= mul1 * pv[j];
1555b89f182dSHong Zhang               rtmp2[col] -= mul2 * pv[j];
1556b89f182dSHong Zhang               rtmp3[col] -= mul3 * pv[j];
155768785679SHong Zhang             }
15589566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(3 + 6.0 * nz));
155968785679SHong Zhang           }
156068785679SHong Zhang           row = *bjtmp++;
156168785679SHong Zhang         }
156268785679SHong Zhang 
1563b89f182dSHong Zhang         /* finished row i; check zero pivot, then stick row i into b->a */
1564b89f182dSHong Zhang         rs = 0.0;
1565b89f182dSHong Zhang         /* L part */
1566b89f182dSHong Zhang         pc1 = b->a + bi[i];
1567b89f182dSHong Zhang         pj  = b->j + bi[i];
1568b89f182dSHong Zhang         nz  = bi[i + 1] - bi[i];
1569b89f182dSHong Zhang         for (j = 0; j < nz; j++) {
1570b89f182dSHong Zhang           col    = pj[j];
15719371c9d4SSatish Balay           pc1[j] = rtmp1[col];
15729371c9d4SSatish Balay           rs += PetscAbsScalar(pc1[j]);
1573b89f182dSHong Zhang         }
1574b89f182dSHong Zhang         /* U part */
1575b89f182dSHong Zhang         pc1 = b->a + bdiag[i + 1] + 1;
1576b89f182dSHong Zhang         pj  = b->j + bdiag[i + 1] + 1;
15770e7a5c2bSHong Zhang         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1578b89f182dSHong Zhang         for (j = 0; j < nz; j++) {
1579b89f182dSHong Zhang           col    = pj[j];
15809371c9d4SSatish Balay           pc1[j] = rtmp1[col];
15819371c9d4SSatish Balay           rs += PetscAbsScalar(pc1[j]);
1582b89f182dSHong Zhang         }
158368785679SHong Zhang 
1584b89f182dSHong Zhang         sctx.rs = rs;
1585b89f182dSHong Zhang         sctx.pv = rtmp1[i];
15869566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
158707b50cabSHong Zhang         if (sctx.newshift) break;
1588b89f182dSHong Zhang         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1589b89f182dSHong Zhang         *pc1 = 1.0 / sctx.pv;
1590b89f182dSHong Zhang 
1591b89f182dSHong Zhang         /* Now take care of 1st column of diagonal 3x3 block. */
1592b89f182dSHong Zhang         pc2 = rtmp2 + i;
1593b89f182dSHong Zhang         pc3 = rtmp3 + i;
1594b89f182dSHong Zhang         if (*pc2 != 0.0 || *pc3 != 0.0) {
15959371c9d4SSatish Balay           mul2 = (*pc2) * (*pc1);
15969371c9d4SSatish Balay           *pc2 = mul2;
15979371c9d4SSatish Balay           mul3 = (*pc3) * (*pc1);
15989371c9d4SSatish Balay           *pc3 = mul3;
159968785679SHong Zhang           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
160068785679SHong Zhang           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
160168785679SHong Zhang           for (j = 0; j < nz; j++) {
160268785679SHong Zhang             col = pj[j];
1603b89f182dSHong Zhang             rtmp2[col] -= mul2 * rtmp1[col];
1604b89f182dSHong Zhang             rtmp3[col] -= mul3 * rtmp1[col];
160568785679SHong Zhang           }
16069566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(2 + 4.0 * nz));
160768785679SHong Zhang         }
160868785679SHong Zhang 
1609b89f182dSHong Zhang         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1610b89f182dSHong Zhang         rs = 0.0;
1611b89f182dSHong Zhang         /* L part */
1612b89f182dSHong Zhang         pc2 = b->a + bi[i + 1];
1613b89f182dSHong Zhang         pj  = b->j + bi[i + 1];
1614b89f182dSHong Zhang         nz  = bi[i + 2] - bi[i + 1];
1615b89f182dSHong Zhang         for (j = 0; j < nz; j++) {
1616b89f182dSHong Zhang           col    = pj[j];
16179371c9d4SSatish Balay           pc2[j] = rtmp2[col];
16189371c9d4SSatish Balay           rs += PetscAbsScalar(pc2[j]);
1619b89f182dSHong Zhang         }
1620b89f182dSHong Zhang         /* U part */
1621b89f182dSHong Zhang         pc2 = b->a + bdiag[i + 2] + 1;
16220e7a5c2bSHong Zhang         pj  = b->j + bdiag[i + 2] + 1;
16230e7a5c2bSHong Zhang         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1624b89f182dSHong Zhang         for (j = 0; j < nz; j++) {
1625b89f182dSHong Zhang           col    = pj[j];
16269371c9d4SSatish Balay           pc2[j] = rtmp2[col];
16279371c9d4SSatish Balay           rs += PetscAbsScalar(pc2[j]);
1628b89f182dSHong Zhang         }
1629b89f182dSHong Zhang 
1630b89f182dSHong Zhang         sctx.rs = rs;
1631b89f182dSHong Zhang         sctx.pv = rtmp2[i + 1];
16329566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
163307b50cabSHong Zhang         if (sctx.newshift) break;
1634b89f182dSHong Zhang         pc2  = b->a + bdiag[i + 1];
1635b89f182dSHong Zhang         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1636b89f182dSHong Zhang 
1637b89f182dSHong Zhang         /* Now take care of 2nd column of diagonal 3x3 block. */
1638b89f182dSHong Zhang         pc3 = rtmp3 + i + 1;
163968785679SHong Zhang         if (*pc3 != 0.0) {
16409371c9d4SSatish Balay           mul3 = (*pc3) * (*pc2);
16419371c9d4SSatish Balay           *pc3 = mul3;
164268785679SHong Zhang           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
164368785679SHong Zhang           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
164468785679SHong Zhang           for (j = 0; j < nz; j++) {
164568785679SHong Zhang             col = pj[j];
1646b89f182dSHong Zhang             rtmp3[col] -= mul3 * rtmp2[col];
164768785679SHong Zhang           }
16489566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(1 + 2.0 * nz));
164968785679SHong Zhang         }
165068785679SHong Zhang 
1651b89f182dSHong Zhang         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
165268785679SHong Zhang         rs = 0.0;
165368785679SHong Zhang         /* L part */
1654b89f182dSHong Zhang         pc3 = b->a + bi[i + 2];
1655b89f182dSHong Zhang         pj  = b->j + bi[i + 2];
1656b89f182dSHong Zhang         nz  = bi[i + 3] - bi[i + 2];
165768785679SHong Zhang         for (j = 0; j < nz; j++) {
165868785679SHong Zhang           col    = pj[j];
16599371c9d4SSatish Balay           pc3[j] = rtmp3[col];
16609371c9d4SSatish Balay           rs += PetscAbsScalar(pc3[j]);
166168785679SHong Zhang         }
166268785679SHong Zhang         /* U part */
1663b89f182dSHong Zhang         pc3 = b->a + bdiag[i + 3] + 1;
16640e7a5c2bSHong Zhang         pj  = b->j + bdiag[i + 3] + 1;
16650e7a5c2bSHong Zhang         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
166668785679SHong Zhang         for (j = 0; j < nz; j++) {
166768785679SHong Zhang           col    = pj[j];
16689371c9d4SSatish Balay           pc3[j] = rtmp3[col];
16699371c9d4SSatish Balay           rs += PetscAbsScalar(pc3[j]);
167068785679SHong Zhang         }
167168785679SHong Zhang 
167268785679SHong Zhang         sctx.rs = rs;
1673b89f182dSHong Zhang         sctx.pv = rtmp3[i + 2];
16749566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
167507b50cabSHong Zhang         if (sctx.newshift) break;
167668785679SHong Zhang         pc3  = b->a + bdiag[i + 2];
1677b89f182dSHong Zhang         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
167868785679SHong Zhang         break;
16799877982aSShri Abhyankar       case 4:
16809877982aSShri Abhyankar         /* zero rtmp */
16819877982aSShri Abhyankar         /* L part */
16829877982aSShri Abhyankar         nz    = bi[i + 1] - bi[i];
16839877982aSShri Abhyankar         bjtmp = bj + bi[i];
16849877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
16859877982aSShri Abhyankar           col        = bjtmp[j];
16869371c9d4SSatish Balay           rtmp1[col] = 0.0;
16879371c9d4SSatish Balay           rtmp2[col] = 0.0;
16889371c9d4SSatish Balay           rtmp3[col] = 0.0;
16899371c9d4SSatish Balay           rtmp4[col] = 0.0;
16909877982aSShri Abhyankar         }
16919877982aSShri Abhyankar 
16929877982aSShri Abhyankar         /* U part */
16939877982aSShri Abhyankar         nz    = bdiag[i] - bdiag[i + 1];
16949877982aSShri Abhyankar         bjtmp = bj + bdiag[i + 1] + 1;
16959877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
16969877982aSShri Abhyankar           col        = bjtmp[j];
16979371c9d4SSatish Balay           rtmp1[col] = 0.0;
16989371c9d4SSatish Balay           rtmp2[col] = 0.0;
16999371c9d4SSatish Balay           rtmp3[col] = 0.0;
17009371c9d4SSatish Balay           rtmp4[col] = 0.0;
17019877982aSShri Abhyankar         }
17029877982aSShri Abhyankar 
17039877982aSShri Abhyankar         /* load in initial (unfactored row) */
17049877982aSShri Abhyankar         nz    = ai[r[i] + 1] - ai[r[i]];
17059877982aSShri Abhyankar         ajtmp = aj + ai[r[i]];
17069371c9d4SSatish Balay         v1    = aa + ai[r[i]];
17079371c9d4SSatish Balay         v2    = aa + ai[r[i] + 1];
17089371c9d4SSatish Balay         v3    = aa + ai[r[i] + 2];
17099371c9d4SSatish Balay         v4    = aa + ai[r[i] + 3];
17109877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
17119877982aSShri Abhyankar           col        = ics[ajtmp[j]];
17129371c9d4SSatish Balay           rtmp1[col] = v1[j];
17139371c9d4SSatish Balay           rtmp2[col] = v2[j];
17149371c9d4SSatish Balay           rtmp3[col] = v3[j];
17159371c9d4SSatish Balay           rtmp4[col] = v4[j];
17169877982aSShri Abhyankar         }
17179877982aSShri Abhyankar         /* ZeropivotApply(): shift the diagonal of the matrix  */
17189371c9d4SSatish Balay         rtmp1[i] += sctx.shift_amount;
17199371c9d4SSatish Balay         rtmp2[i + 1] += sctx.shift_amount;
17209371c9d4SSatish Balay         rtmp3[i + 2] += sctx.shift_amount;
17219371c9d4SSatish Balay         rtmp4[i + 3] += sctx.shift_amount;
17229877982aSShri Abhyankar 
17239877982aSShri Abhyankar         /* elimination */
17249877982aSShri Abhyankar         bjtmp = bj + bi[i];
17259877982aSShri Abhyankar         row   = *bjtmp++; /* pivot row */
17269877982aSShri Abhyankar         nzL   = bi[i + 1] - bi[i];
17279877982aSShri Abhyankar         for (k = 0; k < nzL; k++) {
17289877982aSShri Abhyankar           pc1 = rtmp1 + row;
17299877982aSShri Abhyankar           pc2 = rtmp2 + row;
17309877982aSShri Abhyankar           pc3 = rtmp3 + row;
17319877982aSShri Abhyankar           pc4 = rtmp4 + row;
17329877982aSShri Abhyankar           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
17339877982aSShri Abhyankar             pv   = b->a + bdiag[row];
17349371c9d4SSatish Balay             mul1 = *pc1 * (*pv);
17359371c9d4SSatish Balay             mul2 = *pc2 * (*pv);
17369371c9d4SSatish Balay             mul3 = *pc3 * (*pv);
17379371c9d4SSatish Balay             mul4 = *pc4 * (*pv);
17389371c9d4SSatish Balay             *pc1 = mul1;
17399371c9d4SSatish Balay             *pc2 = mul2;
17409371c9d4SSatish Balay             *pc3 = mul3;
17419371c9d4SSatish Balay             *pc4 = mul4;
17429877982aSShri Abhyankar 
17439877982aSShri Abhyankar             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
17449877982aSShri Abhyankar             pv = b->a + bdiag[row + 1] + 1;
17459877982aSShri Abhyankar             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
17469877982aSShri Abhyankar             for (j = 0; j < nz; j++) {
17479877982aSShri Abhyankar               col = pj[j];
17489877982aSShri Abhyankar               rtmp1[col] -= mul1 * pv[j];
17499877982aSShri Abhyankar               rtmp2[col] -= mul2 * pv[j];
17509877982aSShri Abhyankar               rtmp3[col] -= mul3 * pv[j];
17519877982aSShri Abhyankar               rtmp4[col] -= mul4 * pv[j];
17529877982aSShri Abhyankar             }
17539566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(4 + 8.0 * nz));
17549877982aSShri Abhyankar           }
17559877982aSShri Abhyankar           row = *bjtmp++;
17569877982aSShri Abhyankar         }
17579877982aSShri Abhyankar 
17589877982aSShri Abhyankar         /* finished row i; check zero pivot, then stick row i into b->a */
17599877982aSShri Abhyankar         rs = 0.0;
17609877982aSShri Abhyankar         /* L part */
17619877982aSShri Abhyankar         pc1 = b->a + bi[i];
17629877982aSShri Abhyankar         pj  = b->j + bi[i];
17639877982aSShri Abhyankar         nz  = bi[i + 1] - bi[i];
17649877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
17659877982aSShri Abhyankar           col    = pj[j];
17669371c9d4SSatish Balay           pc1[j] = rtmp1[col];
17679371c9d4SSatish Balay           rs += PetscAbsScalar(pc1[j]);
17689877982aSShri Abhyankar         }
17699877982aSShri Abhyankar         /* U part */
17709877982aSShri Abhyankar         pc1 = b->a + bdiag[i + 1] + 1;
17719877982aSShri Abhyankar         pj  = b->j + bdiag[i + 1] + 1;
17729877982aSShri Abhyankar         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
17739877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
17749877982aSShri Abhyankar           col    = pj[j];
17759371c9d4SSatish Balay           pc1[j] = rtmp1[col];
17769371c9d4SSatish Balay           rs += PetscAbsScalar(pc1[j]);
17779877982aSShri Abhyankar         }
17789877982aSShri Abhyankar 
17799877982aSShri Abhyankar         sctx.rs = rs;
17809877982aSShri Abhyankar         sctx.pv = rtmp1[i];
17819566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
178207b50cabSHong Zhang         if (sctx.newshift) break;
17839877982aSShri Abhyankar         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
17849877982aSShri Abhyankar         *pc1 = 1.0 / sctx.pv;
17859877982aSShri Abhyankar 
17869877982aSShri Abhyankar         /* Now take care of 1st column of diagonal 4x4 block. */
17879877982aSShri Abhyankar         pc2 = rtmp2 + i;
17889877982aSShri Abhyankar         pc3 = rtmp3 + i;
17899877982aSShri Abhyankar         pc4 = rtmp4 + i;
17909877982aSShri Abhyankar         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
17919371c9d4SSatish Balay           mul2 = (*pc2) * (*pc1);
17929371c9d4SSatish Balay           *pc2 = mul2;
17939371c9d4SSatish Balay           mul3 = (*pc3) * (*pc1);
17949371c9d4SSatish Balay           *pc3 = mul3;
17959371c9d4SSatish Balay           mul4 = (*pc4) * (*pc1);
17969371c9d4SSatish Balay           *pc4 = mul4;
17979877982aSShri Abhyankar           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
17989877982aSShri Abhyankar           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
17999877982aSShri Abhyankar           for (j = 0; j < nz; j++) {
18009877982aSShri Abhyankar             col = pj[j];
18019877982aSShri Abhyankar             rtmp2[col] -= mul2 * rtmp1[col];
18029877982aSShri Abhyankar             rtmp3[col] -= mul3 * rtmp1[col];
18039877982aSShri Abhyankar             rtmp4[col] -= mul4 * rtmp1[col];
18049877982aSShri Abhyankar           }
18059566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(3 + 6.0 * nz));
18069877982aSShri Abhyankar         }
18079877982aSShri Abhyankar 
18089877982aSShri Abhyankar         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
18099877982aSShri Abhyankar         rs = 0.0;
18109877982aSShri Abhyankar         /* L part */
18119877982aSShri Abhyankar         pc2 = b->a + bi[i + 1];
18129877982aSShri Abhyankar         pj  = b->j + bi[i + 1];
18139877982aSShri Abhyankar         nz  = bi[i + 2] - bi[i + 1];
18149877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
18159877982aSShri Abhyankar           col    = pj[j];
18169371c9d4SSatish Balay           pc2[j] = rtmp2[col];
18179371c9d4SSatish Balay           rs += PetscAbsScalar(pc2[j]);
18189877982aSShri Abhyankar         }
18199877982aSShri Abhyankar         /* U part */
18209877982aSShri Abhyankar         pc2 = b->a + bdiag[i + 2] + 1;
18219877982aSShri Abhyankar         pj  = b->j + bdiag[i + 2] + 1;
18229877982aSShri Abhyankar         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
18239877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
18249877982aSShri Abhyankar           col    = pj[j];
18259371c9d4SSatish Balay           pc2[j] = rtmp2[col];
18269371c9d4SSatish Balay           rs += PetscAbsScalar(pc2[j]);
18279877982aSShri Abhyankar         }
18289877982aSShri Abhyankar 
18299877982aSShri Abhyankar         sctx.rs = rs;
18309877982aSShri Abhyankar         sctx.pv = rtmp2[i + 1];
18319566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
183207b50cabSHong Zhang         if (sctx.newshift) break;
18339877982aSShri Abhyankar         pc2  = b->a + bdiag[i + 1];
18349877982aSShri Abhyankar         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
18359877982aSShri Abhyankar 
18369877982aSShri Abhyankar         /* Now take care of 2nd column of diagonal 4x4 block. */
18379877982aSShri Abhyankar         pc3 = rtmp3 + i + 1;
18389877982aSShri Abhyankar         pc4 = rtmp4 + i + 1;
18399877982aSShri Abhyankar         if (*pc3 != 0.0 || *pc4 != 0.0) {
18409371c9d4SSatish Balay           mul3 = (*pc3) * (*pc2);
18419371c9d4SSatish Balay           *pc3 = mul3;
18429371c9d4SSatish Balay           mul4 = (*pc4) * (*pc2);
18439371c9d4SSatish Balay           *pc4 = mul4;
18449877982aSShri Abhyankar           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
18459877982aSShri Abhyankar           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
18469877982aSShri Abhyankar           for (j = 0; j < nz; j++) {
18479877982aSShri Abhyankar             col = pj[j];
18489877982aSShri Abhyankar             rtmp3[col] -= mul3 * rtmp2[col];
18499877982aSShri Abhyankar             rtmp4[col] -= mul4 * rtmp2[col];
18509877982aSShri Abhyankar           }
18519566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(4.0 * nz));
18529877982aSShri Abhyankar         }
18539877982aSShri Abhyankar 
18549877982aSShri Abhyankar         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
18559877982aSShri Abhyankar         rs = 0.0;
18569877982aSShri Abhyankar         /* L part */
18579877982aSShri Abhyankar         pc3 = b->a + bi[i + 2];
18589877982aSShri Abhyankar         pj  = b->j + bi[i + 2];
18599877982aSShri Abhyankar         nz  = bi[i + 3] - bi[i + 2];
18609877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
18619877982aSShri Abhyankar           col    = pj[j];
18629371c9d4SSatish Balay           pc3[j] = rtmp3[col];
18639371c9d4SSatish Balay           rs += PetscAbsScalar(pc3[j]);
18649877982aSShri Abhyankar         }
18659877982aSShri Abhyankar         /* U part */
18669877982aSShri Abhyankar         pc3 = b->a + bdiag[i + 3] + 1;
18679877982aSShri Abhyankar         pj  = b->j + bdiag[i + 3] + 1;
18689877982aSShri Abhyankar         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
18699877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
18709877982aSShri Abhyankar           col    = pj[j];
18719371c9d4SSatish Balay           pc3[j] = rtmp3[col];
18729371c9d4SSatish Balay           rs += PetscAbsScalar(pc3[j]);
18739877982aSShri Abhyankar         }
18749877982aSShri Abhyankar 
18759877982aSShri Abhyankar         sctx.rs = rs;
18769877982aSShri Abhyankar         sctx.pv = rtmp3[i + 2];
18779566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
187807b50cabSHong Zhang         if (sctx.newshift) break;
18799877982aSShri Abhyankar         pc3  = b->a + bdiag[i + 2];
18809877982aSShri Abhyankar         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
18819877982aSShri Abhyankar 
18829877982aSShri Abhyankar         /* Now take care of 3rd column of diagonal 4x4 block. */
18839877982aSShri Abhyankar         pc4 = rtmp4 + i + 2;
18849877982aSShri Abhyankar         if (*pc4 != 0.0) {
18859371c9d4SSatish Balay           mul4 = (*pc4) * (*pc3);
18869371c9d4SSatish Balay           *pc4 = mul4;
18879877982aSShri Abhyankar           pj   = b->j + bdiag[i + 3] + 1;         /* beginning of U(i+2,:) */
18889877982aSShri Abhyankar           nz   = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */
18899877982aSShri Abhyankar           for (j = 0; j < nz; j++) {
18909877982aSShri Abhyankar             col = pj[j];
18919877982aSShri Abhyankar             rtmp4[col] -= mul4 * rtmp3[col];
18929877982aSShri Abhyankar           }
18939566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(1 + 2.0 * nz));
18949877982aSShri Abhyankar         }
18959877982aSShri Abhyankar 
18969877982aSShri Abhyankar         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
18979877982aSShri Abhyankar         rs = 0.0;
18989877982aSShri Abhyankar         /* L part */
18999877982aSShri Abhyankar         pc4 = b->a + bi[i + 3];
19009877982aSShri Abhyankar         pj  = b->j + bi[i + 3];
19019877982aSShri Abhyankar         nz  = bi[i + 4] - bi[i + 3];
19029877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
19039877982aSShri Abhyankar           col    = pj[j];
19049371c9d4SSatish Balay           pc4[j] = rtmp4[col];
19059371c9d4SSatish Balay           rs += PetscAbsScalar(pc4[j]);
19069877982aSShri Abhyankar         }
19079877982aSShri Abhyankar         /* U part */
19089877982aSShri Abhyankar         pc4 = b->a + bdiag[i + 4] + 1;
19099877982aSShri Abhyankar         pj  = b->j + bdiag[i + 4] + 1;
19109877982aSShri Abhyankar         nz  = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */
19119877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
19129877982aSShri Abhyankar           col    = pj[j];
19139371c9d4SSatish Balay           pc4[j] = rtmp4[col];
19149371c9d4SSatish Balay           rs += PetscAbsScalar(pc4[j]);
19159877982aSShri Abhyankar         }
19169877982aSShri Abhyankar 
19179877982aSShri Abhyankar         sctx.rs = rs;
19189877982aSShri Abhyankar         sctx.pv = rtmp4[i + 3];
19199566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3));
192007b50cabSHong Zhang         if (sctx.newshift) break;
19219877982aSShri Abhyankar         pc4  = b->a + bdiag[i + 3];
19229877982aSShri Abhyankar         *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */
19239877982aSShri Abhyankar         break;
192468785679SHong Zhang 
1925d71ae5a4SJacob Faibussowitsch       default:
1926d71ae5a4SJacob Faibussowitsch         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
192728f1b45aSHong Zhang       }
1928c2b86aeeSHong Zhang       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
192928f1b45aSHong Zhang       i += nodesz;              /* Update the row */
193068785679SHong Zhang     }
193128f1b45aSHong Zhang 
193228f1b45aSHong Zhang     /* MatPivotRefine() */
193307b50cabSHong Zhang     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
193428f1b45aSHong Zhang       /*
193528f1b45aSHong Zhang        * if no shift in this attempt & shifting & started shifting & can refine,
193628f1b45aSHong Zhang        * then try lower shift
193728f1b45aSHong Zhang        */
193828f1b45aSHong Zhang       sctx.shift_hi       = sctx.shift_fraction;
193928f1b45aSHong Zhang       sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
194028f1b45aSHong Zhang       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
194107b50cabSHong Zhang       sctx.newshift       = PETSC_TRUE;
194228f1b45aSHong Zhang       sctx.nshift++;
194328f1b45aSHong Zhang     }
194407b50cabSHong Zhang   } while (sctx.newshift);
194528f1b45aSHong Zhang 
19469566063dSJacob Faibussowitsch   PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4));
19479566063dSJacob Faibussowitsch   PetscCall(PetscFree(tmp_vec2));
19489566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isicol, &ic));
19499566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &r));
195028f1b45aSHong Zhang 
1951*4d12350bSJunchao Zhang   if (b->inode.size_csr) {
1952abb87a52SBarry Smith     C->ops->solve = MatSolve_SeqAIJ_Inode;
1953abb87a52SBarry Smith   } else {
1954d3ac4fa3SBarry Smith     C->ops->solve = MatSolve_SeqAIJ;
1955abb87a52SBarry Smith   }
195628f1b45aSHong Zhang   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
195728f1b45aSHong Zhang   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
195828f1b45aSHong Zhang   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
195928f1b45aSHong Zhang   C->ops->matsolve          = MatMatSolve_SeqAIJ;
1960a3d9026eSPierre Jolivet   C->ops->matsolvetranspose = MatMatSolveTranspose_SeqAIJ;
196128f1b45aSHong Zhang   C->assembled              = PETSC_TRUE;
196228f1b45aSHong Zhang   C->preallocated           = PETSC_TRUE;
19632205254eSKarl Rupp 
19649566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(C->cmap->n));
196528f1b45aSHong Zhang 
196628f1b45aSHong Zhang   /* MatShiftView(A,info,&sctx) */
196728f1b45aSHong Zhang   if (sctx.nshift) {
1968f4db908eSBarry Smith     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
19699566063dSJacob Faibussowitsch       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
1970f4db908eSBarry Smith     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
19719566063dSJacob Faibussowitsch       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1972f4db908eSBarry Smith     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
19739566063dSJacob Faibussowitsch       PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
197428f1b45aSHong Zhang     }
197528f1b45aSHong Zhang   }
19763ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
197728f1b45aSHong Zhang }
1978628f99d7SShri Abhyankar 
1979ff6a9541SJacob Faibussowitsch #if 0
1980ff6a9541SJacob Faibussowitsch // unused
1981ff6a9541SJacob Faibussowitsch static PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info)
1982d71ae5a4SJacob Faibussowitsch {
1983628f99d7SShri Abhyankar   Mat              C = B;
1984628f99d7SShri Abhyankar   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1985628f99d7SShri Abhyankar   IS               iscol = b->col, isrow = b->row, isicol = b->icol;
1986628f99d7SShri Abhyankar   const PetscInt  *r, *ic, *c, *ics;
1987628f99d7SShri Abhyankar   PetscInt         n = A->rmap->n, *bi = b->i;
1988628f99d7SShri Abhyankar   PetscInt        *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow;
19898758e1faSBarry Smith   PetscInt         i, j, idx, *bd = b->diag, node_max, nodesz;
19908758e1faSBarry Smith   PetscInt        *ai = a->i, *aj = a->j;
1991628f99d7SShri Abhyankar   PetscInt        *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj;
1992628f99d7SShri Abhyankar   PetscScalar      mul1, mul2, mul3, tmp;
1993628f99d7SShri Abhyankar   MatScalar       *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33;
1994628f99d7SShri Abhyankar   const MatScalar *v1, *v2, *v3, *aa    = a->a, *rtmp1;
1995628f99d7SShri Abhyankar   PetscReal        rs = 0.0;
1996628f99d7SShri Abhyankar   FactorShiftCtx   sctx;
1997628f99d7SShri Abhyankar 
1998628f99d7SShri Abhyankar   PetscFunctionBegin;
1999628f99d7SShri Abhyankar   sctx.shift_top      = 0;
2000628f99d7SShri Abhyankar   sctx.nshift_max     = 0;
2001628f99d7SShri Abhyankar   sctx.shift_lo       = 0;
2002628f99d7SShri Abhyankar   sctx.shift_hi       = 0;
2003628f99d7SShri Abhyankar   sctx.shift_fraction = 0;
2004628f99d7SShri Abhyankar 
2005628f99d7SShri Abhyankar   /* if both shift schemes are chosen by user, only use info->shiftpd */
2006f4db908eSBarry Smith   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
2007628f99d7SShri Abhyankar     sctx.shift_top = 0;
2008628f99d7SShri Abhyankar     for (i = 0; i < n; i++) {
2009628f99d7SShri Abhyankar       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
2010628f99d7SShri Abhyankar       rs    = 0.0;
2011628f99d7SShri Abhyankar       ajtmp = aj + ai[i];
2012628f99d7SShri Abhyankar       rtmp1 = aa + ai[i];
2013628f99d7SShri Abhyankar       nz    = ai[i + 1] - ai[i];
2014628f99d7SShri Abhyankar       for (j = 0; j < nz; j++) {
2015628f99d7SShri Abhyankar         if (*ajtmp != i) {
2016628f99d7SShri Abhyankar           rs += PetscAbsScalar(*rtmp1++);
2017628f99d7SShri Abhyankar         } else {
2018628f99d7SShri Abhyankar           rs -= PetscRealPart(*rtmp1++);
2019628f99d7SShri Abhyankar         }
2020628f99d7SShri Abhyankar         ajtmp++;
2021628f99d7SShri Abhyankar       }
2022628f99d7SShri Abhyankar       if (rs > sctx.shift_top) sctx.shift_top = rs;
2023628f99d7SShri Abhyankar     }
2024628f99d7SShri Abhyankar     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
2025628f99d7SShri Abhyankar     sctx.shift_top *= 1.1;
2026628f99d7SShri Abhyankar     sctx.nshift_max = 5;
2027628f99d7SShri Abhyankar     sctx.shift_lo   = 0.;
2028628f99d7SShri Abhyankar     sctx.shift_hi   = 1.;
2029628f99d7SShri Abhyankar   }
2030628f99d7SShri Abhyankar   sctx.shift_amount = 0;
2031628f99d7SShri Abhyankar   sctx.nshift       = 0;
2032628f99d7SShri Abhyankar 
20339566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow, &r));
20349566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol, &c));
20359566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isicol, &ic));
20369566063dSJacob Faibussowitsch   PetscCall(PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33));
2037628f99d7SShri Abhyankar   ics = ic;
2038628f99d7SShri Abhyankar 
2039628f99d7SShri Abhyankar   node_max = a->inode.node_count;
2040628f99d7SShri Abhyankar   ns       = a->inode.size;
204128b400f6SJacob Faibussowitsch   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
2042628f99d7SShri Abhyankar 
2043628f99d7SShri Abhyankar   /* If max inode size > 3, split it into two inodes.*/
2044628f99d7SShri Abhyankar   /* also map the inode sizes according to the ordering */
20459566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
2046628f99d7SShri Abhyankar   for (i = 0, j = 0; i < node_max; ++i, ++j) {
2047628f99d7SShri Abhyankar     if (ns[i] > 3) {
2048628f99d7SShri Abhyankar       tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5  */
2049628f99d7SShri Abhyankar       ++j;
2050628f99d7SShri Abhyankar       tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
2051628f99d7SShri Abhyankar     } else {
2052628f99d7SShri Abhyankar       tmp_vec1[j] = ns[i];
2053628f99d7SShri Abhyankar     }
2054628f99d7SShri Abhyankar   }
2055628f99d7SShri Abhyankar   /* Use the correct node_max */
2056628f99d7SShri Abhyankar   node_max = j;
2057628f99d7SShri Abhyankar 
2058628f99d7SShri Abhyankar   /* Now reorder the inode info based on mat re-ordering info */
2059628f99d7SShri Abhyankar   /* First create a row -> inode_size_array_index map */
20609566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2));
2061628f99d7SShri Abhyankar   for (i = 0, row = 0; i < node_max; i++) {
2062628f99d7SShri Abhyankar     nodesz = tmp_vec1[i];
2063ad540459SPierre Jolivet     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
2064628f99d7SShri Abhyankar   }
2065628f99d7SShri Abhyankar   /* Using nsmap, create a reordered ns structure */
2066628f99d7SShri Abhyankar   for (i = 0, j = 0; i < node_max; i++) {
2067628f99d7SShri Abhyankar     nodesz      = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
2068628f99d7SShri Abhyankar     tmp_vec2[i] = nodesz;
2069628f99d7SShri Abhyankar     j += nodesz;
2070628f99d7SShri Abhyankar   }
20719566063dSJacob Faibussowitsch   PetscCall(PetscFree2(nsmap, tmp_vec1));
2072628f99d7SShri Abhyankar   /* Now use the correct ns */
2073628f99d7SShri Abhyankar   ns = tmp_vec2;
2074628f99d7SShri Abhyankar 
2075628f99d7SShri Abhyankar   do {
207607b50cabSHong Zhang     sctx.newshift = PETSC_FALSE;
2077628f99d7SShri Abhyankar     /* Now loop over each block-row, and do the factorization */
2078628f99d7SShri Abhyankar     for (i = 0, row = 0; i < node_max; i++) {
2079628f99d7SShri Abhyankar       nodesz = ns[i];
2080628f99d7SShri Abhyankar       nz     = bi[row + 1] - bi[row];
2081628f99d7SShri Abhyankar       bjtmp  = bj + bi[row];
2082628f99d7SShri Abhyankar 
2083628f99d7SShri Abhyankar       switch (nodesz) {
2084628f99d7SShri Abhyankar       case 1:
2085628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2086628f99d7SShri Abhyankar           idx         = bjtmp[j];
2087628f99d7SShri Abhyankar           rtmp11[idx] = 0.0;
2088628f99d7SShri Abhyankar         }
2089628f99d7SShri Abhyankar 
2090628f99d7SShri Abhyankar         /* load in initial (unfactored row) */
2091628f99d7SShri Abhyankar         idx    = r[row];
2092628f99d7SShri Abhyankar         nz_tmp = ai[idx + 1] - ai[idx];
2093628f99d7SShri Abhyankar         ajtmp  = aj + ai[idx];
2094628f99d7SShri Abhyankar         v1     = aa + ai[idx];
2095628f99d7SShri Abhyankar 
2096628f99d7SShri Abhyankar         for (j = 0; j < nz_tmp; j++) {
2097628f99d7SShri Abhyankar           idx         = ics[ajtmp[j]];
2098628f99d7SShri Abhyankar           rtmp11[idx] = v1[j];
2099628f99d7SShri Abhyankar         }
2100628f99d7SShri Abhyankar         rtmp11[ics[r[row]]] += sctx.shift_amount;
2101628f99d7SShri Abhyankar 
2102628f99d7SShri Abhyankar         prow = *bjtmp++;
2103628f99d7SShri Abhyankar         while (prow < row) {
2104628f99d7SShri Abhyankar           pc1 = rtmp11 + prow;
2105628f99d7SShri Abhyankar           if (*pc1 != 0.0) {
2106628f99d7SShri Abhyankar             pv     = ba + bd[prow];
2107628f99d7SShri Abhyankar             pj     = nbj + bd[prow];
2108628f99d7SShri Abhyankar             mul1   = *pc1 * *pv++;
2109628f99d7SShri Abhyankar             *pc1   = mul1;
2110628f99d7SShri Abhyankar             nz_tmp = bi[prow + 1] - bd[prow] - 1;
21119566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2112628f99d7SShri Abhyankar             for (j = 0; j < nz_tmp; j++) {
2113628f99d7SShri Abhyankar               tmp = pv[j];
2114628f99d7SShri Abhyankar               idx = pj[j];
2115628f99d7SShri Abhyankar               rtmp11[idx] -= mul1 * tmp;
2116628f99d7SShri Abhyankar             }
2117628f99d7SShri Abhyankar           }
2118628f99d7SShri Abhyankar           prow = *bjtmp++;
2119628f99d7SShri Abhyankar         }
2120628f99d7SShri Abhyankar         pj  = bj + bi[row];
2121628f99d7SShri Abhyankar         pc1 = ba + bi[row];
2122628f99d7SShri Abhyankar 
2123628f99d7SShri Abhyankar         sctx.pv     = rtmp11[row];
2124628f99d7SShri Abhyankar         rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */
2125628f99d7SShri Abhyankar         rs          = 0.0;
2126628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2127628f99d7SShri Abhyankar           idx    = pj[j];
2128628f99d7SShri Abhyankar           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2129628f99d7SShri Abhyankar           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2130628f99d7SShri Abhyankar         }
2131628f99d7SShri Abhyankar         sctx.rs = rs;
21329566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
213307b50cabSHong Zhang         if (sctx.newshift) goto endofwhile;
2134628f99d7SShri Abhyankar         break;
2135628f99d7SShri Abhyankar 
2136628f99d7SShri Abhyankar       case 2:
2137628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2138628f99d7SShri Abhyankar           idx         = bjtmp[j];
2139628f99d7SShri Abhyankar           rtmp11[idx] = 0.0;
2140628f99d7SShri Abhyankar           rtmp22[idx] = 0.0;
2141628f99d7SShri Abhyankar         }
2142628f99d7SShri Abhyankar 
2143628f99d7SShri Abhyankar         /* load in initial (unfactored row) */
2144628f99d7SShri Abhyankar         idx    = r[row];
2145628f99d7SShri Abhyankar         nz_tmp = ai[idx + 1] - ai[idx];
2146628f99d7SShri Abhyankar         ajtmp  = aj + ai[idx];
2147628f99d7SShri Abhyankar         v1     = aa + ai[idx];
2148628f99d7SShri Abhyankar         v2     = aa + ai[idx + 1];
2149628f99d7SShri Abhyankar         for (j = 0; j < nz_tmp; j++) {
2150628f99d7SShri Abhyankar           idx         = ics[ajtmp[j]];
2151628f99d7SShri Abhyankar           rtmp11[idx] = v1[j];
2152628f99d7SShri Abhyankar           rtmp22[idx] = v2[j];
2153628f99d7SShri Abhyankar         }
2154628f99d7SShri Abhyankar         rtmp11[ics[r[row]]] += sctx.shift_amount;
2155628f99d7SShri Abhyankar         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2156628f99d7SShri Abhyankar 
2157628f99d7SShri Abhyankar         prow = *bjtmp++;
2158628f99d7SShri Abhyankar         while (prow < row) {
2159628f99d7SShri Abhyankar           pc1 = rtmp11 + prow;
2160628f99d7SShri Abhyankar           pc2 = rtmp22 + prow;
2161628f99d7SShri Abhyankar           if (*pc1 != 0.0 || *pc2 != 0.0) {
2162628f99d7SShri Abhyankar             pv   = ba + bd[prow];
2163628f99d7SShri Abhyankar             pj   = nbj + bd[prow];
2164628f99d7SShri Abhyankar             mul1 = *pc1 * *pv;
2165628f99d7SShri Abhyankar             mul2 = *pc2 * *pv;
2166628f99d7SShri Abhyankar             ++pv;
2167628f99d7SShri Abhyankar             *pc1 = mul1;
2168628f99d7SShri Abhyankar             *pc2 = mul2;
2169628f99d7SShri Abhyankar 
2170628f99d7SShri Abhyankar             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2171628f99d7SShri Abhyankar             for (j = 0; j < nz_tmp; j++) {
2172628f99d7SShri Abhyankar               tmp = pv[j];
2173628f99d7SShri Abhyankar               idx = pj[j];
2174628f99d7SShri Abhyankar               rtmp11[idx] -= mul1 * tmp;
2175628f99d7SShri Abhyankar               rtmp22[idx] -= mul2 * tmp;
2176628f99d7SShri Abhyankar             }
21779566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2178628f99d7SShri Abhyankar           }
2179628f99d7SShri Abhyankar           prow = *bjtmp++;
2180628f99d7SShri Abhyankar         }
2181628f99d7SShri Abhyankar 
2182628f99d7SShri Abhyankar         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2183628f99d7SShri Abhyankar         pc1 = rtmp11 + prow;
2184628f99d7SShri Abhyankar         pc2 = rtmp22 + prow;
2185628f99d7SShri Abhyankar 
2186628f99d7SShri Abhyankar         sctx.pv = *pc1;
2187628f99d7SShri Abhyankar         pj      = bj + bi[prow];
2188628f99d7SShri Abhyankar         rs      = 0.0;
2189628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2190628f99d7SShri Abhyankar           idx = pj[j];
2191628f99d7SShri Abhyankar           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2192628f99d7SShri Abhyankar         }
2193628f99d7SShri Abhyankar         sctx.rs = rs;
21949566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
219507b50cabSHong Zhang         if (sctx.newshift) goto endofwhile;
2196628f99d7SShri Abhyankar 
2197628f99d7SShri Abhyankar         if (*pc2 != 0.0) {
2198628f99d7SShri Abhyankar           pj     = nbj + bd[prow];
2199628f99d7SShri Abhyankar           mul2   = (*pc2) / (*pc1); /* since diag is not yet inverted.*/
2200628f99d7SShri Abhyankar           *pc2   = mul2;
2201628f99d7SShri Abhyankar           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2202628f99d7SShri Abhyankar           for (j = 0; j < nz_tmp; j++) {
2203628f99d7SShri Abhyankar             idx = pj[j];
2204628f99d7SShri Abhyankar             tmp = rtmp11[idx];
2205628f99d7SShri Abhyankar             rtmp22[idx] -= mul2 * tmp;
2206628f99d7SShri Abhyankar           }
22079566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2208628f99d7SShri Abhyankar         }
2209628f99d7SShri Abhyankar 
2210628f99d7SShri Abhyankar         pj  = bj + bi[row];
2211628f99d7SShri Abhyankar         pc1 = ba + bi[row];
2212628f99d7SShri Abhyankar         pc2 = ba + bi[row + 1];
2213628f99d7SShri Abhyankar 
2214628f99d7SShri Abhyankar         sctx.pv         = rtmp22[row + 1];
2215628f99d7SShri Abhyankar         rs              = 0.0;
2216628f99d7SShri Abhyankar         rtmp11[row]     = 1.0 / rtmp11[row];
2217628f99d7SShri Abhyankar         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2218628f99d7SShri Abhyankar         /* copy row entries from dense representation to sparse */
2219628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2220628f99d7SShri Abhyankar           idx    = pj[j];
2221628f99d7SShri Abhyankar           pc1[j] = rtmp11[idx];
2222628f99d7SShri Abhyankar           pc2[j] = rtmp22[idx];
2223628f99d7SShri Abhyankar           if (idx != row + 1) rs += PetscAbsScalar(pc2[j]);
2224628f99d7SShri Abhyankar         }
2225628f99d7SShri Abhyankar         sctx.rs = rs;
22269566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
222707b50cabSHong Zhang         if (sctx.newshift) goto endofwhile;
2228628f99d7SShri Abhyankar         break;
2229628f99d7SShri Abhyankar 
2230628f99d7SShri Abhyankar       case 3:
2231628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2232628f99d7SShri Abhyankar           idx         = bjtmp[j];
2233628f99d7SShri Abhyankar           rtmp11[idx] = 0.0;
2234628f99d7SShri Abhyankar           rtmp22[idx] = 0.0;
2235628f99d7SShri Abhyankar           rtmp33[idx] = 0.0;
2236628f99d7SShri Abhyankar         }
2237628f99d7SShri Abhyankar         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2238628f99d7SShri Abhyankar         idx    = r[row];
2239628f99d7SShri Abhyankar         nz_tmp = ai[idx + 1] - ai[idx];
2240628f99d7SShri Abhyankar         ajtmp  = aj + ai[idx];
2241628f99d7SShri Abhyankar         v1     = aa + ai[idx];
2242628f99d7SShri Abhyankar         v2     = aa + ai[idx + 1];
2243628f99d7SShri Abhyankar         v3     = aa + ai[idx + 2];
2244628f99d7SShri Abhyankar         for (j = 0; j < nz_tmp; j++) {
2245628f99d7SShri Abhyankar           idx         = ics[ajtmp[j]];
2246628f99d7SShri Abhyankar           rtmp11[idx] = v1[j];
2247628f99d7SShri Abhyankar           rtmp22[idx] = v2[j];
2248628f99d7SShri Abhyankar           rtmp33[idx] = v3[j];
2249628f99d7SShri Abhyankar         }
2250628f99d7SShri Abhyankar         rtmp11[ics[r[row]]] += sctx.shift_amount;
2251628f99d7SShri Abhyankar         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2252628f99d7SShri Abhyankar         rtmp33[ics[r[row + 2]]] += sctx.shift_amount;
2253628f99d7SShri Abhyankar 
2254628f99d7SShri Abhyankar         /* loop over all pivot row blocks above this row block */
2255628f99d7SShri Abhyankar         prow = *bjtmp++;
2256628f99d7SShri Abhyankar         while (prow < row) {
2257628f99d7SShri Abhyankar           pc1 = rtmp11 + prow;
2258628f99d7SShri Abhyankar           pc2 = rtmp22 + prow;
2259628f99d7SShri Abhyankar           pc3 = rtmp33 + prow;
2260628f99d7SShri Abhyankar           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
2261628f99d7SShri Abhyankar             pv   = ba + bd[prow];
2262628f99d7SShri Abhyankar             pj   = nbj + bd[prow];
2263628f99d7SShri Abhyankar             mul1 = *pc1 * *pv;
2264628f99d7SShri Abhyankar             mul2 = *pc2 * *pv;
2265628f99d7SShri Abhyankar             mul3 = *pc3 * *pv;
2266628f99d7SShri Abhyankar             ++pv;
2267628f99d7SShri Abhyankar             *pc1 = mul1;
2268628f99d7SShri Abhyankar             *pc2 = mul2;
2269628f99d7SShri Abhyankar             *pc3 = mul3;
2270628f99d7SShri Abhyankar 
2271628f99d7SShri Abhyankar             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2272628f99d7SShri Abhyankar             /* update this row based on pivot row */
2273628f99d7SShri Abhyankar             for (j = 0; j < nz_tmp; j++) {
2274628f99d7SShri Abhyankar               tmp = pv[j];
2275628f99d7SShri Abhyankar               idx = pj[j];
2276628f99d7SShri Abhyankar               rtmp11[idx] -= mul1 * tmp;
2277628f99d7SShri Abhyankar               rtmp22[idx] -= mul2 * tmp;
2278628f99d7SShri Abhyankar               rtmp33[idx] -= mul3 * tmp;
2279628f99d7SShri Abhyankar             }
22809566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(3 + 6.0 * nz_tmp));
2281628f99d7SShri Abhyankar           }
2282628f99d7SShri Abhyankar           prow = *bjtmp++;
2283628f99d7SShri Abhyankar         }
2284628f99d7SShri Abhyankar 
2285628f99d7SShri Abhyankar         /* Now take care of diagonal 3x3 block in this set of rows */
2286628f99d7SShri Abhyankar         /* note: prow = row here */
2287628f99d7SShri Abhyankar         pc1 = rtmp11 + prow;
2288628f99d7SShri Abhyankar         pc2 = rtmp22 + prow;
2289628f99d7SShri Abhyankar         pc3 = rtmp33 + prow;
2290628f99d7SShri Abhyankar 
2291628f99d7SShri Abhyankar         sctx.pv = *pc1;
2292628f99d7SShri Abhyankar         pj      = bj + bi[prow];
2293628f99d7SShri Abhyankar         rs      = 0.0;
2294628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2295628f99d7SShri Abhyankar           idx = pj[j];
2296628f99d7SShri Abhyankar           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2297628f99d7SShri Abhyankar         }
2298628f99d7SShri Abhyankar         sctx.rs = rs;
22999566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
230007b50cabSHong Zhang         if (sctx.newshift) goto endofwhile;
2301628f99d7SShri Abhyankar 
2302628f99d7SShri Abhyankar         if (*pc2 != 0.0 || *pc3 != 0.0) {
2303628f99d7SShri Abhyankar           mul2   = (*pc2) / (*pc1);
2304628f99d7SShri Abhyankar           mul3   = (*pc3) / (*pc1);
2305628f99d7SShri Abhyankar           *pc2   = mul2;
2306628f99d7SShri Abhyankar           *pc3   = mul3;
2307628f99d7SShri Abhyankar           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2308628f99d7SShri Abhyankar           pj     = nbj + bd[prow];
2309628f99d7SShri Abhyankar           for (j = 0; j < nz_tmp; j++) {
2310628f99d7SShri Abhyankar             idx = pj[j];
2311628f99d7SShri Abhyankar             tmp = rtmp11[idx];
2312628f99d7SShri Abhyankar             rtmp22[idx] -= mul2 * tmp;
2313628f99d7SShri Abhyankar             rtmp33[idx] -= mul3 * tmp;
2314628f99d7SShri Abhyankar           }
23159566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2316628f99d7SShri Abhyankar         }
2317628f99d7SShri Abhyankar         ++prow;
2318628f99d7SShri Abhyankar 
2319628f99d7SShri Abhyankar         pc2     = rtmp22 + prow;
2320628f99d7SShri Abhyankar         pc3     = rtmp33 + prow;
2321628f99d7SShri Abhyankar         sctx.pv = *pc2;
2322628f99d7SShri Abhyankar         pj      = bj + bi[prow];
2323628f99d7SShri Abhyankar         rs      = 0.0;
2324628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2325628f99d7SShri Abhyankar           idx = pj[j];
2326628f99d7SShri Abhyankar           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2327628f99d7SShri Abhyankar         }
2328628f99d7SShri Abhyankar         sctx.rs = rs;
23299566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
233007b50cabSHong Zhang         if (sctx.newshift) goto endofwhile;
2331628f99d7SShri Abhyankar 
2332628f99d7SShri Abhyankar         if (*pc3 != 0.0) {
2333628f99d7SShri Abhyankar           mul3   = (*pc3) / (*pc2);
2334628f99d7SShri Abhyankar           *pc3   = mul3;
2335628f99d7SShri Abhyankar           pj     = nbj + bd[prow];
2336628f99d7SShri Abhyankar           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2337628f99d7SShri Abhyankar           for (j = 0; j < nz_tmp; j++) {
2338628f99d7SShri Abhyankar             idx = pj[j];
2339628f99d7SShri Abhyankar             tmp = rtmp22[idx];
2340628f99d7SShri Abhyankar             rtmp33[idx] -= mul3 * tmp;
2341628f99d7SShri Abhyankar           }
23429566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2343628f99d7SShri Abhyankar         }
2344628f99d7SShri Abhyankar 
2345628f99d7SShri Abhyankar         pj  = bj + bi[row];
2346628f99d7SShri Abhyankar         pc1 = ba + bi[row];
2347628f99d7SShri Abhyankar         pc2 = ba + bi[row + 1];
2348628f99d7SShri Abhyankar         pc3 = ba + bi[row + 2];
2349628f99d7SShri Abhyankar 
2350628f99d7SShri Abhyankar         sctx.pv         = rtmp33[row + 2];
2351628f99d7SShri Abhyankar         rs              = 0.0;
2352628f99d7SShri Abhyankar         rtmp11[row]     = 1.0 / rtmp11[row];
2353628f99d7SShri Abhyankar         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2354628f99d7SShri Abhyankar         rtmp33[row + 2] = 1.0 / rtmp33[row + 2];
2355628f99d7SShri Abhyankar         /* copy row entries from dense representation to sparse */
2356628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2357628f99d7SShri Abhyankar           idx    = pj[j];
2358628f99d7SShri Abhyankar           pc1[j] = rtmp11[idx];
2359628f99d7SShri Abhyankar           pc2[j] = rtmp22[idx];
2360628f99d7SShri Abhyankar           pc3[j] = rtmp33[idx];
2361628f99d7SShri Abhyankar           if (idx != row + 2) rs += PetscAbsScalar(pc3[j]);
2362628f99d7SShri Abhyankar         }
2363628f99d7SShri Abhyankar 
2364628f99d7SShri Abhyankar         sctx.rs = rs;
23659566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 2));
236607b50cabSHong Zhang         if (sctx.newshift) goto endofwhile;
2367628f99d7SShri Abhyankar         break;
2368628f99d7SShri Abhyankar 
2369d71ae5a4SJacob Faibussowitsch       default:
2370d71ae5a4SJacob Faibussowitsch         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
2371628f99d7SShri Abhyankar       }
2372628f99d7SShri Abhyankar       row += nodesz; /* Update the row */
2373628f99d7SShri Abhyankar     }
2374628f99d7SShri Abhyankar   endofwhile:;
237507b50cabSHong Zhang   } while (sctx.newshift);
23769566063dSJacob Faibussowitsch   PetscCall(PetscFree3(rtmp11, rtmp22, rtmp33));
23779566063dSJacob Faibussowitsch   PetscCall(PetscFree(tmp_vec2));
23789566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isicol, &ic));
23799566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &r));
23809566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &c));
23812205254eSKarl Rupp 
2382d3ac4fa3SBarry Smith   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2383628f99d7SShri Abhyankar   /* do not set solve add, since MatSolve_Inode + Add is faster */
2384628f99d7SShri Abhyankar   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2385628f99d7SShri Abhyankar   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2386628f99d7SShri Abhyankar   C->assembled              = PETSC_TRUE;
2387628f99d7SShri Abhyankar   C->preallocated           = PETSC_TRUE;
2388628f99d7SShri Abhyankar   if (sctx.nshift) {
2389f4db908eSBarry Smith     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
23909566063dSJacob Faibussowitsch       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
2391f4db908eSBarry Smith     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
23929566063dSJacob Faibussowitsch       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
2393628f99d7SShri Abhyankar     }
2394628f99d7SShri Abhyankar   }
23959566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(C->cmap->n));
23969566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCheckInode(C));
23973ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2398628f99d7SShri Abhyankar }
2399ff6a9541SJacob Faibussowitsch #endif
2400628f99d7SShri Abhyankar 
/*
  MatSolve_SeqAIJ_Inode - Forward sweep followed by backward sweep of a triangular solve, processing the rows of A
  one inode (a group of 1 to 5 consecutive rows) at a time.

  Input Parameters:
    A  - matrix whose Mat_SeqAIJ data supplies i, j, a, diag, row, col, solve_work and the inode structure
         (presumably an LU-factored matrix - confirm against the factorization routine that installs this solve)
    bb - right-hand side, accessed read-only

  Output Parameter:
    xx - solution, accessed write-only

  Notes:
  ns = a->inode.size_csr is used as an offset array: node i covers rows ns[i] .. ns[i + 1] - 1.

  The forward sweep reads b through the index array r (from a->row) and stores its result in a->solve_work.
  The backward sweep overwrites a->solve_work with the solution and writes it into x through the index array c
  (from a->col).

  Within a node every row reuses the column index list of a single row (the first row of the node in the forward
  sweep, the last row in the backward sweep); only the value pointers v1..v5 differ per row.

  Fails with PETSC_ERR_COR when the inode structure is missing or when a node does not have 1 to 5 rows.
*/
2401d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
2402d71ae5a4SJacob Faibussowitsch {
2403019b515eSShri Abhyankar   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
2404019b515eSShri Abhyankar   IS                 iscol = a->col, isrow = a->row;
2405019b515eSShri Abhyankar   const PetscInt    *r, *c, *rout, *cout;
2406*4d12350bSJunchao Zhang   PetscInt           i, j;
24078758e1faSBarry Smith   PetscInt           node_max, row, nsz, aii, i0, i1, nz;
24088758e1faSBarry Smith   const PetscInt    *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj;
2409019b515eSShri Abhyankar   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
2410019b515eSShri Abhyankar   PetscScalar        sum1, sum2, sum3, sum4, sum5;
2411019b515eSShri Abhyankar   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
2412019b515eSShri Abhyankar   const PetscScalar *b;
2413019b515eSShri Abhyankar 
2414019b515eSShri Abhyankar   PetscFunctionBegin;
2415*4d12350bSJunchao Zhang   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2416019b515eSShri Abhyankar   node_max = a->inode.node_count;
2417*4d12350bSJunchao Zhang   ns       = a->inode.size_csr; /* Node Size array */
2418019b515eSShri Abhyankar 
24199566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
24209566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(xx, &x));
  /* work array: receives the forward-sweep result, then is overwritten by the backward sweep */
2421019b515eSShri Abhyankar   tmp = a->solve_work;
2422019b515eSShri Abhyankar 
24239371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
24249371c9d4SSatish Balay   r = rout;
24259371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
24269371c9d4SSatish Balay   c = cout;
2427019b515eSShri Abhyankar 
2428019b515eSShri Abhyankar   /* forward solve the lower triangular */
  /* tmps aliases tmp: reads through tmps[] see values stored through tmp[] earlier in the same sweep */
2429019b515eSShri Abhyankar   tmps = tmp;
2430019b515eSShri Abhyankar   aa   = a_a;
2431019b515eSShri Abhyankar   aj   = a_j;
2432019b515eSShri Abhyankar   ad   = a->diag;
2433019b515eSShri Abhyankar 
2434*4d12350bSJunchao Zhang   for (i = 0; i < node_max; ++i) {
    /* row = first row of node i, nsz = number of rows in node i */
2435*4d12350bSJunchao Zhang     row = ns[i];
2436*4d12350bSJunchao Zhang     nsz = ns[i + 1] - ns[i];
2437019b515eSShri Abhyankar     aii = ai[row];
2438019b515eSShri Abhyankar     v1  = aa + aii;
2439019b515eSShri Abhyankar     vi  = aj + aii;
    /* nz entries of the first row; the same nz column indices vi[] are applied to every row of the node */
2440019b515eSShri Abhyankar     nz  = ai[row + 1] - ai[row];
2441019b515eSShri Abhyankar 
244298991853SShri Abhyankar     if (i < node_max - 1) {
244398991853SShri Abhyankar       /* Prefetch the indices for the next block */
244450d8bf02SJed Brown       PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
244598991853SShri Abhyankar       /* Prefetch the data for the next block */
2446*4d12350bSJunchao Zhang       PetscPrefetchBlock(aa + ai[row + nsz], ai[ns[i + 2]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
244798991853SShri Abhyankar     }
244898991853SShri Abhyankar 
2449019b515eSShri Abhyankar     switch (nsz) { /* Each loop in 'case' is unrolled */
2450019b515eSShri Abhyankar     case 1:
2451019b515eSShri Abhyankar       sum1 = b[r[row]];
      /* two entries per iteration; an odd leftover entry is handled right after the loop */
2452019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2453019b515eSShri Abhyankar         i0   = vi[j];
2454019b515eSShri Abhyankar         i1   = vi[j + 1];
2455019b515eSShri Abhyankar         tmp0 = tmps[i0];
2456019b515eSShri Abhyankar         tmp1 = tmps[i1];
2457019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2458019b515eSShri Abhyankar       }
2459019b515eSShri Abhyankar       if (j == nz - 1) {
2460019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2461019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2462019b515eSShri Abhyankar       }
2463019b515eSShri Abhyankar       tmp[row++] = sum1;
2464019b515eSShri Abhyankar       break;
2465019b515eSShri Abhyankar     case 2:
2466019b515eSShri Abhyankar       sum1 = b[r[row]];
2467019b515eSShri Abhyankar       sum2 = b[r[row + 1]];
2468019b515eSShri Abhyankar       v2   = aa + ai[row + 1];
2469019b515eSShri Abhyankar 
2470019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2471019b515eSShri Abhyankar         i0   = vi[j];
2472019b515eSShri Abhyankar         i1   = vi[j + 1];
2473019b515eSShri Abhyankar         tmp0 = tmps[i0];
2474019b515eSShri Abhyankar         tmp1 = tmps[i1];
2475019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2476019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2477019b515eSShri Abhyankar       }
2478019b515eSShri Abhyankar       if (j == nz - 1) {
2479019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2480019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2481019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0;
2482019b515eSShri Abhyankar       }
      /* coupling inside the node: entry nz of the second row multiplies the value just computed for the first row */
2483019b515eSShri Abhyankar       sum2 -= v2[nz] * sum1;
2484019b515eSShri Abhyankar       tmp[row++] = sum1;
2485019b515eSShri Abhyankar       tmp[row++] = sum2;
2486019b515eSShri Abhyankar       break;
2487019b515eSShri Abhyankar     case 3:
2488019b515eSShri Abhyankar       sum1 = b[r[row]];
2489019b515eSShri Abhyankar       sum2 = b[r[row + 1]];
2490019b515eSShri Abhyankar       sum3 = b[r[row + 2]];
2491019b515eSShri Abhyankar       v2   = aa + ai[row + 1];
2492019b515eSShri Abhyankar       v3   = aa + ai[row + 2];
2493019b515eSShri Abhyankar 
2494019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2495019b515eSShri Abhyankar         i0   = vi[j];
2496019b515eSShri Abhyankar         i1   = vi[j + 1];
2497019b515eSShri Abhyankar         tmp0 = tmps[i0];
2498019b515eSShri Abhyankar         tmp1 = tmps[i1];
2499019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2500019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2501019b515eSShri Abhyankar         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2502019b515eSShri Abhyankar       }
2503019b515eSShri Abhyankar       if (j == nz - 1) {
2504019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2505019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2506019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0;
2507019b515eSShri Abhyankar         sum3 -= v3[j] * tmp0;
2508019b515eSShri Abhyankar       }
      /* couplings inside the node: entries nz, nz + 1 of later rows multiply the sums of earlier rows */
2509019b515eSShri Abhyankar       sum2 -= v2[nz] * sum1;
2510019b515eSShri Abhyankar       sum3 -= v3[nz] * sum1;
2511019b515eSShri Abhyankar       sum3 -= v3[nz + 1] * sum2;
2512019b515eSShri Abhyankar       tmp[row++] = sum1;
2513019b515eSShri Abhyankar       tmp[row++] = sum2;
2514019b515eSShri Abhyankar       tmp[row++] = sum3;
2515019b515eSShri Abhyankar       break;
2516019b515eSShri Abhyankar 
2517019b515eSShri Abhyankar     case 4:
2518019b515eSShri Abhyankar       sum1 = b[r[row]];
2519019b515eSShri Abhyankar       sum2 = b[r[row + 1]];
2520019b515eSShri Abhyankar       sum3 = b[r[row + 2]];
2521019b515eSShri Abhyankar       sum4 = b[r[row + 3]];
2522019b515eSShri Abhyankar       v2   = aa + ai[row + 1];
2523019b515eSShri Abhyankar       v3   = aa + ai[row + 2];
2524019b515eSShri Abhyankar       v4   = aa + ai[row + 3];
2525019b515eSShri Abhyankar 
2526019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2527019b515eSShri Abhyankar         i0   = vi[j];
2528019b515eSShri Abhyankar         i1   = vi[j + 1];
2529019b515eSShri Abhyankar         tmp0 = tmps[i0];
2530019b515eSShri Abhyankar         tmp1 = tmps[i1];
2531019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2532019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2533019b515eSShri Abhyankar         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2534019b515eSShri Abhyankar         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2535019b515eSShri Abhyankar       }
2536019b515eSShri Abhyankar       if (j == nz - 1) {
2537019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2538019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2539019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0;
2540019b515eSShri Abhyankar         sum3 -= v3[j] * tmp0;
2541019b515eSShri Abhyankar         sum4 -= v4[j] * tmp0;
2542019b515eSShri Abhyankar       }
      /* couplings inside the node, applied in row order so each sum is final before it is used */
2543019b515eSShri Abhyankar       sum2 -= v2[nz] * sum1;
2544019b515eSShri Abhyankar       sum3 -= v3[nz] * sum1;
2545019b515eSShri Abhyankar       sum4 -= v4[nz] * sum1;
2546019b515eSShri Abhyankar       sum3 -= v3[nz + 1] * sum2;
2547019b515eSShri Abhyankar       sum4 -= v4[nz + 1] * sum2;
2548019b515eSShri Abhyankar       sum4 -= v4[nz + 2] * sum3;
2549019b515eSShri Abhyankar 
2550019b515eSShri Abhyankar       tmp[row++] = sum1;
2551019b515eSShri Abhyankar       tmp[row++] = sum2;
2552019b515eSShri Abhyankar       tmp[row++] = sum3;
2553019b515eSShri Abhyankar       tmp[row++] = sum4;
2554019b515eSShri Abhyankar       break;
2555019b515eSShri Abhyankar     case 5:
2556019b515eSShri Abhyankar       sum1 = b[r[row]];
2557019b515eSShri Abhyankar       sum2 = b[r[row + 1]];
2558019b515eSShri Abhyankar       sum3 = b[r[row + 2]];
2559019b515eSShri Abhyankar       sum4 = b[r[row + 3]];
2560019b515eSShri Abhyankar       sum5 = b[r[row + 4]];
2561019b515eSShri Abhyankar       v2   = aa + ai[row + 1];
2562019b515eSShri Abhyankar       v3   = aa + ai[row + 2];
2563019b515eSShri Abhyankar       v4   = aa + ai[row + 3];
2564019b515eSShri Abhyankar       v5   = aa + ai[row + 4];
2565019b515eSShri Abhyankar 
2566019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2567019b515eSShri Abhyankar         i0   = vi[j];
2568019b515eSShri Abhyankar         i1   = vi[j + 1];
2569019b515eSShri Abhyankar         tmp0 = tmps[i0];
2570019b515eSShri Abhyankar         tmp1 = tmps[i1];
2571019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2572019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2573019b515eSShri Abhyankar         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2574019b515eSShri Abhyankar         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2575019b515eSShri Abhyankar         sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1;
2576019b515eSShri Abhyankar       }
2577019b515eSShri Abhyankar       if (j == nz - 1) {
2578019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2579019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2580019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0;
2581019b515eSShri Abhyankar         sum3 -= v3[j] * tmp0;
2582019b515eSShri Abhyankar         sum4 -= v4[j] * tmp0;
2583019b515eSShri Abhyankar         sum5 -= v5[j] * tmp0;
2584019b515eSShri Abhyankar       }
2585019b515eSShri Abhyankar 
      /* couplings inside the node, applied in row order so each sum is final before it is used */
2586019b515eSShri Abhyankar       sum2 -= v2[nz] * sum1;
2587019b515eSShri Abhyankar       sum3 -= v3[nz] * sum1;
2588019b515eSShri Abhyankar       sum4 -= v4[nz] * sum1;
2589019b515eSShri Abhyankar       sum5 -= v5[nz] * sum1;
2590019b515eSShri Abhyankar       sum3 -= v3[nz + 1] * sum2;
2591019b515eSShri Abhyankar       sum4 -= v4[nz + 1] * sum2;
2592019b515eSShri Abhyankar       sum5 -= v5[nz + 1] * sum2;
2593019b515eSShri Abhyankar       sum4 -= v4[nz + 2] * sum3;
2594019b515eSShri Abhyankar       sum5 -= v5[nz + 2] * sum3;
2595019b515eSShri Abhyankar       sum5 -= v5[nz + 3] * sum4;
2596019b515eSShri Abhyankar 
2597019b515eSShri Abhyankar       tmp[row++] = sum1;
2598019b515eSShri Abhyankar       tmp[row++] = sum2;
2599019b515eSShri Abhyankar       tmp[row++] = sum3;
2600019b515eSShri Abhyankar       tmp[row++] = sum4;
2601019b515eSShri Abhyankar       tmp[row++] = sum5;
2602019b515eSShri Abhyankar       break;
2603d71ae5a4SJacob Faibussowitsch     default:
2604d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2605019b515eSShri Abhyankar     }
2606019b515eSShri Abhyankar   }
2607019b515eSShri Abhyankar   /* backward solve the upper triangular */
2608*4d12350bSJunchao Zhang   for (i = node_max - 1; i >= 0; i--) {
    /* row = last row of node i; the rows of a node are processed from last to first */
2609*4d12350bSJunchao Zhang     row = ns[i + 1] - 1;
2610*4d12350bSJunchao Zhang     nsz = ns[i + 1] - ns[i];
    /* the entries used for 'row' start at ad[row + 1] + 1; index nz of them is aa[ad[row]] */
2611019b515eSShri Abhyankar     aii = ad[row + 1] + 1;
2612019b515eSShri Abhyankar     v1  = aa + aii;
2613019b515eSShri Abhyankar     vi  = aj + aii;
2614019b515eSShri Abhyankar     nz  = ad[row] - ad[row + 1] - 1;
261598991853SShri Abhyankar 
261698991853SShri Abhyankar     if (i > 0) {
261798991853SShri Abhyankar       /* Prefetch the indices for the next block */
261850d8bf02SJed Brown       PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
261998991853SShri Abhyankar       /* Prefetch the data for the next block */
      /* NOTE(review): the length ends at ad[ns[i - 1] + 1], which appears to leave out the first row (ns[i - 1]) of
         node i - 1 and is zero when that node has one row; this looks like it only affects the prefetch - confirm
         whether ad[ns[i - 1]] was intended */
2620*4d12350bSJunchao Zhang       PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
262198991853SShri Abhyankar     }
262298991853SShri Abhyankar 
2623019b515eSShri Abhyankar     switch (nsz) { /* Each loop in 'case' is unrolled */
2624019b515eSShri Abhyankar     case 1:
2625019b515eSShri Abhyankar       sum1 = tmp[row];
2626019b515eSShri Abhyankar 
2627019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2628019b515eSShri Abhyankar         i0   = vi[j];
2629019b515eSShri Abhyankar         i1   = vi[j + 1];
2630019b515eSShri Abhyankar         tmp0 = tmps[i0];
2631019b515eSShri Abhyankar         tmp1 = tmps[i1];
2632019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2633019b515eSShri Abhyankar       }
2634019b515eSShri Abhyankar       if (j == nz - 1) {
2635019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2636019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2637019b515eSShri Abhyankar       }
      /* the sum is multiplied (not divided) by v1[nz] - presumably the stored reciprocal of the diagonal, TODO confirm */
26389371c9d4SSatish Balay       x[c[row]] = tmp[row] = sum1 * v1[nz];
26399371c9d4SSatish Balay       row--;
2640019b515eSShri Abhyankar       break;
2641019b515eSShri Abhyankar     case 2:
2642019b515eSShri Abhyankar       sum1 = tmp[row];
2643019b515eSShri Abhyankar       sum2 = tmp[row - 1];
      /* values for row - 1: v2[0] pairs with the solution of 'row', v2[j + 1] pairs with column vi[j] */
2644019b515eSShri Abhyankar       v2   = aa + ad[row] + 1;
2645019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2646019b515eSShri Abhyankar         i0   = vi[j];
2647019b515eSShri Abhyankar         i1   = vi[j + 1];
2648019b515eSShri Abhyankar         tmp0 = tmps[i0];
2649019b515eSShri Abhyankar         tmp1 = tmps[i1];
2650019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2651019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2652019b515eSShri Abhyankar       }
2653019b515eSShri Abhyankar       if (j == nz - 1) {
2654019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2655019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2656019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0;
2657019b515eSShri Abhyankar       }
2658019b515eSShri Abhyankar 
      /* finish the last row first; its solution (tmp0) is then eliminated from the row above */
26599371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
26609371c9d4SSatish Balay       row--;
2661019b515eSShri Abhyankar       sum2 -= v2[0] * tmp0;
26629371c9d4SSatish Balay       x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
26639371c9d4SSatish Balay       row--;
2664019b515eSShri Abhyankar       break;
2665019b515eSShri Abhyankar     case 3:
2666019b515eSShri Abhyankar       sum1 = tmp[row];
2667019b515eSShri Abhyankar       sum2 = tmp[row - 1];
2668019b515eSShri Abhyankar       sum3 = tmp[row - 2];
      /* each earlier row has one more leading entry than the row below it, hence the j + 1, j + 2 offsets */
2669019b515eSShri Abhyankar       v2   = aa + ad[row] + 1;
2670019b515eSShri Abhyankar       v3   = aa + ad[row - 1] + 1;
2671019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2672019b515eSShri Abhyankar         i0   = vi[j];
2673019b515eSShri Abhyankar         i1   = vi[j + 1];
2674019b515eSShri Abhyankar         tmp0 = tmps[i0];
2675019b515eSShri Abhyankar         tmp1 = tmps[i1];
2676019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2677019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2678019b515eSShri Abhyankar         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2679019b515eSShri Abhyankar       }
2680019b515eSShri Abhyankar       if (j == nz - 1) {
2681019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2682019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2683019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0;
2684019b515eSShri Abhyankar         sum3 -= v3[j + 2] * tmp0;
2685019b515eSShri Abhyankar       }
26869371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
26879371c9d4SSatish Balay       row--;
2688019b515eSShri Abhyankar       sum2 -= v2[0] * tmp0;
2689019b515eSShri Abhyankar       sum3 -= v3[1] * tmp0;
26909371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
26919371c9d4SSatish Balay       row--;
2692019b515eSShri Abhyankar       sum3 -= v3[0] * tmp0;
26939371c9d4SSatish Balay       x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
26949371c9d4SSatish Balay       row--;
2695019b515eSShri Abhyankar 
2696019b515eSShri Abhyankar       break;
2697019b515eSShri Abhyankar     case 4:
2698019b515eSShri Abhyankar       sum1 = tmp[row];
2699019b515eSShri Abhyankar       sum2 = tmp[row - 1];
2700019b515eSShri Abhyankar       sum3 = tmp[row - 2];
2701019b515eSShri Abhyankar       sum4 = tmp[row - 3];
2702019b515eSShri Abhyankar       v2   = aa + ad[row] + 1;
2703019b515eSShri Abhyankar       v3   = aa + ad[row - 1] + 1;
2704019b515eSShri Abhyankar       v4   = aa + ad[row - 2] + 1;
2705019b515eSShri Abhyankar 
2706019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2707019b515eSShri Abhyankar         i0   = vi[j];
2708019b515eSShri Abhyankar         i1   = vi[j + 1];
2709019b515eSShri Abhyankar         tmp0 = tmps[i0];
2710019b515eSShri Abhyankar         tmp1 = tmps[i1];
2711019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2712019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2713019b515eSShri Abhyankar         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2714019b515eSShri Abhyankar         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2715019b515eSShri Abhyankar       }
2716019b515eSShri Abhyankar       if (j == nz - 1) {
2717019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2718019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2719019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0;
2720019b515eSShri Abhyankar         sum3 -= v3[j + 2] * tmp0;
2721019b515eSShri Abhyankar         sum4 -= v4[j + 3] * tmp0;
2722019b515eSShri Abhyankar       }
2723019b515eSShri Abhyankar 
27249371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
27259371c9d4SSatish Balay       row--;
2726019b515eSShri Abhyankar       sum2 -= v2[0] * tmp0;
2727019b515eSShri Abhyankar       sum3 -= v3[1] * tmp0;
2728019b515eSShri Abhyankar       sum4 -= v4[2] * tmp0;
27299371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
27309371c9d4SSatish Balay       row--;
2731019b515eSShri Abhyankar       sum3 -= v3[0] * tmp0;
2732019b515eSShri Abhyankar       sum4 -= v4[1] * tmp0;
27339371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
27349371c9d4SSatish Balay       row--;
2735019b515eSShri Abhyankar       sum4 -= v4[0] * tmp0;
27369371c9d4SSatish Balay       x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
27379371c9d4SSatish Balay       row--;
2738019b515eSShri Abhyankar       break;
2739019b515eSShri Abhyankar     case 5:
2740019b515eSShri Abhyankar       sum1 = tmp[row];
2741019b515eSShri Abhyankar       sum2 = tmp[row - 1];
2742019b515eSShri Abhyankar       sum3 = tmp[row - 2];
2743019b515eSShri Abhyankar       sum4 = tmp[row - 3];
2744019b515eSShri Abhyankar       sum5 = tmp[row - 4];
2745019b515eSShri Abhyankar       v2   = aa + ad[row] + 1;
2746019b515eSShri Abhyankar       v3   = aa + ad[row - 1] + 1;
2747019b515eSShri Abhyankar       v4   = aa + ad[row - 2] + 1;
2748019b515eSShri Abhyankar       v5   = aa + ad[row - 3] + 1;
2749019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2750019b515eSShri Abhyankar         i0   = vi[j];
2751019b515eSShri Abhyankar         i1   = vi[j + 1];
2752019b515eSShri Abhyankar         tmp0 = tmps[i0];
2753019b515eSShri Abhyankar         tmp1 = tmps[i1];
2754019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2755019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2756019b515eSShri Abhyankar         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2757019b515eSShri Abhyankar         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2758019b515eSShri Abhyankar         sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1;
2759019b515eSShri Abhyankar       }
2760019b515eSShri Abhyankar       if (j == nz - 1) {
2761019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2762019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2763019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0;
2764019b515eSShri Abhyankar         sum3 -= v3[j + 2] * tmp0;
2765019b515eSShri Abhyankar         sum4 -= v4[j + 3] * tmp0;
2766019b515eSShri Abhyankar         sum5 -= v5[j + 4] * tmp0;
2767019b515eSShri Abhyankar       }
2768019b515eSShri Abhyankar 
27699371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
27709371c9d4SSatish Balay       row--;
2771019b515eSShri Abhyankar       sum2 -= v2[0] * tmp0;
2772019b515eSShri Abhyankar       sum3 -= v3[1] * tmp0;
2773019b515eSShri Abhyankar       sum4 -= v4[2] * tmp0;
2774019b515eSShri Abhyankar       sum5 -= v5[3] * tmp0;
27759371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
27769371c9d4SSatish Balay       row--;
2777019b515eSShri Abhyankar       sum3 -= v3[0] * tmp0;
2778019b515eSShri Abhyankar       sum4 -= v4[1] * tmp0;
2779019b515eSShri Abhyankar       sum5 -= v5[2] * tmp0;
27809371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
27819371c9d4SSatish Balay       row--;
2782019b515eSShri Abhyankar       sum4 -= v4[0] * tmp0;
2783019b515eSShri Abhyankar       sum5 -= v5[1] * tmp0;
27849371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
27859371c9d4SSatish Balay       row--;
2786019b515eSShri Abhyankar       sum5 -= v5[0] * tmp0;
27879371c9d4SSatish Balay       x[c[row]] = tmp[row] = sum5 * v5[nz + 4];
27889371c9d4SSatish Balay       row--;
2789019b515eSShri Abhyankar       break;
2790d71ae5a4SJacob Faibussowitsch     default:
2791d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2792019b515eSShri Abhyankar     }
2793019b515eSShri Abhyankar   }
27949566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
27959566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
27969566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
27979566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(xx, &x));
  /* logged flop count: 2 * a->nz minus the number of columns of A */
27989566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
27993ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
2800019b515eSShri Abhyankar }
2801019b515eSShri Abhyankar 
28024c1414c8SBarry Smith /*
28034c1414c8SBarry Smith      Makes a longer coloring[] array and calls the usual code with that
28044c1414c8SBarry Smith */
280566976f2fSJacob Faibussowitsch static PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring)
2806d71ae5a4SJacob Faibussowitsch {
28074c1414c8SBarry Smith   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)mat->data;
2808*4d12350bSJunchao Zhang   PetscInt         n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size_csr, row;
28094c1414c8SBarry Smith   PetscInt        *colorused, i;
28104c1414c8SBarry Smith   ISColoringValue *newcolor;
28114c1414c8SBarry Smith 
28124c1414c8SBarry Smith   PetscFunctionBegin;
2813*4d12350bSJunchao Zhang   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
28149566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(n + 1, &newcolor));
28154c1414c8SBarry Smith   /* loop over inodes, marking a color for each column*/
28164c1414c8SBarry Smith   row = 0;
28174c1414c8SBarry Smith   for (i = 0; i < m; i++) {
2818*4d12350bSJunchao Zhang     for (j = 0; j < (ns[i + 1] - ns[i]); j++) PetscCall(ISColoringValueCast(coloring[i] + j * ncolors, newcolor + row++));
28194c1414c8SBarry Smith   }
28204c1414c8SBarry Smith 
28214c1414c8SBarry Smith   /* eliminate unneeded colors */
28229566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(5 * ncolors, &colorused));
2823ad540459SPierre Jolivet   for (i = 0; i < n; i++) colorused[newcolor[i]] = 1;
28244c1414c8SBarry Smith 
2825ad540459SPierre Jolivet   for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1];
28264c1414c8SBarry Smith   ncolors = colorused[5 * ncolors - 1];
28276497c311SBarry Smith   for (i = 0; i < n; i++) PetscCall(ISColoringValueCast(colorused[newcolor[i]] - 1, newcolor + i));
28289566063dSJacob Faibussowitsch   PetscCall(PetscFree(colorused));
28299566063dSJacob Faibussowitsch   PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring));
28309566063dSJacob Faibussowitsch   PetscCall(PetscFree(coloring));
28313ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
28324c1414c8SBarry Smith }
28334c1414c8SBarry Smith 
2834af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h>
28352af78befSBarry Smith 
2836d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
2837d71ae5a4SJacob Faibussowitsch {
28382af78befSBarry Smith   Mat_SeqAIJ        *a    = (Mat_SeqAIJ *)A->data;
28397aaeff0aSMatthew G. Knepley   PetscScalar        sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3;
28405850ef23SBarry Smith   MatScalar         *ibdiag, *bdiag, work[25], *t;
2841a8b09249SBarry Smith   PetscScalar       *x, tmp4, tmp5, x1, x2, x3, x4, x5;
28427aaeff0aSMatthew G. Knepley   const MatScalar   *v = a->a, *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL;
28435850ef23SBarry Smith   const PetscScalar *xb, *b;
28447b6c816cSBarry Smith   PetscReal          zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0;
2845*4d12350bSJunchao Zhang   PetscInt           n, m = a->inode.node_count, cnt = 0, i, j, row, i1, i2, nodesz;
28468758e1faSBarry Smith   PetscInt           sz, k, ipvt[5];
28477b6c816cSBarry Smith   PetscBool          allowzeropivot, zeropivotdetected;
2848*4d12350bSJunchao Zhang   const PetscInt    *sizes = a->inode.size_csr, *idx, *diag = a->diag, *ii = a->i;
28492af78befSBarry Smith 
28502af78befSBarry Smith   PetscFunctionBegin;
2851a455e926SHong Zhang   allowzeropivot = PetscNot(A->erroriffailure);
2852*4d12350bSJunchao Zhang   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
285308401ef6SPierre Jolivet   PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode");
285408401ef6SPierre Jolivet   PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode");
28552af78befSBarry Smith 
285671f1c65dSBarry Smith   if (!a->inode.ibdiagvalid) {
28572af78befSBarry Smith     if (!a->inode.ibdiag) {
28582af78befSBarry Smith       /* calculate space needed for diagonal blocks */
2859*4d12350bSJunchao Zhang       for (i = 0; i < m; i++) {
2860*4d12350bSJunchao Zhang         nodesz = sizes[i + 1] - sizes[i];
2861*4d12350bSJunchao Zhang         cnt += nodesz * nodesz;
2862*4d12350bSJunchao Zhang       }
2863f0d39aaaSBarry Smith       a->inode.bdiagsize = cnt;
28642205254eSKarl Rupp 
28659566063dSJacob Faibussowitsch       PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work));
286671f1c65dSBarry Smith     }
286771f1c65dSBarry Smith 
286871f1c65dSBarry Smith     /* copy over the diagonal blocks and invert them */
28692af78befSBarry Smith     ibdiag = a->inode.ibdiag;
28702af78befSBarry Smith     bdiag  = a->inode.bdiag;
28712af78befSBarry Smith     cnt    = 0;
28722af78befSBarry Smith     for (i = 0, row = 0; i < m; i++) {
2873*4d12350bSJunchao Zhang       nodesz = sizes[i + 1] - sizes[i];
2874*4d12350bSJunchao Zhang       for (j = 0; j < nodesz; j++) {
2875*4d12350bSJunchao Zhang         for (k = 0; k < nodesz; k++) bdiag[cnt + k * nodesz + j] = v[diag[row + j] - j + k];
28762af78befSBarry Smith       }
2877*4d12350bSJunchao Zhang       PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, nodesz * nodesz));
28782af78befSBarry Smith 
2879*4d12350bSJunchao Zhang       switch (nodesz) {
28802af78befSBarry Smith       case 1:
28812af78befSBarry Smith         /* Create matrix data structure */
28828e0e2a9aSHong Zhang         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
28838e0e2a9aSHong Zhang           if (allowzeropivot) {
28847b6c816cSBarry Smith             A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
28857b6c816cSBarry Smith             A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
28867b6c816cSBarry Smith             A->factorerror_zeropivot_row   = row;
28879566063dSJacob Faibussowitsch             PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row));
288898921bdaSJacob Faibussowitsch           } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row);
28898e0e2a9aSHong Zhang         }
289064c62002SMatthew Knepley         ibdiag[cnt] = 1.0 / ibdiag[cnt];
28912af78befSBarry Smith         break;
28922af78befSBarry Smith       case 2:
28939566063dSJacob Faibussowitsch         PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
28947b6c816cSBarry Smith         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
28952af78befSBarry Smith         break;
28962af78befSBarry Smith       case 3:
28979566063dSJacob Faibussowitsch         PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
28987b6c816cSBarry Smith         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
28992af78befSBarry Smith         break;
29002af78befSBarry Smith       case 4:
29019566063dSJacob Faibussowitsch         PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
29027b6c816cSBarry Smith         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
29032af78befSBarry Smith         break;
29042af78befSBarry Smith       case 5:
29059566063dSJacob Faibussowitsch         PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
29067b6c816cSBarry Smith         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
29072af78befSBarry Smith         break;
2908d71ae5a4SJacob Faibussowitsch       default:
2909*4d12350bSJunchao Zhang         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
29102af78befSBarry Smith       }
2911*4d12350bSJunchao Zhang       cnt += nodesz * nodesz;
2912*4d12350bSJunchao Zhang       row += nodesz;
29132af78befSBarry Smith     }
291471f1c65dSBarry Smith     a->inode.ibdiagvalid = PETSC_TRUE;
29152af78befSBarry Smith   }
29162af78befSBarry Smith   ibdiag = a->inode.ibdiag;
29172af78befSBarry Smith   bdiag  = a->inode.bdiag;
29185850ef23SBarry Smith   t      = a->inode.ssor_work;
29192af78befSBarry Smith 
29209566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
29219566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
29225850ef23SBarry Smith   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
29235850ef23SBarry Smith   if (flag & SOR_ZERO_INITIAL_GUESS) {
29242af78befSBarry Smith     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
29258862d2efSBarry Smith       for (i = 0, row = 0; i < m; i++) {
29268862d2efSBarry Smith         sz  = diag[row] - ii[row];
29278862d2efSBarry Smith         v1  = a->a + ii[row];
29288862d2efSBarry Smith         idx = a->j + ii[row];
29298862d2efSBarry Smith 
29304108e4d5SBarry Smith         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
2931*4d12350bSJunchao Zhang         nodesz = sizes[i + 1] - sizes[i];
2932*4d12350bSJunchao Zhang         switch (nodesz) {
29338862d2efSBarry Smith         case 1:
29348862d2efSBarry Smith 
29358862d2efSBarry Smith           sum1 = b[row];
29368862d2efSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
29378862d2efSBarry Smith             i1 = idx[0];
29388862d2efSBarry Smith             i2 = idx[1];
29398862d2efSBarry Smith             idx += 2;
29408862d2efSBarry Smith             tmp0 = x[i1];
29418862d2efSBarry Smith             tmp1 = x[i2];
29429371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
29439371c9d4SSatish Balay             v1 += 2;
29448862d2efSBarry Smith           }
29458862d2efSBarry Smith 
29468862d2efSBarry Smith           if (n == sz - 1) {
2947f0d39aaaSBarry Smith             tmp0 = x[*idx];
2948f0d39aaaSBarry Smith             sum1 -= *v1 * tmp0;
29498862d2efSBarry Smith           }
29505850ef23SBarry Smith           t[row]   = sum1;
29518862d2efSBarry Smith           x[row++] = sum1 * (*ibdiag++);
29528862d2efSBarry Smith           break;
2953f0d39aaaSBarry Smith         case 2:
2954f0d39aaaSBarry Smith           v2   = a->a + ii[row + 1];
2955f0d39aaaSBarry Smith           sum1 = b[row];
2956f0d39aaaSBarry Smith           sum2 = b[row + 1];
2957f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
2958f0d39aaaSBarry Smith             i1 = idx[0];
2959f0d39aaaSBarry Smith             i2 = idx[1];
2960f0d39aaaSBarry Smith             idx += 2;
2961f0d39aaaSBarry Smith             tmp0 = x[i1];
2962f0d39aaaSBarry Smith             tmp1 = x[i2];
29639371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
29649371c9d4SSatish Balay             v1 += 2;
29659371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
29669371c9d4SSatish Balay             v2 += 2;
2967f0d39aaaSBarry Smith           }
2968f0d39aaaSBarry Smith 
2969f0d39aaaSBarry Smith           if (n == sz - 1) {
2970f0d39aaaSBarry Smith             tmp0 = x[*idx];
2971f0d39aaaSBarry Smith             sum1 -= v1[0] * tmp0;
2972f0d39aaaSBarry Smith             sum2 -= v2[0] * tmp0;
2973f0d39aaaSBarry Smith           }
29745850ef23SBarry Smith           t[row]     = sum1;
29755850ef23SBarry Smith           t[row + 1] = sum2;
2976f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[2];
2977f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[3];
2978f0d39aaaSBarry Smith           ibdiag += 4;
2979f0d39aaaSBarry Smith           break;
2980f0d39aaaSBarry Smith         case 3:
2981f0d39aaaSBarry Smith           v2   = a->a + ii[row + 1];
2982f0d39aaaSBarry Smith           v3   = a->a + ii[row + 2];
2983f0d39aaaSBarry Smith           sum1 = b[row];
2984f0d39aaaSBarry Smith           sum2 = b[row + 1];
2985f0d39aaaSBarry Smith           sum3 = b[row + 2];
2986f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
2987f0d39aaaSBarry Smith             i1 = idx[0];
2988f0d39aaaSBarry Smith             i2 = idx[1];
2989f0d39aaaSBarry Smith             idx += 2;
2990f0d39aaaSBarry Smith             tmp0 = x[i1];
2991f0d39aaaSBarry Smith             tmp1 = x[i2];
29929371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
29939371c9d4SSatish Balay             v1 += 2;
29949371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
29959371c9d4SSatish Balay             v2 += 2;
29969371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
29979371c9d4SSatish Balay             v3 += 2;
2998f0d39aaaSBarry Smith           }
2999f0d39aaaSBarry Smith 
3000f0d39aaaSBarry Smith           if (n == sz - 1) {
3001f0d39aaaSBarry Smith             tmp0 = x[*idx];
3002f0d39aaaSBarry Smith             sum1 -= v1[0] * tmp0;
3003f0d39aaaSBarry Smith             sum2 -= v2[0] * tmp0;
3004f0d39aaaSBarry Smith             sum3 -= v3[0] * tmp0;
3005f0d39aaaSBarry Smith           }
30065850ef23SBarry Smith           t[row]     = sum1;
30075850ef23SBarry Smith           t[row + 1] = sum2;
30085850ef23SBarry Smith           t[row + 2] = sum3;
3009f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3010f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3011f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
3012f0d39aaaSBarry Smith           ibdiag += 9;
3013f0d39aaaSBarry Smith           break;
3014f0d39aaaSBarry Smith         case 4:
3015f0d39aaaSBarry Smith           v2   = a->a + ii[row + 1];
3016f0d39aaaSBarry Smith           v3   = a->a + ii[row + 2];
3017f0d39aaaSBarry Smith           v4   = a->a + ii[row + 3];
3018f0d39aaaSBarry Smith           sum1 = b[row];
3019f0d39aaaSBarry Smith           sum2 = b[row + 1];
3020f0d39aaaSBarry Smith           sum3 = b[row + 2];
3021f0d39aaaSBarry Smith           sum4 = b[row + 3];
3022f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
3023f0d39aaaSBarry Smith             i1 = idx[0];
3024f0d39aaaSBarry Smith             i2 = idx[1];
3025f0d39aaaSBarry Smith             idx += 2;
3026f0d39aaaSBarry Smith             tmp0 = x[i1];
3027f0d39aaaSBarry Smith             tmp1 = x[i2];
30289371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
30299371c9d4SSatish Balay             v1 += 2;
30309371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
30319371c9d4SSatish Balay             v2 += 2;
30329371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
30339371c9d4SSatish Balay             v3 += 2;
30349371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
30359371c9d4SSatish Balay             v4 += 2;
3036f0d39aaaSBarry Smith           }
3037f0d39aaaSBarry Smith 
3038f0d39aaaSBarry Smith           if (n == sz - 1) {
3039f0d39aaaSBarry Smith             tmp0 = x[*idx];
3040f0d39aaaSBarry Smith             sum1 -= v1[0] * tmp0;
3041f0d39aaaSBarry Smith             sum2 -= v2[0] * tmp0;
3042f0d39aaaSBarry Smith             sum3 -= v3[0] * tmp0;
3043f0d39aaaSBarry Smith             sum4 -= v4[0] * tmp0;
3044f0d39aaaSBarry Smith           }
30455850ef23SBarry Smith           t[row]     = sum1;
30465850ef23SBarry Smith           t[row + 1] = sum2;
30475850ef23SBarry Smith           t[row + 2] = sum3;
30485850ef23SBarry Smith           t[row + 3] = sum4;
3049f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3050f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3051f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3052f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3053f0d39aaaSBarry Smith           ibdiag += 16;
3054f0d39aaaSBarry Smith           break;
3055f0d39aaaSBarry Smith         case 5:
3056f0d39aaaSBarry Smith           v2   = a->a + ii[row + 1];
3057f0d39aaaSBarry Smith           v3   = a->a + ii[row + 2];
3058f0d39aaaSBarry Smith           v4   = a->a + ii[row + 3];
3059f0d39aaaSBarry Smith           v5   = a->a + ii[row + 4];
3060f0d39aaaSBarry Smith           sum1 = b[row];
3061f0d39aaaSBarry Smith           sum2 = b[row + 1];
3062f0d39aaaSBarry Smith           sum3 = b[row + 2];
3063f0d39aaaSBarry Smith           sum4 = b[row + 3];
3064f0d39aaaSBarry Smith           sum5 = b[row + 4];
3065f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
3066f0d39aaaSBarry Smith             i1 = idx[0];
3067f0d39aaaSBarry Smith             i2 = idx[1];
3068f0d39aaaSBarry Smith             idx += 2;
3069f0d39aaaSBarry Smith             tmp0 = x[i1];
3070f0d39aaaSBarry Smith             tmp1 = x[i2];
30719371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
30729371c9d4SSatish Balay             v1 += 2;
30739371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
30749371c9d4SSatish Balay             v2 += 2;
30759371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
30769371c9d4SSatish Balay             v3 += 2;
30779371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
30789371c9d4SSatish Balay             v4 += 2;
30799371c9d4SSatish Balay             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
30809371c9d4SSatish Balay             v5 += 2;
3081f0d39aaaSBarry Smith           }
3082f0d39aaaSBarry Smith 
3083f0d39aaaSBarry Smith           if (n == sz - 1) {
3084f0d39aaaSBarry Smith             tmp0 = x[*idx];
3085f0d39aaaSBarry Smith             sum1 -= v1[0] * tmp0;
3086f0d39aaaSBarry Smith             sum2 -= v2[0] * tmp0;
3087f0d39aaaSBarry Smith             sum3 -= v3[0] * tmp0;
3088f0d39aaaSBarry Smith             sum4 -= v4[0] * tmp0;
3089f0d39aaaSBarry Smith             sum5 -= v5[0] * tmp0;
3090f0d39aaaSBarry Smith           }
30915850ef23SBarry Smith           t[row]     = sum1;
30925850ef23SBarry Smith           t[row + 1] = sum2;
30935850ef23SBarry Smith           t[row + 2] = sum3;
30945850ef23SBarry Smith           t[row + 3] = sum4;
30955850ef23SBarry Smith           t[row + 4] = sum5;
3096f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3097f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3098f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3099f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3100f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3101f0d39aaaSBarry Smith           ibdiag += 25;
3102f0d39aaaSBarry Smith           break;
3103d71ae5a4SJacob Faibussowitsch         default:
3104*4d12350bSJunchao Zhang           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
31058862d2efSBarry Smith         }
31062af78befSBarry Smith       }
31072af78befSBarry Smith 
31085850ef23SBarry Smith       xb = t;
31099566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(a->nz));
31102af78befSBarry Smith     } else xb = b;
31112af78befSBarry Smith     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3112f0d39aaaSBarry Smith       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3113d0f46423SBarry Smith       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3114*4d12350bSJunchao Zhang         nodesz = sizes[i + 1] - sizes[i];
3115*4d12350bSJunchao Zhang         ibdiag -= nodesz * nodesz;
31168862d2efSBarry Smith         sz  = ii[row + 1] - diag[row] - 1;
31178862d2efSBarry Smith         v1  = a->a + diag[row] + 1;
31188862d2efSBarry Smith         idx = a->j + diag[row] + 1;
31192af78befSBarry Smith 
31204108e4d5SBarry Smith         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3121*4d12350bSJunchao Zhang         switch (nodesz) {
31228862d2efSBarry Smith         case 1:
31238862d2efSBarry Smith 
31248862d2efSBarry Smith           sum1 = xb[row];
31258862d2efSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
31268862d2efSBarry Smith             i1 = idx[0];
31278862d2efSBarry Smith             i2 = idx[1];
31288862d2efSBarry Smith             idx += 2;
31298862d2efSBarry Smith             tmp0 = x[i1];
31308862d2efSBarry Smith             tmp1 = x[i2];
31319371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
31329371c9d4SSatish Balay             v1 += 2;
31338862d2efSBarry Smith           }
31348862d2efSBarry Smith 
31358862d2efSBarry Smith           if (n == sz - 1) {
3136f0d39aaaSBarry Smith             tmp0 = x[*idx];
3137f0d39aaaSBarry Smith             sum1 -= *v1 * tmp0;
31388862d2efSBarry Smith           }
3139f0d39aaaSBarry Smith           x[row--] = sum1 * (*ibdiag);
3140f0d39aaaSBarry Smith           break;
3141f0d39aaaSBarry Smith 
3142f0d39aaaSBarry Smith         case 2:
3143f0d39aaaSBarry Smith 
3144f0d39aaaSBarry Smith           sum1 = xb[row];
3145f0d39aaaSBarry Smith           sum2 = xb[row - 1];
3146f0d39aaaSBarry Smith           /* note that sum1 is associated with the second of the two rows */
3147f0d39aaaSBarry Smith           v2 = a->a + diag[row - 1] + 2;
3148f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
3149f0d39aaaSBarry Smith             i1 = idx[0];
3150f0d39aaaSBarry Smith             i2 = idx[1];
3151f0d39aaaSBarry Smith             idx += 2;
3152f0d39aaaSBarry Smith             tmp0 = x[i1];
3153f0d39aaaSBarry Smith             tmp1 = x[i2];
31549371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
31559371c9d4SSatish Balay             v1 += 2;
31569371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
31579371c9d4SSatish Balay             v2 += 2;
3158f0d39aaaSBarry Smith           }
3159f0d39aaaSBarry Smith 
3160f0d39aaaSBarry Smith           if (n == sz - 1) {
3161f0d39aaaSBarry Smith             tmp0 = x[*idx];
3162f0d39aaaSBarry Smith             sum1 -= *v1 * tmp0;
3163f0d39aaaSBarry Smith             sum2 -= *v2 * tmp0;
3164f0d39aaaSBarry Smith           }
3165f0d39aaaSBarry Smith           x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3166f0d39aaaSBarry Smith           x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3167f0d39aaaSBarry Smith           break;
3168f0d39aaaSBarry Smith         case 3:
3169f0d39aaaSBarry Smith 
3170f0d39aaaSBarry Smith           sum1 = xb[row];
3171f0d39aaaSBarry Smith           sum2 = xb[row - 1];
3172f0d39aaaSBarry Smith           sum3 = xb[row - 2];
3173f0d39aaaSBarry Smith           v2   = a->a + diag[row - 1] + 2;
3174f0d39aaaSBarry Smith           v3   = a->a + diag[row - 2] + 3;
3175f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
3176f0d39aaaSBarry Smith             i1 = idx[0];
3177f0d39aaaSBarry Smith             i2 = idx[1];
3178f0d39aaaSBarry Smith             idx += 2;
3179f0d39aaaSBarry Smith             tmp0 = x[i1];
3180f0d39aaaSBarry Smith             tmp1 = x[i2];
31819371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
31829371c9d4SSatish Balay             v1 += 2;
31839371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
31849371c9d4SSatish Balay             v2 += 2;
31859371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
31869371c9d4SSatish Balay             v3 += 2;
3187f0d39aaaSBarry Smith           }
3188f0d39aaaSBarry Smith 
3189f0d39aaaSBarry Smith           if (n == sz - 1) {
3190f0d39aaaSBarry Smith             tmp0 = x[*idx];
3191f0d39aaaSBarry Smith             sum1 -= *v1 * tmp0;
3192f0d39aaaSBarry Smith             sum2 -= *v2 * tmp0;
3193f0d39aaaSBarry Smith             sum3 -= *v3 * tmp0;
3194f0d39aaaSBarry Smith           }
3195f0d39aaaSBarry Smith           x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3196f0d39aaaSBarry Smith           x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3197f0d39aaaSBarry Smith           x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3198f0d39aaaSBarry Smith           break;
3199f0d39aaaSBarry Smith         case 4:
3200f0d39aaaSBarry Smith 
3201f0d39aaaSBarry Smith           sum1 = xb[row];
3202f0d39aaaSBarry Smith           sum2 = xb[row - 1];
3203f0d39aaaSBarry Smith           sum3 = xb[row - 2];
3204f0d39aaaSBarry Smith           sum4 = xb[row - 3];
3205f0d39aaaSBarry Smith           v2   = a->a + diag[row - 1] + 2;
3206f0d39aaaSBarry Smith           v3   = a->a + diag[row - 2] + 3;
3207f0d39aaaSBarry Smith           v4   = a->a + diag[row - 3] + 4;
3208f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
3209f0d39aaaSBarry Smith             i1 = idx[0];
3210f0d39aaaSBarry Smith             i2 = idx[1];
3211f0d39aaaSBarry Smith             idx += 2;
3212f0d39aaaSBarry Smith             tmp0 = x[i1];
3213f0d39aaaSBarry Smith             tmp1 = x[i2];
32149371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
32159371c9d4SSatish Balay             v1 += 2;
32169371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
32179371c9d4SSatish Balay             v2 += 2;
32189371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
32199371c9d4SSatish Balay             v3 += 2;
32209371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
32219371c9d4SSatish Balay             v4 += 2;
3222f0d39aaaSBarry Smith           }
3223f0d39aaaSBarry Smith 
3224f0d39aaaSBarry Smith           if (n == sz - 1) {
3225f0d39aaaSBarry Smith             tmp0 = x[*idx];
3226f0d39aaaSBarry Smith             sum1 -= *v1 * tmp0;
3227f0d39aaaSBarry Smith             sum2 -= *v2 * tmp0;
3228f0d39aaaSBarry Smith             sum3 -= *v3 * tmp0;
3229f0d39aaaSBarry Smith             sum4 -= *v4 * tmp0;
3230f0d39aaaSBarry Smith           }
3231f0d39aaaSBarry Smith           x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3232f0d39aaaSBarry Smith           x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3233f0d39aaaSBarry Smith           x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3234f0d39aaaSBarry Smith           x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3235f0d39aaaSBarry Smith           break;
3236f0d39aaaSBarry Smith         case 5:
3237f0d39aaaSBarry Smith 
3238f0d39aaaSBarry Smith           sum1 = xb[row];
3239f0d39aaaSBarry Smith           sum2 = xb[row - 1];
3240f0d39aaaSBarry Smith           sum3 = xb[row - 2];
3241f0d39aaaSBarry Smith           sum4 = xb[row - 3];
3242f0d39aaaSBarry Smith           sum5 = xb[row - 4];
3243f0d39aaaSBarry Smith           v2   = a->a + diag[row - 1] + 2;
3244f0d39aaaSBarry Smith           v3   = a->a + diag[row - 2] + 3;
3245f0d39aaaSBarry Smith           v4   = a->a + diag[row - 3] + 4;
3246f0d39aaaSBarry Smith           v5   = a->a + diag[row - 4] + 5;
3247f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
3248f0d39aaaSBarry Smith             i1 = idx[0];
3249f0d39aaaSBarry Smith             i2 = idx[1];
3250f0d39aaaSBarry Smith             idx += 2;
3251f0d39aaaSBarry Smith             tmp0 = x[i1];
3252f0d39aaaSBarry Smith             tmp1 = x[i2];
32539371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
32549371c9d4SSatish Balay             v1 += 2;
32559371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
32569371c9d4SSatish Balay             v2 += 2;
32579371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
32589371c9d4SSatish Balay             v3 += 2;
32599371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
32609371c9d4SSatish Balay             v4 += 2;
32619371c9d4SSatish Balay             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
32629371c9d4SSatish Balay             v5 += 2;
3263f0d39aaaSBarry Smith           }
3264f0d39aaaSBarry Smith 
3265f0d39aaaSBarry Smith           if (n == sz - 1) {
3266f0d39aaaSBarry Smith             tmp0 = x[*idx];
3267f0d39aaaSBarry Smith             sum1 -= *v1 * tmp0;
3268f0d39aaaSBarry Smith             sum2 -= *v2 * tmp0;
3269f0d39aaaSBarry Smith             sum3 -= *v3 * tmp0;
3270f0d39aaaSBarry Smith             sum4 -= *v4 * tmp0;
3271f0d39aaaSBarry Smith             sum5 -= *v5 * tmp0;
3272f0d39aaaSBarry Smith           }
3273f0d39aaaSBarry Smith           x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3274f0d39aaaSBarry Smith           x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3275f0d39aaaSBarry Smith           x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3276f0d39aaaSBarry Smith           x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3277f0d39aaaSBarry Smith           x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
32788862d2efSBarry Smith           break;
3279d71ae5a4SJacob Faibussowitsch         default:
3280*4d12350bSJunchao Zhang           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
32818862d2efSBarry Smith         }
32822af78befSBarry Smith       }
32832af78befSBarry Smith 
32849566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(a->nz));
32852af78befSBarry Smith     }
32862af78befSBarry Smith     its--;
32875850ef23SBarry Smith   }
32885850ef23SBarry Smith   while (its--) {
32895850ef23SBarry Smith     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
3290*4d12350bSJunchao Zhang       for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += nodesz, ibdiag += nodesz * nodesz, i++) {
3291*4d12350bSJunchao Zhang         nodesz = sizes[i + 1] - sizes[i];
3292d876e2b0SMark Adams         sz     = diag[row] - ii[row];
32935850ef23SBarry Smith         v1     = a->a + ii[row];
32945850ef23SBarry Smith         idx    = a->j + ii[row];
32955850ef23SBarry Smith         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3296*4d12350bSJunchao Zhang         switch (nodesz) {
32975850ef23SBarry Smith         case 1:
32985850ef23SBarry Smith           sum1 = b[row];
32995850ef23SBarry Smith           for (n = 0; n < sz - 1; n += 2) {
33005850ef23SBarry Smith             i1 = idx[0];
33015850ef23SBarry Smith             i2 = idx[1];
33025850ef23SBarry Smith             idx += 2;
33035850ef23SBarry Smith             tmp0 = x[i1];
33045850ef23SBarry Smith             tmp1 = x[i2];
33059371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
33069371c9d4SSatish Balay             v1 += 2;
33075850ef23SBarry Smith           }
33085850ef23SBarry Smith           if (n == sz - 1) {
3309d876e2b0SMark Adams             tmp0 = x[*idx++];
3310d876e2b0SMark Adams             sum1 -= *v1 * tmp0;
3311d876e2b0SMark Adams             v1++;
3312d876e2b0SMark Adams           }
3313d876e2b0SMark Adams           t[row] = sum1;
3314d876e2b0SMark Adams           sz     = ii[row + 1] - diag[row] - 1;
3315d876e2b0SMark Adams           idx    = a->j + diag[row] + 1;
3316d876e2b0SMark Adams           v1 += 1;
3317d876e2b0SMark Adams           for (n = 0; n < sz - 1; n += 2) {
3318d876e2b0SMark Adams             i1 = idx[0];
3319d876e2b0SMark Adams             i2 = idx[1];
3320d876e2b0SMark Adams             idx += 2;
3321d876e2b0SMark Adams             tmp0 = x[i1];
3322d876e2b0SMark Adams             tmp1 = x[i2];
33239371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
33249371c9d4SSatish Balay             v1 += 2;
3325d876e2b0SMark Adams           }
3326d876e2b0SMark Adams           if (n == sz - 1) {
3327d876e2b0SMark Adams             tmp0 = x[*idx++];
33285850ef23SBarry Smith             sum1 -= *v1 * tmp0;
33295850ef23SBarry Smith           }
33305850ef23SBarry Smith           /* in MatSOR_SeqAIJ this line would be
33315850ef23SBarry Smith            *
33325850ef23SBarry Smith            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
33335850ef23SBarry Smith            *
33345850ef23SBarry Smith            * but omega == 1, so this becomes
33355850ef23SBarry Smith            *
3336d876e2b0SMark Adams            * x[row] = sum1*(*ibdiag++);
33375850ef23SBarry Smith            *
33385850ef23SBarry Smith            */
3339d876e2b0SMark Adams           x[row] = sum1 * (*ibdiag);
33405850ef23SBarry Smith           break;
33415850ef23SBarry Smith         case 2:
33425850ef23SBarry Smith           v2   = a->a + ii[row + 1];
33435850ef23SBarry Smith           sum1 = b[row];
33445850ef23SBarry Smith           sum2 = b[row + 1];
33455850ef23SBarry Smith           for (n = 0; n < sz - 1; n += 2) {
33465850ef23SBarry Smith             i1 = idx[0];
33475850ef23SBarry Smith             i2 = idx[1];
33485850ef23SBarry Smith             idx += 2;
33495850ef23SBarry Smith             tmp0 = x[i1];
33505850ef23SBarry Smith             tmp1 = x[i2];
33519371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
33529371c9d4SSatish Balay             v1 += 2;
33539371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
33549371c9d4SSatish Balay             v2 += 2;
33555850ef23SBarry Smith           }
3356d876e2b0SMark Adams           if (n == sz - 1) {
3357d876e2b0SMark Adams             tmp0 = x[*idx++];
3358d876e2b0SMark Adams             sum1 -= v1[0] * tmp0;
3359d876e2b0SMark Adams             sum2 -= v2[0] * tmp0;
33609371c9d4SSatish Balay             v1++;
33619371c9d4SSatish Balay             v2++;
3362d876e2b0SMark Adams           }
3363d876e2b0SMark Adams           t[row]     = sum1;
3364d876e2b0SMark Adams           t[row + 1] = sum2;
3365d876e2b0SMark Adams           sz         = ii[row + 1] - diag[row] - 2;
3366d876e2b0SMark Adams           idx        = a->j + diag[row] + 2;
3367d876e2b0SMark Adams           v1 += 2;
3368d876e2b0SMark Adams           v2 += 2;
3369d876e2b0SMark Adams           for (n = 0; n < sz - 1; n += 2) {
3370d876e2b0SMark Adams             i1 = idx[0];
3371d876e2b0SMark Adams             i2 = idx[1];
3372d876e2b0SMark Adams             idx += 2;
3373d876e2b0SMark Adams             tmp0 = x[i1];
3374d876e2b0SMark Adams             tmp1 = x[i2];
33759371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
33769371c9d4SSatish Balay             v1 += 2;
33779371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
33789371c9d4SSatish Balay             v2 += 2;
3379d876e2b0SMark Adams           }
33805850ef23SBarry Smith           if (n == sz - 1) {
33815850ef23SBarry Smith             tmp0 = x[*idx];
33825850ef23SBarry Smith             sum1 -= v1[0] * tmp0;
33835850ef23SBarry Smith             sum2 -= v2[0] * tmp0;
33845850ef23SBarry Smith           }
3385d876e2b0SMark Adams           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[2];
3386d876e2b0SMark Adams           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
33875850ef23SBarry Smith           break;
33885850ef23SBarry Smith         case 3:
33895850ef23SBarry Smith           v2   = a->a + ii[row + 1];
33905850ef23SBarry Smith           v3   = a->a + ii[row + 2];
33915850ef23SBarry Smith           sum1 = b[row];
33925850ef23SBarry Smith           sum2 = b[row + 1];
33935850ef23SBarry Smith           sum3 = b[row + 2];
33945850ef23SBarry Smith           for (n = 0; n < sz - 1; n += 2) {
33955850ef23SBarry Smith             i1 = idx[0];
33965850ef23SBarry Smith             i2 = idx[1];
33975850ef23SBarry Smith             idx += 2;
33985850ef23SBarry Smith             tmp0 = x[i1];
33995850ef23SBarry Smith             tmp1 = x[i2];
34009371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
34019371c9d4SSatish Balay             v1 += 2;
34029371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
34039371c9d4SSatish Balay             v2 += 2;
34049371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
34059371c9d4SSatish Balay             v3 += 2;
34065850ef23SBarry Smith           }
3407d876e2b0SMark Adams           if (n == sz - 1) {
3408d876e2b0SMark Adams             tmp0 = x[*idx++];
3409d876e2b0SMark Adams             sum1 -= v1[0] * tmp0;
3410d876e2b0SMark Adams             sum2 -= v2[0] * tmp0;
3411d876e2b0SMark Adams             sum3 -= v3[0] * tmp0;
34129371c9d4SSatish Balay             v1++;
34139371c9d4SSatish Balay             v2++;
34149371c9d4SSatish Balay             v3++;
3415d876e2b0SMark Adams           }
3416d876e2b0SMark Adams           t[row]     = sum1;
3417d876e2b0SMark Adams           t[row + 1] = sum2;
3418d876e2b0SMark Adams           t[row + 2] = sum3;
3419d876e2b0SMark Adams           sz         = ii[row + 1] - diag[row] - 3;
3420d876e2b0SMark Adams           idx        = a->j + diag[row] + 3;
3421d876e2b0SMark Adams           v1 += 3;
3422d876e2b0SMark Adams           v2 += 3;
3423d876e2b0SMark Adams           v3 += 3;
3424d876e2b0SMark Adams           for (n = 0; n < sz - 1; n += 2) {
3425d876e2b0SMark Adams             i1 = idx[0];
3426d876e2b0SMark Adams             i2 = idx[1];
3427d876e2b0SMark Adams             idx += 2;
3428d876e2b0SMark Adams             tmp0 = x[i1];
3429d876e2b0SMark Adams             tmp1 = x[i2];
34309371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
34319371c9d4SSatish Balay             v1 += 2;
34329371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
34339371c9d4SSatish Balay             v2 += 2;
34349371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
34359371c9d4SSatish Balay             v3 += 2;
3436d876e2b0SMark Adams           }
34375850ef23SBarry Smith           if (n == sz - 1) {
34385850ef23SBarry Smith             tmp0 = x[*idx];
34395850ef23SBarry Smith             sum1 -= v1[0] * tmp0;
34405850ef23SBarry Smith             sum2 -= v2[0] * tmp0;
34415850ef23SBarry Smith             sum3 -= v3[0] * tmp0;
34425850ef23SBarry Smith           }
3443d876e2b0SMark Adams           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3444d876e2b0SMark Adams           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3445d876e2b0SMark Adams           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
34465850ef23SBarry Smith           break;
34475850ef23SBarry Smith         case 4:
34485850ef23SBarry Smith           v2   = a->a + ii[row + 1];
34495850ef23SBarry Smith           v3   = a->a + ii[row + 2];
34505850ef23SBarry Smith           v4   = a->a + ii[row + 3];
34515850ef23SBarry Smith           sum1 = b[row];
34525850ef23SBarry Smith           sum2 = b[row + 1];
34535850ef23SBarry Smith           sum3 = b[row + 2];
34545850ef23SBarry Smith           sum4 = b[row + 3];
34555850ef23SBarry Smith           for (n = 0; n < sz - 1; n += 2) {
34565850ef23SBarry Smith             i1 = idx[0];
34575850ef23SBarry Smith             i2 = idx[1];
34585850ef23SBarry Smith             idx += 2;
34595850ef23SBarry Smith             tmp0 = x[i1];
34605850ef23SBarry Smith             tmp1 = x[i2];
34619371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
34629371c9d4SSatish Balay             v1 += 2;
34639371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
34649371c9d4SSatish Balay             v2 += 2;
34659371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
34669371c9d4SSatish Balay             v3 += 2;
34679371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
34689371c9d4SSatish Balay             v4 += 2;
34695850ef23SBarry Smith           }
3470d876e2b0SMark Adams           if (n == sz - 1) {
3471d876e2b0SMark Adams             tmp0 = x[*idx++];
3472d876e2b0SMark Adams             sum1 -= v1[0] * tmp0;
3473d876e2b0SMark Adams             sum2 -= v2[0] * tmp0;
3474d876e2b0SMark Adams             sum3 -= v3[0] * tmp0;
3475d876e2b0SMark Adams             sum4 -= v4[0] * tmp0;
34769371c9d4SSatish Balay             v1++;
34779371c9d4SSatish Balay             v2++;
34789371c9d4SSatish Balay             v3++;
34799371c9d4SSatish Balay             v4++;
3480d876e2b0SMark Adams           }
3481d876e2b0SMark Adams           t[row]     = sum1;
3482d876e2b0SMark Adams           t[row + 1] = sum2;
3483d876e2b0SMark Adams           t[row + 2] = sum3;
3484d876e2b0SMark Adams           t[row + 3] = sum4;
3485d876e2b0SMark Adams           sz         = ii[row + 1] - diag[row] - 4;
3486d876e2b0SMark Adams           idx        = a->j + diag[row] + 4;
3487d876e2b0SMark Adams           v1 += 4;
3488d876e2b0SMark Adams           v2 += 4;
3489d876e2b0SMark Adams           v3 += 4;
3490d876e2b0SMark Adams           v4 += 4;
3491d876e2b0SMark Adams           for (n = 0; n < sz - 1; n += 2) {
3492d876e2b0SMark Adams             i1 = idx[0];
3493d876e2b0SMark Adams             i2 = idx[1];
3494d876e2b0SMark Adams             idx += 2;
3495d876e2b0SMark Adams             tmp0 = x[i1];
3496d876e2b0SMark Adams             tmp1 = x[i2];
34979371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
34989371c9d4SSatish Balay             v1 += 2;
34999371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
35009371c9d4SSatish Balay             v2 += 2;
35019371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
35029371c9d4SSatish Balay             v3 += 2;
35039371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
35049371c9d4SSatish Balay             v4 += 2;
3505d876e2b0SMark Adams           }
35065850ef23SBarry Smith           if (n == sz - 1) {
35075850ef23SBarry Smith             tmp0 = x[*idx];
35085850ef23SBarry Smith             sum1 -= v1[0] * tmp0;
35095850ef23SBarry Smith             sum2 -= v2[0] * tmp0;
35105850ef23SBarry Smith             sum3 -= v3[0] * tmp0;
35115850ef23SBarry Smith             sum4 -= v4[0] * tmp0;
35125850ef23SBarry Smith           }
3513d876e2b0SMark Adams           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3514d876e2b0SMark Adams           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3515d876e2b0SMark Adams           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3516d876e2b0SMark Adams           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
35175850ef23SBarry Smith           break;
35185850ef23SBarry Smith         case 5:
35195850ef23SBarry Smith           v2   = a->a + ii[row + 1];
35205850ef23SBarry Smith           v3   = a->a + ii[row + 2];
35215850ef23SBarry Smith           v4   = a->a + ii[row + 3];
35225850ef23SBarry Smith           v5   = a->a + ii[row + 4];
35235850ef23SBarry Smith           sum1 = b[row];
35245850ef23SBarry Smith           sum2 = b[row + 1];
35255850ef23SBarry Smith           sum3 = b[row + 2];
35265850ef23SBarry Smith           sum4 = b[row + 3];
35275850ef23SBarry Smith           sum5 = b[row + 4];
35285850ef23SBarry Smith           for (n = 0; n < sz - 1; n += 2) {
35295850ef23SBarry Smith             i1 = idx[0];
35305850ef23SBarry Smith             i2 = idx[1];
35315850ef23SBarry Smith             idx += 2;
35325850ef23SBarry Smith             tmp0 = x[i1];
35335850ef23SBarry Smith             tmp1 = x[i2];
35349371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
35359371c9d4SSatish Balay             v1 += 2;
35369371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
35379371c9d4SSatish Balay             v2 += 2;
35389371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
35399371c9d4SSatish Balay             v3 += 2;
35409371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
35419371c9d4SSatish Balay             v4 += 2;
35429371c9d4SSatish Balay             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
35439371c9d4SSatish Balay             v5 += 2;
35445850ef23SBarry Smith           }
35455850ef23SBarry Smith           if (n == sz - 1) {
3546d876e2b0SMark Adams             tmp0 = x[*idx++];
35475850ef23SBarry Smith             sum1 -= v1[0] * tmp0;
35485850ef23SBarry Smith             sum2 -= v2[0] * tmp0;
35495850ef23SBarry Smith             sum3 -= v3[0] * tmp0;
35505850ef23SBarry Smith             sum4 -= v4[0] * tmp0;
35515850ef23SBarry Smith             sum5 -= v5[0] * tmp0;
35529371c9d4SSatish Balay             v1++;
35539371c9d4SSatish Balay             v2++;
35549371c9d4SSatish Balay             v3++;
35559371c9d4SSatish Balay             v4++;
35569371c9d4SSatish Balay             v5++;
35575850ef23SBarry Smith           }
3558d876e2b0SMark Adams           t[row]     = sum1;
3559d876e2b0SMark Adams           t[row + 1] = sum2;
3560d876e2b0SMark Adams           t[row + 2] = sum3;
3561d876e2b0SMark Adams           t[row + 3] = sum4;
3562d876e2b0SMark Adams           t[row + 4] = sum5;
3563d876e2b0SMark Adams           sz         = ii[row + 1] - diag[row] - 5;
3564d876e2b0SMark Adams           idx        = a->j + diag[row] + 5;
3565d876e2b0SMark Adams           v1 += 5;
3566d876e2b0SMark Adams           v2 += 5;
3567d876e2b0SMark Adams           v3 += 5;
3568d876e2b0SMark Adams           v4 += 5;
3569d876e2b0SMark Adams           v5 += 5;
35705850ef23SBarry Smith           for (n = 0; n < sz - 1; n += 2) {
35715850ef23SBarry Smith             i1 = idx[0];
35725850ef23SBarry Smith             i2 = idx[1];
35735850ef23SBarry Smith             idx += 2;
35745850ef23SBarry Smith             tmp0 = x[i1];
35755850ef23SBarry Smith             tmp1 = x[i2];
35769371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
35779371c9d4SSatish Balay             v1 += 2;
35789371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
35799371c9d4SSatish Balay             v2 += 2;
35809371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
35819371c9d4SSatish Balay             v3 += 2;
35829371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
35839371c9d4SSatish Balay             v4 += 2;
35849371c9d4SSatish Balay             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
35859371c9d4SSatish Balay             v5 += 2;
35865850ef23SBarry Smith           }
35875850ef23SBarry Smith           if (n == sz - 1) {
35885850ef23SBarry Smith             tmp0 = x[*idx];
3589d876e2b0SMark Adams             sum1 -= v1[0] * tmp0;
3590d876e2b0SMark Adams             sum2 -= v2[0] * tmp0;
3591d876e2b0SMark Adams             sum3 -= v3[0] * tmp0;
3592d876e2b0SMark Adams             sum4 -= v4[0] * tmp0;
3593d876e2b0SMark Adams             sum5 -= v5[0] * tmp0;
35945850ef23SBarry Smith           }
3595d876e2b0SMark Adams           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3596d876e2b0SMark Adams           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3597d876e2b0SMark Adams           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3598d876e2b0SMark Adams           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3599d876e2b0SMark Adams           x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3600d876e2b0SMark Adams           break;
3601d71ae5a4SJacob Faibussowitsch         default:
3602*4d12350bSJunchao Zhang           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3603d876e2b0SMark Adams         }
3604d876e2b0SMark Adams       }
3605d876e2b0SMark Adams       xb = t;
36069566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */
3607d876e2b0SMark Adams     } else xb = b;
3608d876e2b0SMark Adams 
3609d876e2b0SMark Adams     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3610d876e2b0SMark Adams       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3611d876e2b0SMark Adams       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3612*4d12350bSJunchao Zhang         nodesz = sizes[i + 1] - sizes[i];
3613*4d12350bSJunchao Zhang         ibdiag -= nodesz * nodesz;
3614d876e2b0SMark Adams 
3615d876e2b0SMark Adams         /* set RHS */
3616d876e2b0SMark Adams         if (xb == b) {
3617d876e2b0SMark Adams           /* whole (old way) */
3618d876e2b0SMark Adams           sz  = ii[row + 1] - ii[row];
3619d876e2b0SMark Adams           idx = a->j + ii[row];
3620*4d12350bSJunchao Zhang           switch (nodesz) {
3621d71ae5a4SJacob Faibussowitsch           case 5:
3622d71ae5a4SJacob Faibussowitsch             v5 = a->a + ii[row - 4]; /* fall through */
3623d71ae5a4SJacob Faibussowitsch           case 4:
3624d71ae5a4SJacob Faibussowitsch             v4 = a->a + ii[row - 3]; /* fall through */
3625d71ae5a4SJacob Faibussowitsch           case 3:
3626d71ae5a4SJacob Faibussowitsch             v3 = a->a + ii[row - 2]; /* fall through */
3627d71ae5a4SJacob Faibussowitsch           case 2:
3628d71ae5a4SJacob Faibussowitsch             v2 = a->a + ii[row - 1]; /* fall through */
3629d71ae5a4SJacob Faibussowitsch           case 1:
3630d71ae5a4SJacob Faibussowitsch             v1 = a->a + ii[row];
3631d71ae5a4SJacob Faibussowitsch             break;
3632d71ae5a4SJacob Faibussowitsch           default:
3633*4d12350bSJunchao Zhang             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3634d876e2b0SMark Adams           }
3635d876e2b0SMark Adams         } else {
3636d876e2b0SMark Adams           /* upper, no diag */
3637d876e2b0SMark Adams           sz  = ii[row + 1] - diag[row] - 1;
3638d876e2b0SMark Adams           idx = a->j + diag[row] + 1;
3639*4d12350bSJunchao Zhang           switch (nodesz) {
3640d71ae5a4SJacob Faibussowitsch           case 5:
3641d71ae5a4SJacob Faibussowitsch             v5 = a->a + diag[row - 4] + 5; /* fall through */
3642d71ae5a4SJacob Faibussowitsch           case 4:
3643d71ae5a4SJacob Faibussowitsch             v4 = a->a + diag[row - 3] + 4; /* fall through */
3644d71ae5a4SJacob Faibussowitsch           case 3:
3645d71ae5a4SJacob Faibussowitsch             v3 = a->a + diag[row - 2] + 3; /* fall through */
3646d71ae5a4SJacob Faibussowitsch           case 2:
3647d71ae5a4SJacob Faibussowitsch             v2 = a->a + diag[row - 1] + 2; /* fall through */
3648d71ae5a4SJacob Faibussowitsch           case 1:
3649d71ae5a4SJacob Faibussowitsch             v1 = a->a + diag[row] + 1;
3650d876e2b0SMark Adams           }
3651d876e2b0SMark Adams         }
3652d876e2b0SMark Adams         /* set sum */
3653*4d12350bSJunchao Zhang         switch (nodesz) {
3654d71ae5a4SJacob Faibussowitsch         case 5:
3655d71ae5a4SJacob Faibussowitsch           sum5 = xb[row - 4]; /* fall through */
3656d71ae5a4SJacob Faibussowitsch         case 4:
3657d71ae5a4SJacob Faibussowitsch           sum4 = xb[row - 3]; /* fall through */
3658d71ae5a4SJacob Faibussowitsch         case 3:
3659d71ae5a4SJacob Faibussowitsch           sum3 = xb[row - 2]; /* fall through */
3660d71ae5a4SJacob Faibussowitsch         case 2:
3661d71ae5a4SJacob Faibussowitsch           sum2 = xb[row - 1]; /* fall through */
3662d876e2b0SMark Adams         case 1:
3663d876e2b0SMark Adams           /* note that sum1 is associated with the last row */
3664d876e2b0SMark Adams           sum1 = xb[row];
3665d876e2b0SMark Adams         }
3666d876e2b0SMark Adams         /* do sums */
3667d876e2b0SMark Adams         for (n = 0; n < sz - 1; n += 2) {
3668d876e2b0SMark Adams           i1 = idx[0];
3669d876e2b0SMark Adams           i2 = idx[1];
3670d876e2b0SMark Adams           idx += 2;
3671d876e2b0SMark Adams           tmp0 = x[i1];
3672d876e2b0SMark Adams           tmp1 = x[i2];
3673*4d12350bSJunchao Zhang           switch (nodesz) {
3674d71ae5a4SJacob Faibussowitsch           case 5:
3675d71ae5a4SJacob Faibussowitsch             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3676d71ae5a4SJacob Faibussowitsch             v5 += 2; /* fall through */
3677d71ae5a4SJacob Faibussowitsch           case 4:
3678d71ae5a4SJacob Faibussowitsch             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3679d71ae5a4SJacob Faibussowitsch             v4 += 2; /* fall through */
3680d71ae5a4SJacob Faibussowitsch           case 3:
3681d71ae5a4SJacob Faibussowitsch             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3682d71ae5a4SJacob Faibussowitsch             v3 += 2; /* fall through */
3683d71ae5a4SJacob Faibussowitsch           case 2:
3684d71ae5a4SJacob Faibussowitsch             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3685d71ae5a4SJacob Faibussowitsch             v2 += 2; /* fall through */
3686d71ae5a4SJacob Faibussowitsch           case 1:
3687d71ae5a4SJacob Faibussowitsch             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3688d71ae5a4SJacob Faibussowitsch             v1 += 2;
3689d876e2b0SMark Adams           }
3690d876e2b0SMark Adams         }
3691d876e2b0SMark Adams         /* ragged edge */
3692d876e2b0SMark Adams         if (n == sz - 1) {
3693d876e2b0SMark Adams           tmp0 = x[*idx];
3694*4d12350bSJunchao Zhang           switch (nodesz) {
3695d71ae5a4SJacob Faibussowitsch           case 5:
3696d71ae5a4SJacob Faibussowitsch             sum5 -= *v5 * tmp0; /* fall through */
3697d71ae5a4SJacob Faibussowitsch           case 4:
3698d71ae5a4SJacob Faibussowitsch             sum4 -= *v4 * tmp0; /* fall through */
3699d71ae5a4SJacob Faibussowitsch           case 3:
3700d71ae5a4SJacob Faibussowitsch             sum3 -= *v3 * tmp0; /* fall through */
3701d71ae5a4SJacob Faibussowitsch           case 2:
3702d71ae5a4SJacob Faibussowitsch             sum2 -= *v2 * tmp0; /* fall through */
3703d71ae5a4SJacob Faibussowitsch           case 1:
3704d71ae5a4SJacob Faibussowitsch             sum1 -= *v1 * tmp0;
3705d876e2b0SMark Adams           }
3706d876e2b0SMark Adams         }
3707d876e2b0SMark Adams         /* update */
3708d876e2b0SMark Adams         if (xb == b) {
3709d876e2b0SMark Adams           /* whole (old way) w/ diag */
3710*4d12350bSJunchao Zhang           switch (nodesz) {
3711d876e2b0SMark Adams           case 5:
37125850ef23SBarry Smith             x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
37135850ef23SBarry Smith             x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
37145850ef23SBarry Smith             x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
37155850ef23SBarry Smith             x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
37165850ef23SBarry Smith             x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
37175850ef23SBarry Smith             break;
3718d876e2b0SMark Adams           case 4:
3719d876e2b0SMark Adams             x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3720d876e2b0SMark Adams             x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3721d876e2b0SMark Adams             x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3722d876e2b0SMark Adams             x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3723d876e2b0SMark Adams             break;
3724d876e2b0SMark Adams           case 3:
3725d876e2b0SMark Adams             x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3726d876e2b0SMark Adams             x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3727d876e2b0SMark Adams             x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3728d876e2b0SMark Adams             break;
3729d876e2b0SMark Adams           case 2:
3730d876e2b0SMark Adams             x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3];
3731d876e2b0SMark Adams             x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2];
3732d876e2b0SMark Adams             break;
3733d71ae5a4SJacob Faibussowitsch           case 1:
3734d71ae5a4SJacob Faibussowitsch             x[row--] += sum1 * (*ibdiag);
3735d71ae5a4SJacob Faibussowitsch             break;
3736d876e2b0SMark Adams           }
3737d876e2b0SMark Adams         } else {
3738d876e2b0SMark Adams           /* no diag so set =  */
3739*4d12350bSJunchao Zhang           switch (nodesz) {
3740d876e2b0SMark Adams           case 5:
3741d876e2b0SMark Adams             x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3742d876e2b0SMark Adams             x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3743d876e2b0SMark Adams             x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3744d876e2b0SMark Adams             x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3745d876e2b0SMark Adams             x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3746d876e2b0SMark Adams             break;
3747d876e2b0SMark Adams           case 4:
3748d876e2b0SMark Adams             x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3749d876e2b0SMark Adams             x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3750d876e2b0SMark Adams             x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3751d876e2b0SMark Adams             x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3752d876e2b0SMark Adams             break;
3753d876e2b0SMark Adams           case 3:
3754d876e2b0SMark Adams             x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3755d876e2b0SMark Adams             x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3756d876e2b0SMark Adams             x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3757d876e2b0SMark Adams             break;
3758d876e2b0SMark Adams           case 2:
3759d876e2b0SMark Adams             x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3760d876e2b0SMark Adams             x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3761d876e2b0SMark Adams             break;
3762d71ae5a4SJacob Faibussowitsch           case 1:
3763d71ae5a4SJacob Faibussowitsch             x[row--] = sum1 * (*ibdiag);
3764d71ae5a4SJacob Faibussowitsch             break;
37655850ef23SBarry Smith           }
37665850ef23SBarry Smith         }
3767d876e2b0SMark Adams       }
3768d876e2b0SMark Adams       if (xb == b) {
37699566063dSJacob Faibussowitsch         PetscCall(PetscLogFlops(2.0 * a->nz));
3770d876e2b0SMark Adams       } else {
37719566063dSJacob Faibussowitsch         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */
3772d876e2b0SMark Adams       }
37735850ef23SBarry Smith     }
37742af78befSBarry Smith   }
377589c6957cSBarry Smith   if (flag & SOR_EISENSTAT) {
377689c6957cSBarry Smith     /*
377789c6957cSBarry Smith           Apply  (U + D)^-1  where D is now the block diagonal
377889c6957cSBarry Smith     */
377989c6957cSBarry Smith     ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
378089c6957cSBarry Smith     for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3781*4d12350bSJunchao Zhang       nodesz = sizes[i + 1] - sizes[i];
3782*4d12350bSJunchao Zhang       ibdiag -= nodesz * nodesz;
378389c6957cSBarry Smith       sz  = ii[row + 1] - diag[row] - 1;
378489c6957cSBarry Smith       v1  = a->a + diag[row] + 1;
378589c6957cSBarry Smith       idx = a->j + diag[row] + 1;
37864108e4d5SBarry Smith       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
3787*4d12350bSJunchao Zhang       switch (nodesz) {
378889c6957cSBarry Smith       case 1:
378989c6957cSBarry Smith 
379089c6957cSBarry Smith         sum1 = b[row];
379189c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
379289c6957cSBarry Smith           i1 = idx[0];
379389c6957cSBarry Smith           i2 = idx[1];
379489c6957cSBarry Smith           idx += 2;
379589c6957cSBarry Smith           tmp0 = x[i1];
379689c6957cSBarry Smith           tmp1 = x[i2];
37979371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
37989371c9d4SSatish Balay           v1 += 2;
379989c6957cSBarry Smith         }
380089c6957cSBarry Smith 
380189c6957cSBarry Smith         if (n == sz - 1) {
380289c6957cSBarry Smith           tmp0 = x[*idx];
380389c6957cSBarry Smith           sum1 -= *v1 * tmp0;
380489c6957cSBarry Smith         }
38059371c9d4SSatish Balay         x[row] = sum1 * (*ibdiag);
38069371c9d4SSatish Balay         row--;
380789c6957cSBarry Smith         break;
380889c6957cSBarry Smith 
380989c6957cSBarry Smith       case 2:
381089c6957cSBarry Smith 
381189c6957cSBarry Smith         sum1 = b[row];
381289c6957cSBarry Smith         sum2 = b[row - 1];
381389c6957cSBarry Smith         /* note that sum1 is associated with the second of the two rows */
381489c6957cSBarry Smith         v2 = a->a + diag[row - 1] + 2;
381589c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
381689c6957cSBarry Smith           i1 = idx[0];
381789c6957cSBarry Smith           i2 = idx[1];
381889c6957cSBarry Smith           idx += 2;
381989c6957cSBarry Smith           tmp0 = x[i1];
382089c6957cSBarry Smith           tmp1 = x[i2];
38219371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
38229371c9d4SSatish Balay           v1 += 2;
38239371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
38249371c9d4SSatish Balay           v2 += 2;
382589c6957cSBarry Smith         }
382689c6957cSBarry Smith 
382789c6957cSBarry Smith         if (n == sz - 1) {
382889c6957cSBarry Smith           tmp0 = x[*idx];
382989c6957cSBarry Smith           sum1 -= *v1 * tmp0;
383089c6957cSBarry Smith           sum2 -= *v2 * tmp0;
383189c6957cSBarry Smith         }
3832938d4eb3SBarry Smith         x[row]     = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3833938d4eb3SBarry Smith         x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3834938d4eb3SBarry Smith         row -= 2;
383589c6957cSBarry Smith         break;
383689c6957cSBarry Smith       case 3:
383789c6957cSBarry Smith 
383889c6957cSBarry Smith         sum1 = b[row];
383989c6957cSBarry Smith         sum2 = b[row - 1];
384089c6957cSBarry Smith         sum3 = b[row - 2];
384189c6957cSBarry Smith         v2   = a->a + diag[row - 1] + 2;
384289c6957cSBarry Smith         v3   = a->a + diag[row - 2] + 3;
384389c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
384489c6957cSBarry Smith           i1 = idx[0];
384589c6957cSBarry Smith           i2 = idx[1];
384689c6957cSBarry Smith           idx += 2;
384789c6957cSBarry Smith           tmp0 = x[i1];
384889c6957cSBarry Smith           tmp1 = x[i2];
38499371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
38509371c9d4SSatish Balay           v1 += 2;
38519371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
38529371c9d4SSatish Balay           v2 += 2;
38539371c9d4SSatish Balay           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
38549371c9d4SSatish Balay           v3 += 2;
385589c6957cSBarry Smith         }
385689c6957cSBarry Smith 
385789c6957cSBarry Smith         if (n == sz - 1) {
385889c6957cSBarry Smith           tmp0 = x[*idx];
385989c6957cSBarry Smith           sum1 -= *v1 * tmp0;
386089c6957cSBarry Smith           sum2 -= *v2 * tmp0;
386189c6957cSBarry Smith           sum3 -= *v3 * tmp0;
386289c6957cSBarry Smith         }
3863938d4eb3SBarry Smith         x[row]     = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3864938d4eb3SBarry Smith         x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3865938d4eb3SBarry Smith         x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3866938d4eb3SBarry Smith         row -= 3;
386789c6957cSBarry Smith         break;
386889c6957cSBarry Smith       case 4:
386989c6957cSBarry Smith 
387089c6957cSBarry Smith         sum1 = b[row];
387189c6957cSBarry Smith         sum2 = b[row - 1];
387289c6957cSBarry Smith         sum3 = b[row - 2];
387389c6957cSBarry Smith         sum4 = b[row - 3];
387489c6957cSBarry Smith         v2   = a->a + diag[row - 1] + 2;
387589c6957cSBarry Smith         v3   = a->a + diag[row - 2] + 3;
387689c6957cSBarry Smith         v4   = a->a + diag[row - 3] + 4;
387789c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
387889c6957cSBarry Smith           i1 = idx[0];
387989c6957cSBarry Smith           i2 = idx[1];
388089c6957cSBarry Smith           idx += 2;
388189c6957cSBarry Smith           tmp0 = x[i1];
388289c6957cSBarry Smith           tmp1 = x[i2];
38839371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
38849371c9d4SSatish Balay           v1 += 2;
38859371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
38869371c9d4SSatish Balay           v2 += 2;
38879371c9d4SSatish Balay           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
38889371c9d4SSatish Balay           v3 += 2;
38899371c9d4SSatish Balay           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
38909371c9d4SSatish Balay           v4 += 2;
389189c6957cSBarry Smith         }
389289c6957cSBarry Smith 
389389c6957cSBarry Smith         if (n == sz - 1) {
389489c6957cSBarry Smith           tmp0 = x[*idx];
389589c6957cSBarry Smith           sum1 -= *v1 * tmp0;
389689c6957cSBarry Smith           sum2 -= *v2 * tmp0;
389789c6957cSBarry Smith           sum3 -= *v3 * tmp0;
389889c6957cSBarry Smith           sum4 -= *v4 * tmp0;
389989c6957cSBarry Smith         }
3900938d4eb3SBarry Smith         x[row]     = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3901938d4eb3SBarry Smith         x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3902938d4eb3SBarry Smith         x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3903938d4eb3SBarry Smith         x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3904938d4eb3SBarry Smith         row -= 4;
390589c6957cSBarry Smith         break;
390689c6957cSBarry Smith       case 5:
390789c6957cSBarry Smith 
390889c6957cSBarry Smith         sum1 = b[row];
390989c6957cSBarry Smith         sum2 = b[row - 1];
391089c6957cSBarry Smith         sum3 = b[row - 2];
391189c6957cSBarry Smith         sum4 = b[row - 3];
391289c6957cSBarry Smith         sum5 = b[row - 4];
391389c6957cSBarry Smith         v2   = a->a + diag[row - 1] + 2;
391489c6957cSBarry Smith         v3   = a->a + diag[row - 2] + 3;
391589c6957cSBarry Smith         v4   = a->a + diag[row - 3] + 4;
391689c6957cSBarry Smith         v5   = a->a + diag[row - 4] + 5;
391789c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
391889c6957cSBarry Smith           i1 = idx[0];
391989c6957cSBarry Smith           i2 = idx[1];
392089c6957cSBarry Smith           idx += 2;
392189c6957cSBarry Smith           tmp0 = x[i1];
392289c6957cSBarry Smith           tmp1 = x[i2];
39239371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
39249371c9d4SSatish Balay           v1 += 2;
39259371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
39269371c9d4SSatish Balay           v2 += 2;
39279371c9d4SSatish Balay           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
39289371c9d4SSatish Balay           v3 += 2;
39299371c9d4SSatish Balay           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
39309371c9d4SSatish Balay           v4 += 2;
39319371c9d4SSatish Balay           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
39329371c9d4SSatish Balay           v5 += 2;
393389c6957cSBarry Smith         }
393489c6957cSBarry Smith 
393589c6957cSBarry Smith         if (n == sz - 1) {
393689c6957cSBarry Smith           tmp0 = x[*idx];
393789c6957cSBarry Smith           sum1 -= *v1 * tmp0;
393889c6957cSBarry Smith           sum2 -= *v2 * tmp0;
393989c6957cSBarry Smith           sum3 -= *v3 * tmp0;
394089c6957cSBarry Smith           sum4 -= *v4 * tmp0;
394189c6957cSBarry Smith           sum5 -= *v5 * tmp0;
394289c6957cSBarry Smith         }
3943938d4eb3SBarry Smith         x[row]     = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3944938d4eb3SBarry Smith         x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3945938d4eb3SBarry Smith         x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3946938d4eb3SBarry Smith         x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3947938d4eb3SBarry Smith         x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3948938d4eb3SBarry Smith         row -= 5;
394989c6957cSBarry Smith         break;
3950d71ae5a4SJacob Faibussowitsch       default:
3951*4d12350bSJunchao Zhang         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
395289c6957cSBarry Smith       }
395389c6957cSBarry Smith     }
39549566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(a->nz));
395589c6957cSBarry Smith 
395689c6957cSBarry Smith     /*
395789c6957cSBarry Smith            t = b - D x    where D is the block diagonal
395889c6957cSBarry Smith     */
395989c6957cSBarry Smith     cnt = 0;
396089c6957cSBarry Smith     for (i = 0, row = 0; i < m; i++) {
3961*4d12350bSJunchao Zhang       nodesz = sizes[i + 1] - sizes[i];
3962*4d12350bSJunchao Zhang       switch (nodesz) {
396389c6957cSBarry Smith       case 1:
39649371c9d4SSatish Balay         t[row] = b[row] - bdiag[cnt++] * x[row];
39659371c9d4SSatish Balay         row++;
396689c6957cSBarry Smith         break;
396789c6957cSBarry Smith       case 2:
39689371c9d4SSatish Balay         x1         = x[row];
39699371c9d4SSatish Balay         x2         = x[row + 1];
397089c6957cSBarry Smith         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
397189c6957cSBarry Smith         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
397289c6957cSBarry Smith         t[row]     = b[row] - tmp1;
39739371c9d4SSatish Balay         t[row + 1] = b[row + 1] - tmp2;
39749371c9d4SSatish Balay         row += 2;
397589c6957cSBarry Smith         cnt += 4;
397689c6957cSBarry Smith         break;
397789c6957cSBarry Smith       case 3:
39789371c9d4SSatish Balay         x1         = x[row];
39799371c9d4SSatish Balay         x2         = x[row + 1];
39809371c9d4SSatish Balay         x3         = x[row + 2];
398189c6957cSBarry Smith         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
398289c6957cSBarry Smith         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
398389c6957cSBarry Smith         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
398489c6957cSBarry Smith         t[row]     = b[row] - tmp1;
398589c6957cSBarry Smith         t[row + 1] = b[row + 1] - tmp2;
39869371c9d4SSatish Balay         t[row + 2] = b[row + 2] - tmp3;
39879371c9d4SSatish Balay         row += 3;
398889c6957cSBarry Smith         cnt += 9;
398989c6957cSBarry Smith         break;
399089c6957cSBarry Smith       case 4:
39919371c9d4SSatish Balay         x1         = x[row];
39929371c9d4SSatish Balay         x2         = x[row + 1];
39939371c9d4SSatish Balay         x3         = x[row + 2];
39949371c9d4SSatish Balay         x4         = x[row + 3];
399589c6957cSBarry Smith         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
399689c6957cSBarry Smith         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
399789c6957cSBarry Smith         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
399889c6957cSBarry Smith         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
399989c6957cSBarry Smith         t[row]     = b[row] - tmp1;
400089c6957cSBarry Smith         t[row + 1] = b[row + 1] - tmp2;
400189c6957cSBarry Smith         t[row + 2] = b[row + 2] - tmp3;
40029371c9d4SSatish Balay         t[row + 3] = b[row + 3] - tmp4;
40039371c9d4SSatish Balay         row += 4;
400489c6957cSBarry Smith         cnt += 16;
400589c6957cSBarry Smith         break;
400689c6957cSBarry Smith       case 5:
40079371c9d4SSatish Balay         x1         = x[row];
40089371c9d4SSatish Balay         x2         = x[row + 1];
40099371c9d4SSatish Balay         x3         = x[row + 2];
40109371c9d4SSatish Balay         x4         = x[row + 3];
40119371c9d4SSatish Balay         x5         = x[row + 4];
401289c6957cSBarry Smith         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
401389c6957cSBarry Smith         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
401489c6957cSBarry Smith         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
401589c6957cSBarry Smith         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
401689c6957cSBarry Smith         tmp5       = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
401789c6957cSBarry Smith         t[row]     = b[row] - tmp1;
401889c6957cSBarry Smith         t[row + 1] = b[row + 1] - tmp2;
401989c6957cSBarry Smith         t[row + 2] = b[row + 2] - tmp3;
402089c6957cSBarry Smith         t[row + 3] = b[row + 3] - tmp4;
40219371c9d4SSatish Balay         t[row + 4] = b[row + 4] - tmp5;
40229371c9d4SSatish Balay         row += 5;
402389c6957cSBarry Smith         cnt += 25;
402489c6957cSBarry Smith         break;
4025d71ae5a4SJacob Faibussowitsch       default:
4026*4d12350bSJunchao Zhang         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
402789c6957cSBarry Smith       }
402889c6957cSBarry Smith     }
40299566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(m));
403089c6957cSBarry Smith 
403189c6957cSBarry Smith     /*
403289c6957cSBarry Smith           Apply (L + D)^-1 where D is the block diagonal
403389c6957cSBarry Smith     */
403489c6957cSBarry Smith     for (i = 0, row = 0; i < m; i++) {
4035*4d12350bSJunchao Zhang       nodesz = sizes[i + 1] - sizes[i];
403689c6957cSBarry Smith       sz     = diag[row] - ii[row];
403789c6957cSBarry Smith       v1     = a->a + ii[row];
403889c6957cSBarry Smith       idx    = a->j + ii[row];
40394108e4d5SBarry Smith       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
4040*4d12350bSJunchao Zhang       switch (nodesz) {
404189c6957cSBarry Smith       case 1:
404289c6957cSBarry Smith 
404389c6957cSBarry Smith         sum1 = t[row];
404489c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
404589c6957cSBarry Smith           i1 = idx[0];
404689c6957cSBarry Smith           i2 = idx[1];
404789c6957cSBarry Smith           idx += 2;
404889c6957cSBarry Smith           tmp0 = t[i1];
404989c6957cSBarry Smith           tmp1 = t[i2];
40509371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
40519371c9d4SSatish Balay           v1 += 2;
405289c6957cSBarry Smith         }
405389c6957cSBarry Smith 
405489c6957cSBarry Smith         if (n == sz - 1) {
405589c6957cSBarry Smith           tmp0 = t[*idx];
405689c6957cSBarry Smith           sum1 -= *v1 * tmp0;
405789c6957cSBarry Smith         }
40589371c9d4SSatish Balay         x[row] += t[row] = sum1 * (*ibdiag++);
40599371c9d4SSatish Balay         row++;
406089c6957cSBarry Smith         break;
406189c6957cSBarry Smith       case 2:
406289c6957cSBarry Smith         v2   = a->a + ii[row + 1];
406389c6957cSBarry Smith         sum1 = t[row];
406489c6957cSBarry Smith         sum2 = t[row + 1];
406589c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
406689c6957cSBarry Smith           i1 = idx[0];
406789c6957cSBarry Smith           i2 = idx[1];
406889c6957cSBarry Smith           idx += 2;
406989c6957cSBarry Smith           tmp0 = t[i1];
407089c6957cSBarry Smith           tmp1 = t[i2];
40719371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
40729371c9d4SSatish Balay           v1 += 2;
40739371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
40749371c9d4SSatish Balay           v2 += 2;
407589c6957cSBarry Smith         }
407689c6957cSBarry Smith 
407789c6957cSBarry Smith         if (n == sz - 1) {
407889c6957cSBarry Smith           tmp0 = t[*idx];
407989c6957cSBarry Smith           sum1 -= v1[0] * tmp0;
408089c6957cSBarry Smith           sum2 -= v2[0] * tmp0;
408189c6957cSBarry Smith         }
408289c6957cSBarry Smith         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[2];
408389c6957cSBarry Smith         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
40849371c9d4SSatish Balay         ibdiag += 4;
40859371c9d4SSatish Balay         row += 2;
408689c6957cSBarry Smith         break;
408789c6957cSBarry Smith       case 3:
408889c6957cSBarry Smith         v2   = a->a + ii[row + 1];
408989c6957cSBarry Smith         v3   = a->a + ii[row + 2];
409089c6957cSBarry Smith         sum1 = t[row];
409189c6957cSBarry Smith         sum2 = t[row + 1];
409289c6957cSBarry Smith         sum3 = t[row + 2];
409389c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
409489c6957cSBarry Smith           i1 = idx[0];
409589c6957cSBarry Smith           i2 = idx[1];
409689c6957cSBarry Smith           idx += 2;
409789c6957cSBarry Smith           tmp0 = t[i1];
409889c6957cSBarry Smith           tmp1 = t[i2];
40999371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
41009371c9d4SSatish Balay           v1 += 2;
41019371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
41029371c9d4SSatish Balay           v2 += 2;
41039371c9d4SSatish Balay           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
41049371c9d4SSatish Balay           v3 += 2;
410589c6957cSBarry Smith         }
410689c6957cSBarry Smith 
410789c6957cSBarry Smith         if (n == sz - 1) {
410889c6957cSBarry Smith           tmp0 = t[*idx];
410989c6957cSBarry Smith           sum1 -= v1[0] * tmp0;
411089c6957cSBarry Smith           sum2 -= v2[0] * tmp0;
411189c6957cSBarry Smith           sum3 -= v3[0] * tmp0;
411289c6957cSBarry Smith         }
411389c6957cSBarry Smith         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
411489c6957cSBarry Smith         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
411589c6957cSBarry Smith         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
41169371c9d4SSatish Balay         ibdiag += 9;
41179371c9d4SSatish Balay         row += 3;
411889c6957cSBarry Smith         break;
411989c6957cSBarry Smith       case 4:
412089c6957cSBarry Smith         v2   = a->a + ii[row + 1];
412189c6957cSBarry Smith         v3   = a->a + ii[row + 2];
412289c6957cSBarry Smith         v4   = a->a + ii[row + 3];
412389c6957cSBarry Smith         sum1 = t[row];
412489c6957cSBarry Smith         sum2 = t[row + 1];
412589c6957cSBarry Smith         sum3 = t[row + 2];
412689c6957cSBarry Smith         sum4 = t[row + 3];
412789c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
412889c6957cSBarry Smith           i1 = idx[0];
412989c6957cSBarry Smith           i2 = idx[1];
413089c6957cSBarry Smith           idx += 2;
413189c6957cSBarry Smith           tmp0 = t[i1];
413289c6957cSBarry Smith           tmp1 = t[i2];
41339371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
41349371c9d4SSatish Balay           v1 += 2;
41359371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
41369371c9d4SSatish Balay           v2 += 2;
41379371c9d4SSatish Balay           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
41389371c9d4SSatish Balay           v3 += 2;
41399371c9d4SSatish Balay           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
41409371c9d4SSatish Balay           v4 += 2;
414189c6957cSBarry Smith         }
414289c6957cSBarry Smith 
414389c6957cSBarry Smith         if (n == sz - 1) {
414489c6957cSBarry Smith           tmp0 = t[*idx];
414589c6957cSBarry Smith           sum1 -= v1[0] * tmp0;
414689c6957cSBarry Smith           sum2 -= v2[0] * tmp0;
414789c6957cSBarry Smith           sum3 -= v3[0] * tmp0;
414889c6957cSBarry Smith           sum4 -= v4[0] * tmp0;
414989c6957cSBarry Smith         }
415089c6957cSBarry Smith         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
415189c6957cSBarry Smith         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
415289c6957cSBarry Smith         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
415389c6957cSBarry Smith         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
41549371c9d4SSatish Balay         ibdiag += 16;
41559371c9d4SSatish Balay         row += 4;
415689c6957cSBarry Smith         break;
415789c6957cSBarry Smith       case 5:
415889c6957cSBarry Smith         v2   = a->a + ii[row + 1];
415989c6957cSBarry Smith         v3   = a->a + ii[row + 2];
416089c6957cSBarry Smith         v4   = a->a + ii[row + 3];
416189c6957cSBarry Smith         v5   = a->a + ii[row + 4];
416289c6957cSBarry Smith         sum1 = t[row];
416389c6957cSBarry Smith         sum2 = t[row + 1];
416489c6957cSBarry Smith         sum3 = t[row + 2];
416589c6957cSBarry Smith         sum4 = t[row + 3];
416689c6957cSBarry Smith         sum5 = t[row + 4];
416789c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
416889c6957cSBarry Smith           i1 = idx[0];
416989c6957cSBarry Smith           i2 = idx[1];
417089c6957cSBarry Smith           idx += 2;
417189c6957cSBarry Smith           tmp0 = t[i1];
417289c6957cSBarry Smith           tmp1 = t[i2];
41739371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
41749371c9d4SSatish Balay           v1 += 2;
41759371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
41769371c9d4SSatish Balay           v2 += 2;
41779371c9d4SSatish Balay           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
41789371c9d4SSatish Balay           v3 += 2;
41799371c9d4SSatish Balay           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
41809371c9d4SSatish Balay           v4 += 2;
41819371c9d4SSatish Balay           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
41829371c9d4SSatish Balay           v5 += 2;
418389c6957cSBarry Smith         }
418489c6957cSBarry Smith 
418589c6957cSBarry Smith         if (n == sz - 1) {
418689c6957cSBarry Smith           tmp0 = t[*idx];
418789c6957cSBarry Smith           sum1 -= v1[0] * tmp0;
418889c6957cSBarry Smith           sum2 -= v2[0] * tmp0;
418989c6957cSBarry Smith           sum3 -= v3[0] * tmp0;
419089c6957cSBarry Smith           sum4 -= v4[0] * tmp0;
419189c6957cSBarry Smith           sum5 -= v5[0] * tmp0;
419289c6957cSBarry Smith         }
419389c6957cSBarry Smith         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
419489c6957cSBarry Smith         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
419589c6957cSBarry Smith         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
419689c6957cSBarry Smith         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
419789c6957cSBarry Smith         x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
41989371c9d4SSatish Balay         ibdiag += 25;
41999371c9d4SSatish Balay         row += 5;
420089c6957cSBarry Smith         break;
4201d71ae5a4SJacob Faibussowitsch       default:
4202*4d12350bSJunchao Zhang         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
420389c6957cSBarry Smith       }
420489c6957cSBarry Smith     }
42059566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(a->nz));
42065850ef23SBarry Smith   }
42079566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
42089566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
42093ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
42102af78befSBarry Smith }
42112af78befSBarry Smith 
4212ff6a9541SJacob Faibussowitsch static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
4213d71ae5a4SJacob Faibussowitsch {
421489c6957cSBarry Smith   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
421589c6957cSBarry Smith   PetscScalar       *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5;
421689c6957cSBarry Smith   const MatScalar   *bdiag = a->inode.bdiag;
421789c6957cSBarry Smith   const PetscScalar *b;
4218*4d12350bSJunchao Zhang   PetscInt           m = a->inode.node_count, cnt = 0, i, row, nodesz;
4219*4d12350bSJunchao Zhang   const PetscInt    *sizes = a->inode.size_csr;
42202af78befSBarry Smith 
422189c6957cSBarry Smith   PetscFunctionBegin;
4222*4d12350bSJunchao Zhang   PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
42239566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
42249566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
422589c6957cSBarry Smith   cnt = 0;
422689c6957cSBarry Smith   for (i = 0, row = 0; i < m; i++) {
4227*4d12350bSJunchao Zhang     nodesz = sizes[i + 1] - sizes[i];
4228*4d12350bSJunchao Zhang     switch (nodesz) {
422989c6957cSBarry Smith     case 1:
42309371c9d4SSatish Balay       x[row] = b[row] * bdiag[cnt++];
42319371c9d4SSatish Balay       row++;
423289c6957cSBarry Smith       break;
423389c6957cSBarry Smith     case 2:
42349371c9d4SSatish Balay       x1       = b[row];
42359371c9d4SSatish Balay       x2       = b[row + 1];
423689c6957cSBarry Smith       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
423789c6957cSBarry Smith       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
423889c6957cSBarry Smith       x[row++] = tmp1;
423989c6957cSBarry Smith       x[row++] = tmp2;
424089c6957cSBarry Smith       cnt += 4;
424189c6957cSBarry Smith       break;
424289c6957cSBarry Smith     case 3:
42439371c9d4SSatish Balay       x1       = b[row];
42449371c9d4SSatish Balay       x2       = b[row + 1];
42459371c9d4SSatish Balay       x3       = b[row + 2];
424689c6957cSBarry Smith       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
424789c6957cSBarry Smith       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
424889c6957cSBarry Smith       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
424989c6957cSBarry Smith       x[row++] = tmp1;
425089c6957cSBarry Smith       x[row++] = tmp2;
425189c6957cSBarry Smith       x[row++] = tmp3;
425289c6957cSBarry Smith       cnt += 9;
425389c6957cSBarry Smith       break;
425489c6957cSBarry Smith     case 4:
42559371c9d4SSatish Balay       x1       = b[row];
42569371c9d4SSatish Balay       x2       = b[row + 1];
42579371c9d4SSatish Balay       x3       = b[row + 2];
42589371c9d4SSatish Balay       x4       = b[row + 3];
425989c6957cSBarry Smith       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
426089c6957cSBarry Smith       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
426189c6957cSBarry Smith       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
426289c6957cSBarry Smith       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
426389c6957cSBarry Smith       x[row++] = tmp1;
426489c6957cSBarry Smith       x[row++] = tmp2;
426589c6957cSBarry Smith       x[row++] = tmp3;
426689c6957cSBarry Smith       x[row++] = tmp4;
426789c6957cSBarry Smith       cnt += 16;
426889c6957cSBarry Smith       break;
426989c6957cSBarry Smith     case 5:
42709371c9d4SSatish Balay       x1       = b[row];
42719371c9d4SSatish Balay       x2       = b[row + 1];
42729371c9d4SSatish Balay       x3       = b[row + 2];
42739371c9d4SSatish Balay       x4       = b[row + 3];
42749371c9d4SSatish Balay       x5       = b[row + 4];
427589c6957cSBarry Smith       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
427689c6957cSBarry Smith       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
427789c6957cSBarry Smith       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
427889c6957cSBarry Smith       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
427989c6957cSBarry Smith       tmp5     = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
428089c6957cSBarry Smith       x[row++] = tmp1;
428189c6957cSBarry Smith       x[row++] = tmp2;
428289c6957cSBarry Smith       x[row++] = tmp3;
428389c6957cSBarry Smith       x[row++] = tmp4;
428489c6957cSBarry Smith       x[row++] = tmp5;
428589c6957cSBarry Smith       cnt += 25;
428689c6957cSBarry Smith       break;
4287d71ae5a4SJacob Faibussowitsch     default:
4288*4d12350bSJunchao Zhang       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
428989c6957cSBarry Smith     }
429089c6957cSBarry Smith   }
42919566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * cnt));
42929566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
42939566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
42943ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
429589c6957cSBarry Smith }
429689c6957cSBarry Smith 
4297d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
4298d71ae5a4SJacob Faibussowitsch {
4299b215bc84SStefano Zampini   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4300b215bc84SStefano Zampini 
4301b215bc84SStefano Zampini   PetscFunctionBegin;
4302b215bc84SStefano Zampini   a->inode.node_count       = 0;
4303b215bc84SStefano Zampini   a->inode.use              = PETSC_FALSE;
4304b215bc84SStefano Zampini   a->inode.checked          = PETSC_FALSE;
4305b215bc84SStefano Zampini   a->inode.mat_nonzerostate = -1;
4306b215bc84SStefano Zampini   A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4307b215bc84SStefano Zampini   A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4308b215bc84SStefano Zampini   A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4309b215bc84SStefano Zampini   A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4310b215bc84SStefano Zampini   A->ops->coloringpatch     = NULL;
4311b215bc84SStefano Zampini   A->ops->multdiagonalblock = NULL;
4312ad540459SPierre Jolivet   if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace;
43133ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4314b215bc84SStefano Zampini }
4315b215bc84SStefano Zampini 
43164c1414c8SBarry Smith /*
43174c1414c8SBarry Smith     samestructure indicates that the matrix has not changed its nonzero structure so we
43184c1414c8SBarry Smith     do not need to recompute the inodes
43194c1414c8SBarry Smith */
4320d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCheckInode(Mat A)
4321d71ae5a4SJacob Faibussowitsch {
43224c1414c8SBarry Smith   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
43238758e1faSBarry Smith   PetscInt        i, j, m, nzx, nzy, *ns, node_count, blk_size;
4324ace3abfcSBarry Smith   PetscBool       flag;
43258758e1faSBarry Smith   const PetscInt *idx, *idy, *ii;
43264c1414c8SBarry Smith 
43274c1414c8SBarry Smith   PetscFunctionBegin;
4328b215bc84SStefano Zampini   if (!a->inode.use) {
43299566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4330*4d12350bSJunchao Zhang     PetscCall(PetscFree(a->inode.size_csr));
43313ba16761SJacob Faibussowitsch     PetscFunctionReturn(PETSC_SUCCESS);
4332b215bc84SStefano Zampini   }
43333ba16761SJacob Faibussowitsch   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS);
43344c1414c8SBarry Smith 
4335d0f46423SBarry Smith   m = A->rmap->n;
4336*4d12350bSJunchao Zhang   if (!a->inode.size_csr) PetscCall(PetscMalloc1(m + 1, &a->inode.size_csr));
4337*4d12350bSJunchao Zhang   ns    = a->inode.size_csr;
4338*4d12350bSJunchao Zhang   ns[0] = 0;
43394c1414c8SBarry Smith 
43404c1414c8SBarry Smith   i          = 0;
43414c1414c8SBarry Smith   node_count = 0;
43424c1414c8SBarry Smith   idx        = a->j;
43434c1414c8SBarry Smith   ii         = a->i;
43446f2c871aSStefano Zampini   if (idx) {
43454c1414c8SBarry Smith     while (i < m) {            /* For each row */
43464c1414c8SBarry Smith       nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */
43474c1414c8SBarry Smith       /* Limits the number of elements in a node to 'a->inode.limit' */
43484c1414c8SBarry Smith       for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
43494c1414c8SBarry Smith         nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */
43504c1414c8SBarry Smith         if (nzy != nzx) break;
43514c1414c8SBarry Smith         idy += nzx; /* Same nonzero pattern */
43529566063dSJacob Faibussowitsch         PetscCall(PetscArraycmp(idx, idy, nzx, &flag));
43534c1414c8SBarry Smith         if (!flag) break;
43544c1414c8SBarry Smith       }
4355*4d12350bSJunchao Zhang       ns[node_count + 1] = ns[node_count] + blk_size;
4356*4d12350bSJunchao Zhang       node_count++;
43574c1414c8SBarry Smith       idx += blk_size * nzx;
43584c1414c8SBarry Smith       i = j;
43594c1414c8SBarry Smith     }
43606f2c871aSStefano Zampini   }
43614c1414c8SBarry Smith   /* If not enough inodes found,, do not use inode version of the routines */
43626f2c871aSStefano Zampini   if (!m || !idx || node_count > .8 * m) {
43639566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
4364*4d12350bSJunchao Zhang     PetscCall(PetscFree(a->inode.size_csr));
43659566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
43664c1414c8SBarry Smith   } else {
4367d5f3da31SBarry Smith     if (!A->factortype) {
4368375a6242SBarry Smith       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4369375a6242SBarry Smith       if (A->rmap->n == A->cmap->n) {
43704108e4d5SBarry Smith         A->ops->getrowij        = MatGetRowIJ_SeqAIJ_Inode;
43714108e4d5SBarry Smith         A->ops->restorerowij    = MatRestoreRowIJ_SeqAIJ_Inode;
43724108e4d5SBarry Smith         A->ops->getcolumnij     = MatGetColumnIJ_SeqAIJ_Inode;
43734108e4d5SBarry Smith         A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
43744108e4d5SBarry Smith         A->ops->coloringpatch   = MatColoringPatch_SeqAIJ_Inode;
4375375a6242SBarry Smith       }
4376d3ac4fa3SBarry Smith     } else {
4377d3ac4fa3SBarry Smith       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4378d3ac4fa3SBarry Smith     }
43794c1414c8SBarry Smith     a->inode.node_count = node_count;
43809566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
43814c1414c8SBarry Smith   }
4382be6adb11SBarry Smith   a->inode.checked          = PETSC_TRUE;
4383a02bda8eSBarry Smith   a->inode.mat_nonzerostate = A->nonzerostate;
43843ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
43854c1414c8SBarry Smith }
43864c1414c8SBarry Smith 
4387d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C)
4388d71ae5a4SJacob Faibussowitsch {
4389150f0143SBarry Smith   Mat         B = *C;
4390150f0143SBarry Smith   Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data;
4391150f0143SBarry Smith   PetscInt    m = A->rmap->n;
4392150f0143SBarry Smith 
4393150f0143SBarry Smith   PetscFunctionBegin;
4394150f0143SBarry Smith   c->inode.use              = a->inode.use;
4395150f0143SBarry Smith   c->inode.limit            = a->inode.limit;
4396150f0143SBarry Smith   c->inode.max_limit        = a->inode.max_limit;
4397ec710b6aSStefano Zampini   c->inode.checked          = PETSC_FALSE;
4398*4d12350bSJunchao Zhang   c->inode.size_csr         = NULL;
4399ec710b6aSStefano Zampini   c->inode.node_count       = 0;
4400ec710b6aSStefano Zampini   c->inode.ibdiagvalid      = PETSC_FALSE;
4401ec710b6aSStefano Zampini   c->inode.ibdiag           = NULL;
4402ec710b6aSStefano Zampini   c->inode.bdiag            = NULL;
4403ec710b6aSStefano Zampini   c->inode.mat_nonzerostate = -1;
4404b215bc84SStefano Zampini   if (a->inode.use) {
4405*4d12350bSJunchao Zhang     if (a->inode.checked && a->inode.size_csr) {
4406*4d12350bSJunchao Zhang       PetscCall(PetscMalloc1(m + 1, &c->inode.size_csr));
4407*4d12350bSJunchao Zhang       PetscCall(PetscArraycpy(c->inode.size_csr, a->inode.size_csr, m + 1));
4408ec710b6aSStefano Zampini 
4409ec710b6aSStefano Zampini       c->inode.checked          = PETSC_TRUE;
4410ec710b6aSStefano Zampini       c->inode.node_count       = a->inode.node_count;
4411ec710b6aSStefano Zampini       c->inode.mat_nonzerostate = (*C)->nonzerostate;
4412ec710b6aSStefano Zampini     }
4413a02bda8eSBarry Smith     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
44142c451681SBarry Smith     if (!B->factortype) {
44152c451681SBarry Smith       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
44162c451681SBarry Smith       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
44172c451681SBarry Smith       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
44182c451681SBarry Smith       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
44192c451681SBarry Smith       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
44202c451681SBarry Smith       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4421150f0143SBarry Smith     } else {
44222c451681SBarry Smith       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4423150f0143SBarry Smith     }
4424150f0143SBarry Smith   }
44253ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4426150f0143SBarry Smith }
4427150f0143SBarry Smith 
4428d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row)
4429d71ae5a4SJacob Faibussowitsch {
44308758e1faSBarry Smith   PetscInt        k;
44318758e1faSBarry Smith   const PetscInt *vi;
44326e111a19SKarl Rupp 
443317454e89SShri Abhyankar   PetscFunctionBegin;
443417454e89SShri Abhyankar   vi = aj + ai[row];
443517454e89SShri Abhyankar   for (k = 0; k < nzl; k++) cols[k] = vi[k];
443617454e89SShri Abhyankar   vi        = aj + adiag[row];
443717454e89SShri Abhyankar   cols[nzl] = vi[0];
443817454e89SShri Abhyankar   vi        = aj + adiag[row + 1] + 1;
443917454e89SShri Abhyankar   for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k];
44403ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
444117454e89SShri Abhyankar }
44426936b636SHong Zhang /*
4443a02bda8eSBarry Smith    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4444a02bda8eSBarry Smith    Modified from MatSeqAIJCheckInode().
44456936b636SHong Zhang 
44466936b636SHong Zhang    Input Parameters:
4447abb87a52SBarry Smith .  Mat A - ILU or LU matrix factor
4448abb87a52SBarry Smith 
44496936b636SHong Zhang */
4450d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4451d71ae5a4SJacob Faibussowitsch {
4452019b515eSShri Abhyankar   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4453019b515eSShri Abhyankar   PetscInt        i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size;
44548758e1faSBarry Smith   PetscInt       *cols1, *cols2, *ns;
44558758e1faSBarry Smith   const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag;
4456ace3abfcSBarry Smith   PetscBool       flag;
4457019b515eSShri Abhyankar 
4458019b515eSShri Abhyankar   PetscFunctionBegin;
44593ba16761SJacob Faibussowitsch   if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS);
44603ba16761SJacob Faibussowitsch   if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS);
4461019b515eSShri Abhyankar 
4462019b515eSShri Abhyankar   m = A->rmap->n;
4463*4d12350bSJunchao Zhang   if (a->inode.size_csr) ns = a->inode.size_csr;
446448a46eb9SPierre Jolivet   else PetscCall(PetscMalloc1(m + 1, &ns));
4465*4d12350bSJunchao Zhang   ns[0] = 0;
4466019b515eSShri Abhyankar 
4467019b515eSShri Abhyankar   i          = 0;
4468019b515eSShri Abhyankar   node_count = 0;
44699566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(m, &cols1, m, &cols2));
4470019b515eSShri Abhyankar   while (i < m) {                       /* For each row */
4471019b515eSShri Abhyankar     nzl1 = ai[i + 1] - ai[i];           /* Number of nonzeros in L */
4472019b515eSShri Abhyankar     nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/
4473019b515eSShri Abhyankar     nzx  = nzl1 + nzu1 + 1;
44743ba16761SJacob Faibussowitsch     PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i));
4475019b515eSShri Abhyankar 
4476019b515eSShri Abhyankar     /* Limits the number of elements in a node to 'a->inode.limit' */
4477019b515eSShri Abhyankar     for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4478019b515eSShri Abhyankar       nzl2 = ai[j + 1] - ai[j];
4479019b515eSShri Abhyankar       nzu2 = adiag[j] - adiag[j + 1] - 1;
4480019b515eSShri Abhyankar       nzy  = nzl2 + nzu2 + 1;
4481019b515eSShri Abhyankar       if (nzy != nzx) break;
44829566063dSJacob Faibussowitsch       PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j));
44839566063dSJacob Faibussowitsch       PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag));
44848758e1faSBarry Smith       if (!flag) break;
4485019b515eSShri Abhyankar     }
4486*4d12350bSJunchao Zhang     ns[node_count + 1] = ns[node_count] + blk_size;
4487*4d12350bSJunchao Zhang     node_count++;
4488019b515eSShri Abhyankar     i = j;
4489019b515eSShri Abhyankar   }
44909566063dSJacob Faibussowitsch   PetscCall(PetscFree2(cols1, cols2));
4491019b515eSShri Abhyankar   /* If not enough inodes found,, do not use inode version of the routines */
4492be6adb11SBarry Smith   if (!m || node_count > .8 * m) {
44939566063dSJacob Faibussowitsch     PetscCall(PetscFree(ns));
44942205254eSKarl Rupp 
4495019b515eSShri Abhyankar     a->inode.node_count = 0;
4496*4d12350bSJunchao Zhang     a->inode.size_csr   = NULL;
4497019b515eSShri Abhyankar     a->inode.use        = PETSC_FALSE;
44982205254eSKarl Rupp 
44999566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4500019b515eSShri Abhyankar   } else {
4501f4259b30SLisandro Dalcin     A->ops->mult              = NULL;
4502f4259b30SLisandro Dalcin     A->ops->sor               = NULL;
4503f4259b30SLisandro Dalcin     A->ops->multadd           = NULL;
4504f4259b30SLisandro Dalcin     A->ops->getrowij          = NULL;
4505f4259b30SLisandro Dalcin     A->ops->restorerowij      = NULL;
4506f4259b30SLisandro Dalcin     A->ops->getcolumnij       = NULL;
4507f4259b30SLisandro Dalcin     A->ops->restorecolumnij   = NULL;
4508f4259b30SLisandro Dalcin     A->ops->coloringpatch     = NULL;
4509f4259b30SLisandro Dalcin     A->ops->multdiagonalblock = NULL;
4510019b515eSShri Abhyankar     a->inode.node_count       = node_count;
4511*4d12350bSJunchao Zhang     a->inode.size_csr         = ns;
45129566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4513019b515eSShri Abhyankar   }
4514be6adb11SBarry Smith   a->inode.checked = PETSC_TRUE;
45153ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4516019b515eSShri Abhyankar }
4517019b515eSShri Abhyankar 
4518d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4519d71ae5a4SJacob Faibussowitsch {
4520acf2f550SJed Brown   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4521acf2f550SJed Brown 
4522acf2f550SJed Brown   PetscFunctionBegin;
4523acf2f550SJed Brown   a->inode.ibdiagvalid = PETSC_FALSE;
45243ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4525acf2f550SJed Brown }
4526acf2f550SJed Brown 
45274c1414c8SBarry Smith /*
45284c1414c8SBarry Smith      This is really ugly. if inodes are used this replaces the
45294c1414c8SBarry Smith   permutations with ones that correspond to rows/cols of the matrix
4530467446fbSPierre Jolivet   rather than inode blocks
45314c1414c8SBarry Smith */
4532d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm)
4533d71ae5a4SJacob Faibussowitsch {
45344c1414c8SBarry Smith   PetscFunctionBegin;
4535cac4c232SBarry Smith   PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm));
45363ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
45374c1414c8SBarry Smith }
45384c1414c8SBarry Smith 
4539d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm)
4540d71ae5a4SJacob Faibussowitsch {
45414c1414c8SBarry Smith   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
45425d0c19d7SBarry Smith   PetscInt        m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count;
45435d0c19d7SBarry Smith   const PetscInt *ridx, *cidx;
4544*4d12350bSJunchao Zhang   PetscInt        row, col, *permr, *permc, *ns_row = a->inode.size_csr, *tns, start_val, end_val, indx;
45454c1414c8SBarry Smith   PetscInt        nslim_col, *ns_col;
45464c1414c8SBarry Smith   IS              ris = *rperm, cis = *cperm;
45474c1414c8SBarry Smith 
45484c1414c8SBarry Smith   PetscFunctionBegin;
4549*4d12350bSJunchao Zhang   if (!a->inode.size_csr) PetscFunctionReturn(PETSC_SUCCESS);       /* no inodes so return */
45503ba16761SJacob Faibussowitsch   if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */
45514c1414c8SBarry Smith 
45529566063dSJacob Faibussowitsch   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
455332603206SJames Wright   PetscCall(PetscMalloc1(((nslim_row > nslim_col ? nslim_row : nslim_col) + 1), &tns));
45549566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(m, &permr, n, &permc));
45554c1414c8SBarry Smith 
45569566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(ris, &ridx));
45579566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(cis, &cidx));
45584c1414c8SBarry Smith 
4559baca6076SPierre Jolivet   /* Form the inode structure for the rows of permuted matrix using inv perm*/
4560*4d12350bSJunchao Zhang   for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + (ns_row[i + 1] - ns_row[i]);
45614c1414c8SBarry Smith 
45624c1414c8SBarry Smith   /* Construct the permutations for rows*/
45634c1414c8SBarry Smith   for (i = 0, row = 0; i < nslim_row; ++i) {
45644c1414c8SBarry Smith     indx      = ridx[i];
45654c1414c8SBarry Smith     start_val = tns[indx];
45664c1414c8SBarry Smith     end_val   = tns[indx + 1];
45674c1414c8SBarry Smith     for (j = start_val; j < end_val; ++j, ++row) permr[row] = j;
45684c1414c8SBarry Smith   }
45694c1414c8SBarry Smith 
45704c1414c8SBarry Smith   /* Form the inode structure for the columns of permuted matrix using inv perm*/
4571*4d12350bSJunchao Zhang   for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + (ns_col[i + 1] - ns_col[i]);
45724c1414c8SBarry Smith 
45734c1414c8SBarry Smith   /* Construct permutations for columns */
45744c1414c8SBarry Smith   for (i = 0, col = 0; i < nslim_col; ++i) {
45754c1414c8SBarry Smith     indx      = cidx[i];
45764c1414c8SBarry Smith     start_val = tns[indx];
45774c1414c8SBarry Smith     end_val   = tns[indx + 1];
45784c1414c8SBarry Smith     for (j = start_val; j < end_val; ++j, ++col) permc[col] = j;
45794c1414c8SBarry Smith   }
45804c1414c8SBarry Smith 
45819566063dSJacob Faibussowitsch   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm));
45829566063dSJacob Faibussowitsch   PetscCall(ISSetPermutation(*rperm));
45839566063dSJacob Faibussowitsch   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm));
45849566063dSJacob Faibussowitsch   PetscCall(ISSetPermutation(*cperm));
45854c1414c8SBarry Smith 
45869566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(ris, &ridx));
45879566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(cis, &cidx));
45884c1414c8SBarry Smith 
45899566063dSJacob Faibussowitsch   PetscCall(PetscFree(ns_col));
45909566063dSJacob Faibussowitsch   PetscCall(PetscFree2(permr, permc));
45919566063dSJacob Faibussowitsch   PetscCall(ISDestroy(&cis));
45929566063dSJacob Faibussowitsch   PetscCall(ISDestroy(&ris));
45939566063dSJacob Faibussowitsch   PetscCall(PetscFree(tns));
45943ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
45954c1414c8SBarry Smith }
45964c1414c8SBarry Smith 
45974c1414c8SBarry Smith /*@C
459811a5261eSBarry Smith   MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes
45994c1414c8SBarry Smith 
46003f9fe445SBarry Smith   Not Collective
46014c1414c8SBarry Smith 
46024c1414c8SBarry Smith   Input Parameter:
460311a5261eSBarry Smith . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ`
46044c1414c8SBarry Smith 
4605d8d19677SJose E. Roman   Output Parameters:
46064c1414c8SBarry Smith + node_count - no of inodes present in the matrix.
46072ef1f0ffSBarry Smith . sizes      - an array of size `node_count`, with the sizes of each inode.
46084c1414c8SBarry Smith - limit      - the max size used to generate the inodes.
46094c1414c8SBarry Smith 
46104c1414c8SBarry Smith   Level: advanced
46114c1414c8SBarry Smith 
461211a5261eSBarry Smith   Note:
46134c1414c8SBarry Smith   It should be called after the matrix is assembled.
46144c1414c8SBarry Smith   The contents of the sizes[] array should not be changed.
46152ef1f0ffSBarry Smith   `NULL` may be passed for information not needed
46164c1414c8SBarry Smith 
46171cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatGetInfo()`
46184c1414c8SBarry Smith @*/
4619d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4620d71ae5a4SJacob Faibussowitsch {
46215f80ce2aSJacob Faibussowitsch   PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *);
46224c1414c8SBarry Smith 
46234c1414c8SBarry Smith   PetscFunctionBegin;
46245f80ce2aSJacob Faibussowitsch   PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
46259566063dSJacob Faibussowitsch   PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f));
46269566063dSJacob Faibussowitsch   if (f) PetscCall((*f)(A, node_count, sizes, limit));
46273ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
46284c1414c8SBarry Smith }
46294c1414c8SBarry Smith 
4630d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4631d71ae5a4SJacob Faibussowitsch {
46324c1414c8SBarry Smith   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
46334c1414c8SBarry Smith 
46344c1414c8SBarry Smith   PetscFunctionBegin;
46354c1414c8SBarry Smith   if (node_count) *node_count = a->inode.node_count;
4636*4d12350bSJunchao Zhang   if (sizes) *sizes = a->inode.size_csr;
46374c1414c8SBarry Smith   if (limit) *limit = a->inode.limit;
46383ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
46394c1414c8SBarry Smith }
4640