xref: /petsc/src/mat/impls/aij/seq/inode.c (revision 35cb6cd333087cc89d8d5031932d4f38af02614d)
14c1414c8SBarry Smith 
/*
  This file provides high performance routines for the Inode format (compressed sparse row)
  by taking advantage of rows with identical nonzero structure (I-nodes).
*/
6c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h>
74c1414c8SBarry Smith 
8d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns)
9d71ae5a4SJacob Faibussowitsch {
104c1414c8SBarry Smith   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
114c1414c8SBarry Smith   PetscInt    i, count, m, n, min_mn, *ns_row, *ns_col;
124c1414c8SBarry Smith 
134c1414c8SBarry Smith   PetscFunctionBegin;
14d0f46423SBarry Smith   n = A->cmap->n;
15d0f46423SBarry Smith   m = A->rmap->n;
1608401ef6SPierre Jolivet   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
174c1414c8SBarry Smith   ns_row = a->inode.size;
184c1414c8SBarry Smith 
194c1414c8SBarry Smith   min_mn = (m < n) ? m : n;
204c1414c8SBarry Smith   if (!ns) {
219371c9d4SSatish Balay     for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++)
229371c9d4SSatish Balay       ;
239371c9d4SSatish Balay     for (; count + 1 < n; count++, i++)
249371c9d4SSatish Balay       ;
25ad540459SPierre Jolivet     if (count < n) i++;
264c1414c8SBarry Smith     *size = i;
274c1414c8SBarry Smith     PetscFunctionReturn(0);
284c1414c8SBarry Smith   }
299566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(n + 1, &ns_col));
304c1414c8SBarry Smith 
314c1414c8SBarry Smith   /* Use the same row structure wherever feasible. */
32ad540459SPierre Jolivet   for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++) ns_col[i] = ns_row[i];
334c1414c8SBarry Smith 
344c1414c8SBarry Smith   /* if m < n; pad up the remainder with inode_limit */
35ad540459SPierre Jolivet   for (; count + 1 < n; count++, i++) ns_col[i] = 1;
364c1414c8SBarry Smith   /* The last node is the odd ball. padd it up with the remaining rows; */
374c1414c8SBarry Smith   if (count < n) {
384c1414c8SBarry Smith     ns_col[i] = n - count;
394c1414c8SBarry Smith     i++;
404c1414c8SBarry Smith   } else if (count > n) {
414c1414c8SBarry Smith     /* Adjust for the over estimation */
424c1414c8SBarry Smith     ns_col[i - 1] += n - count;
434c1414c8SBarry Smith   }
444c1414c8SBarry Smith   *size = i;
454c1414c8SBarry Smith   *ns   = ns_col;
464c1414c8SBarry Smith   PetscFunctionReturn(0);
474c1414c8SBarry Smith }
484c1414c8SBarry Smith 
/*
      This builds the symmetric version of the nonzero structure.
*/
52d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
53d71ae5a4SJacob Faibussowitsch {
544c1414c8SBarry Smith   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
558758e1faSBarry Smith   PetscInt       *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n;
568758e1faSBarry Smith   PetscInt       *tns, *tvc, *ns_row = a->inode.size, *ns_col, nsz, i1, i2;
578758e1faSBarry Smith   const PetscInt *j, *jmax, *ai = a->i, *aj = a->j;
584c1414c8SBarry Smith 
594c1414c8SBarry Smith   PetscFunctionBegin;
604c1414c8SBarry Smith   nslim_row = a->inode.node_count;
61d0f46423SBarry Smith   m         = A->rmap->n;
62d0f46423SBarry Smith   n         = A->cmap->n;
6308401ef6SPierre Jolivet   PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
6408401ef6SPierre Jolivet   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
654c1414c8SBarry Smith 
664c1414c8SBarry Smith   /* Use the row_inode as column_inode */
674c1414c8SBarry Smith   nslim_col = nslim_row;
684c1414c8SBarry Smith   ns_col    = ns_row;
694c1414c8SBarry Smith 
70*35cb6cd3SPierre Jolivet   /* allocate space for reformatted inode structure */
719566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
724c1414c8SBarry Smith   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_row[i1];
734c1414c8SBarry Smith 
744c1414c8SBarry Smith   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
754c1414c8SBarry Smith     nsz = ns_col[i1];
762205254eSKarl Rupp     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
774c1414c8SBarry Smith   }
784c1414c8SBarry Smith   /* allocate space for row pointers */
799566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
804c1414c8SBarry Smith   *iia = ia;
819566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nslim_row + 1, &work));
824c1414c8SBarry Smith 
834c1414c8SBarry Smith   /* determine the number of columns in each row */
844c1414c8SBarry Smith   ia[0] = oshift;
854c1414c8SBarry Smith   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
864c1414c8SBarry Smith     j    = aj + ai[row] + ishift;
874c1414c8SBarry Smith     jmax = aj + ai[row + 1] + ishift;
8883fed2edSSatish Balay     if (j == jmax) continue; /* empty row */
894c1414c8SBarry Smith     col = *j++ + ishift;
904c1414c8SBarry Smith     i2  = tvc[col];
916aad120cSJose E. Roman     while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */
924c1414c8SBarry Smith       ia[i1 + 1]++;
934c1414c8SBarry Smith       ia[i2 + 1]++;
944c1414c8SBarry Smith       i2++; /* Start col of next node */
9590d2dec7SBarry Smith       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j;
964c1414c8SBarry Smith       i2 = tvc[col];
974c1414c8SBarry Smith     }
984c1414c8SBarry Smith     if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */
994c1414c8SBarry Smith   }
1004c1414c8SBarry Smith 
1014c1414c8SBarry Smith   /* shift ia[i] to point to next row */
1024c1414c8SBarry Smith   for (i1 = 1; i1 < nslim_row + 1; i1++) {
1034c1414c8SBarry Smith     row = ia[i1 - 1];
1044c1414c8SBarry Smith     ia[i1] += row;
1054c1414c8SBarry Smith     work[i1 - 1] = row - oshift;
1064c1414c8SBarry Smith   }
1074c1414c8SBarry Smith 
1084c1414c8SBarry Smith   /* allocate space for column pointers */
1094c1414c8SBarry Smith   nz = ia[nslim_row] + (!ishift);
1109566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nz, &ja));
1114c1414c8SBarry Smith   *jja = ja;
1124c1414c8SBarry Smith 
1134c1414c8SBarry Smith   /* loop over lower triangular part putting into ja */
1144c1414c8SBarry Smith   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
1154c1414c8SBarry Smith     j    = aj + ai[row] + ishift;
1164c1414c8SBarry Smith     jmax = aj + ai[row + 1] + ishift;
11783fed2edSSatish Balay     if (j == jmax) continue; /* empty row */
1184c1414c8SBarry Smith     col = *j++ + ishift;
1194c1414c8SBarry Smith     i2  = tvc[col];
1204c1414c8SBarry Smith     while (i2 < i1 && j < jmax) {
1214c1414c8SBarry Smith       ja[work[i2]++] = i1 + oshift;
1224c1414c8SBarry Smith       ja[work[i1]++] = i2 + oshift;
1234c1414c8SBarry Smith       ++i2;
12490d2dec7SBarry Smith       while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */
1254c1414c8SBarry Smith       i2 = tvc[col];
1264c1414c8SBarry Smith     }
1274c1414c8SBarry Smith     if (i2 == i1) ja[work[i1]++] = i2 + oshift;
1284c1414c8SBarry Smith   }
1299566063dSJacob Faibussowitsch   PetscCall(PetscFree(work));
1309566063dSJacob Faibussowitsch   PetscCall(PetscFree2(tns, tvc));
1314c1414c8SBarry Smith   PetscFunctionReturn(0);
1324c1414c8SBarry Smith }
1334c1414c8SBarry Smith 
/*
      This builds the nonsymmetric version of the nonzero structure.
*/
137d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
138d71ae5a4SJacob Faibussowitsch {
1394c1414c8SBarry Smith   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
1408758e1faSBarry Smith   PetscInt       *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col;
1418758e1faSBarry Smith   PetscInt       *tns, *tvc, nsz, i1, i2;
1428758e1faSBarry Smith   const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size;
1434c1414c8SBarry Smith 
1444c1414c8SBarry Smith   PetscFunctionBegin;
14508401ef6SPierre Jolivet   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
1464c1414c8SBarry Smith   nslim_row = a->inode.node_count;
147d0f46423SBarry Smith   n         = A->cmap->n;
1484c1414c8SBarry Smith 
1494c1414c8SBarry Smith   /* Create The column_inode for this matrix */
1509566063dSJacob Faibussowitsch   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
1514c1414c8SBarry Smith 
152*35cb6cd3SPierre Jolivet   /* allocate space for reformatted column_inode structure */
1539566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
1544c1414c8SBarry Smith   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1];
1554c1414c8SBarry Smith 
1564c1414c8SBarry Smith   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
1574c1414c8SBarry Smith     nsz = ns_col[i1];
1582205254eSKarl Rupp     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
1594c1414c8SBarry Smith   }
1604c1414c8SBarry Smith   /* allocate space for row pointers */
1619566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(nslim_row + 1, &ia));
1624c1414c8SBarry Smith   *iia = ia;
1639566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nslim_row + 1, &work));
1644c1414c8SBarry Smith 
1654c1414c8SBarry Smith   /* determine the number of columns in each row */
1664c1414c8SBarry Smith   ia[0] = oshift;
1674c1414c8SBarry Smith   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
1684c1414c8SBarry Smith     j  = aj + ai[row] + ishift;
16983fed2edSSatish Balay     nz = ai[row + 1] - ai[row];
17083fed2edSSatish Balay     if (!nz) continue; /* empty row */
1714c1414c8SBarry Smith     col = *j++ + ishift;
1724c1414c8SBarry Smith     i2  = tvc[col];
1736aad120cSJose E. Roman     while (nz-- > 0) { /* off-diagonal elements */
1744c1414c8SBarry Smith       ia[i1 + 1]++;
1754c1414c8SBarry Smith       i2++; /* Start col of next node */
176a8e3a797SJed Brown       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
1774c1414c8SBarry Smith       if (nz > 0) i2 = tvc[col];
1784c1414c8SBarry Smith     }
1794c1414c8SBarry Smith   }
1804c1414c8SBarry Smith 
1814c1414c8SBarry Smith   /* shift ia[i] to point to next row */
1824c1414c8SBarry Smith   for (i1 = 1; i1 < nslim_row + 1; i1++) {
1834c1414c8SBarry Smith     row = ia[i1 - 1];
1844c1414c8SBarry Smith     ia[i1] += row;
1854c1414c8SBarry Smith     work[i1 - 1] = row - oshift;
1864c1414c8SBarry Smith   }
1874c1414c8SBarry Smith 
1884c1414c8SBarry Smith   /* allocate space for column pointers */
1894c1414c8SBarry Smith   nz = ia[nslim_row] + (!ishift);
1909566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nz, &ja));
1914c1414c8SBarry Smith   *jja = ja;
1924c1414c8SBarry Smith 
1934c1414c8SBarry Smith   /* loop over matrix putting into ja */
1944c1414c8SBarry Smith   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
1954c1414c8SBarry Smith     j  = aj + ai[row] + ishift;
19683fed2edSSatish Balay     nz = ai[row + 1] - ai[row];
19783fed2edSSatish Balay     if (!nz) continue; /* empty row */
1984c1414c8SBarry Smith     col = *j++ + ishift;
1994c1414c8SBarry Smith     i2  = tvc[col];
2004c1414c8SBarry Smith     while (nz-- > 0) {
2014c1414c8SBarry Smith       ja[work[i1]++] = i2 + oshift;
2024c1414c8SBarry Smith       ++i2;
203a8e3a797SJed Brown       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
2044c1414c8SBarry Smith       if (nz > 0) i2 = tvc[col];
2054c1414c8SBarry Smith     }
2064c1414c8SBarry Smith   }
2079566063dSJacob Faibussowitsch   PetscCall(PetscFree(ns_col));
2089566063dSJacob Faibussowitsch   PetscCall(PetscFree(work));
2099566063dSJacob Faibussowitsch   PetscCall(PetscFree2(tns, tvc));
2104c1414c8SBarry Smith   PetscFunctionReturn(0);
2114c1414c8SBarry Smith }
2124c1414c8SBarry Smith 
213d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
214d71ae5a4SJacob Faibussowitsch {
2154c1414c8SBarry Smith   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2164c1414c8SBarry Smith 
2174c1414c8SBarry Smith   PetscFunctionBegin;
21850ba90b4SBarry Smith   if (n) *n = a->inode.node_count;
2194c1414c8SBarry Smith   if (!ia) PetscFunctionReturn(0);
2208f7157efSSatish Balay   if (!blockcompressed) {
2219566063dSJacob Faibussowitsch     PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
2228f7157efSSatish Balay   } else if (symmetric) {
2239566063dSJacob Faibussowitsch     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
2244c1414c8SBarry Smith   } else {
2259566063dSJacob Faibussowitsch     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
2264c1414c8SBarry Smith   }
2274c1414c8SBarry Smith   PetscFunctionReturn(0);
2284c1414c8SBarry Smith }
2294c1414c8SBarry Smith 
230d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
231d71ae5a4SJacob Faibussowitsch {
2324c1414c8SBarry Smith   PetscFunctionBegin;
2334c1414c8SBarry Smith   if (!ia) PetscFunctionReturn(0);
2348f7157efSSatish Balay 
2358f7157efSSatish Balay   if (!blockcompressed) {
2369566063dSJacob Faibussowitsch     PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
2378f7157efSSatish Balay   } else {
2389566063dSJacob Faibussowitsch     PetscCall(PetscFree(*ia));
2399566063dSJacob Faibussowitsch     PetscCall(PetscFree(*ja));
2408f7157efSSatish Balay   }
2414c1414c8SBarry Smith   PetscFunctionReturn(0);
2424c1414c8SBarry Smith }
2434c1414c8SBarry Smith 
2444c1414c8SBarry Smith /* ----------------------------------------------------------- */
2454c1414c8SBarry Smith 
246d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
247d71ae5a4SJacob Faibussowitsch {
2484c1414c8SBarry Smith   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2494c1414c8SBarry Smith   PetscInt   *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col;
2504c1414c8SBarry Smith   PetscInt   *tns, *tvc, *ns_row = a->inode.size, nsz, i1, i2, *ai = a->i, *aj = a->j;
2514c1414c8SBarry Smith 
2524c1414c8SBarry Smith   PetscFunctionBegin;
25308401ef6SPierre Jolivet   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2544c1414c8SBarry Smith   nslim_row = a->inode.node_count;
255d0f46423SBarry Smith   n         = A->cmap->n;
2564c1414c8SBarry Smith 
2574c1414c8SBarry Smith   /* Create The column_inode for this matrix */
2589566063dSJacob Faibussowitsch   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
2594c1414c8SBarry Smith 
260*35cb6cd3SPierre Jolivet   /* allocate space for reformatted column_inode structure */
2619566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
2624c1414c8SBarry Smith   for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1];
2634c1414c8SBarry Smith 
2644c1414c8SBarry Smith   for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
2654c1414c8SBarry Smith     nsz = ns_col[i1];
2662205254eSKarl Rupp     for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
2674c1414c8SBarry Smith   }
2684c1414c8SBarry Smith   /* allocate space for column pointers */
2699566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(nslim_col + 1, &ia));
2704c1414c8SBarry Smith   *iia = ia;
2719566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nslim_col + 1, &work));
2724c1414c8SBarry Smith 
2734c1414c8SBarry Smith   /* determine the number of columns in each row */
2744c1414c8SBarry Smith   ia[0] = oshift;
2754c1414c8SBarry Smith   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
2764c1414c8SBarry Smith     j   = aj + ai[row] + ishift;
2774c1414c8SBarry Smith     col = *j++ + ishift;
2784c1414c8SBarry Smith     i2  = tvc[col];
2794c1414c8SBarry Smith     nz  = ai[row + 1] - ai[row];
2806aad120cSJose E. Roman     while (nz-- > 0) { /* off-diagonal elements */
2814c1414c8SBarry Smith       /* ia[i1+1]++; */
2824c1414c8SBarry Smith       ia[i2 + 1]++;
2834c1414c8SBarry Smith       i2++;
284a8e3a797SJed Brown       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
2854c1414c8SBarry Smith       if (nz > 0) i2 = tvc[col];
2864c1414c8SBarry Smith     }
2874c1414c8SBarry Smith   }
2884c1414c8SBarry Smith 
2894c1414c8SBarry Smith   /* shift ia[i] to point to next col */
2904c1414c8SBarry Smith   for (i1 = 1; i1 < nslim_col + 1; i1++) {
2914c1414c8SBarry Smith     col = ia[i1 - 1];
2924c1414c8SBarry Smith     ia[i1] += col;
2934c1414c8SBarry Smith     work[i1 - 1] = col - oshift;
2944c1414c8SBarry Smith   }
2954c1414c8SBarry Smith 
2964c1414c8SBarry Smith   /* allocate space for column pointers */
2974c1414c8SBarry Smith   nz = ia[nslim_col] + (!ishift);
2989566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(nz, &ja));
2994c1414c8SBarry Smith   *jja = ja;
3004c1414c8SBarry Smith 
3014c1414c8SBarry Smith   /* loop over matrix putting into ja */
3024c1414c8SBarry Smith   for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) {
3034c1414c8SBarry Smith     j   = aj + ai[row] + ishift;
3044c1414c8SBarry Smith     col = *j++ + ishift;
3054c1414c8SBarry Smith     i2  = tvc[col];
3064c1414c8SBarry Smith     nz  = ai[row + 1] - ai[row];
3074c1414c8SBarry Smith     while (nz-- > 0) {
3084c1414c8SBarry Smith       /* ja[work[i1]++] = i2 + oshift; */
3094c1414c8SBarry Smith       ja[work[i2]++] = i1 + oshift;
3104c1414c8SBarry Smith       i2++;
311a8e3a797SJed Brown       while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
3124c1414c8SBarry Smith       if (nz > 0) i2 = tvc[col];
3134c1414c8SBarry Smith     }
3144c1414c8SBarry Smith   }
3159566063dSJacob Faibussowitsch   PetscCall(PetscFree(ns_col));
3169566063dSJacob Faibussowitsch   PetscCall(PetscFree(work));
3179566063dSJacob Faibussowitsch   PetscCall(PetscFree2(tns, tvc));
3184c1414c8SBarry Smith   PetscFunctionReturn(0);
3194c1414c8SBarry Smith }
3204c1414c8SBarry Smith 
321d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
322d71ae5a4SJacob Faibussowitsch {
3234c1414c8SBarry Smith   PetscFunctionBegin;
3249566063dSJacob Faibussowitsch   PetscCall(MatCreateColInode_Private(A, n, NULL));
3254c1414c8SBarry Smith   if (!ia) PetscFunctionReturn(0);
3264c1414c8SBarry Smith 
3278f7157efSSatish Balay   if (!blockcompressed) {
3289566063dSJacob Faibussowitsch     PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
3298f7157efSSatish Balay   } else if (symmetric) {
330a5b23f4aSJose E. Roman     /* Since the indices are symmetric it doesn't matter */
3319566063dSJacob Faibussowitsch     PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
3324c1414c8SBarry Smith   } else {
3339566063dSJacob Faibussowitsch     PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
3344c1414c8SBarry Smith   }
3354c1414c8SBarry Smith   PetscFunctionReturn(0);
3364c1414c8SBarry Smith }
3374c1414c8SBarry Smith 
338d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
339d71ae5a4SJacob Faibussowitsch {
3404c1414c8SBarry Smith   PetscFunctionBegin;
3414c1414c8SBarry Smith   if (!ia) PetscFunctionReturn(0);
3428f7157efSSatish Balay   if (!blockcompressed) {
3439566063dSJacob Faibussowitsch     PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
3448f7157efSSatish Balay   } else {
3459566063dSJacob Faibussowitsch     PetscCall(PetscFree(*ia));
3469566063dSJacob Faibussowitsch     PetscCall(PetscFree(*ja));
3478f7157efSSatish Balay   }
3484c1414c8SBarry Smith   PetscFunctionReturn(0);
3494c1414c8SBarry Smith }
3504c1414c8SBarry Smith 
3514c1414c8SBarry Smith /* ----------------------------------------------------------- */
3524c1414c8SBarry Smith 
353d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy)
354d71ae5a4SJacob Faibussowitsch {
3554c1414c8SBarry Smith   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
3564c1414c8SBarry Smith   PetscScalar        sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
357d9fead3dSBarry Smith   PetscScalar       *y;
358dd6ea824SBarry Smith   const PetscScalar *x;
359dd6ea824SBarry Smith   const MatScalar   *v1, *v2, *v3, *v4, *v5;
3608758e1faSBarry Smith   PetscInt           i1, i2, n, i, row, node_max, nsz, sz, nonzerorow = 0;
3618758e1faSBarry Smith   const PetscInt    *idx, *ns, *ii;
3624c1414c8SBarry Smith 
3634c1414c8SBarry Smith #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
3644c1414c8SBarry Smith   #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5)
3654c1414c8SBarry Smith #endif
3664c1414c8SBarry Smith 
3674c1414c8SBarry Smith   PetscFunctionBegin;
36808401ef6SPierre Jolivet   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
3694c1414c8SBarry Smith   node_max = a->inode.node_count;
3704c1414c8SBarry Smith   ns       = a->inode.size; /* Node Size array */
3719566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
3729566063dSJacob Faibussowitsch   PetscCall(VecGetArray(yy, &y));
3734c1414c8SBarry Smith   idx = a->j;
3744c1414c8SBarry Smith   v1  = a->a;
3754c1414c8SBarry Smith   ii  = a->i;
3764c1414c8SBarry Smith 
3774c1414c8SBarry Smith   for (i = 0, row = 0; i < node_max; ++i) {
3784c1414c8SBarry Smith     nsz = ns[i];
3794c1414c8SBarry Smith     n   = ii[1] - ii[0];
38098c9bda7SSatish Balay     nonzerorow += (n > 0) * nsz;
3814c1414c8SBarry Smith     ii += nsz;
38250d8bf02SJed Brown     PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA);      /* Prefetch the indices for the block row after the current one */
38350d8bf02SJed Brown     PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one  */
3844c1414c8SBarry Smith     sz = n;                                                                /* No of non zeros in this row */
3854c1414c8SBarry Smith                                                                            /* Switch on the size of Node */
3864c1414c8SBarry Smith     switch (nsz) {                                                         /* Each loop in 'case' is unrolled */
3874c1414c8SBarry Smith     case 1:
38875567043SBarry Smith       sum1 = 0.;
3894c1414c8SBarry Smith 
3904c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
3914c1414c8SBarry Smith         i1 = idx[0]; /* The instructions are ordered to */
3924c1414c8SBarry Smith         i2 = idx[1]; /* make the compiler's job easy */
3934c1414c8SBarry Smith         idx += 2;
3944c1414c8SBarry Smith         tmp0 = x[i1];
3954c1414c8SBarry Smith         tmp1 = x[i2];
3969371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
3979371c9d4SSatish Balay         v1 += 2;
3984c1414c8SBarry Smith       }
3994c1414c8SBarry Smith 
4004c1414c8SBarry Smith       if (n == sz - 1) { /* Take care of the last nonzero  */
4014c1414c8SBarry Smith         tmp0 = x[*idx++];
4024c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
4034c1414c8SBarry Smith       }
4044c1414c8SBarry Smith       y[row++] = sum1;
4054c1414c8SBarry Smith       break;
4064c1414c8SBarry Smith     case 2:
40775567043SBarry Smith       sum1 = 0.;
40875567043SBarry Smith       sum2 = 0.;
4094c1414c8SBarry Smith       v2   = v1 + n;
4104c1414c8SBarry Smith 
4114c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
4124c1414c8SBarry Smith         i1 = idx[0];
4134c1414c8SBarry Smith         i2 = idx[1];
4144c1414c8SBarry Smith         idx += 2;
4154c1414c8SBarry Smith         tmp0 = x[i1];
4164c1414c8SBarry Smith         tmp1 = x[i2];
4179371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
4189371c9d4SSatish Balay         v1 += 2;
4199371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
4209371c9d4SSatish Balay         v2 += 2;
4214c1414c8SBarry Smith       }
4224c1414c8SBarry Smith       if (n == sz - 1) {
4234c1414c8SBarry Smith         tmp0 = x[*idx++];
4244c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
4254c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
4264c1414c8SBarry Smith       }
4274c1414c8SBarry Smith       y[row++] = sum1;
4284c1414c8SBarry Smith       y[row++] = sum2;
4294c1414c8SBarry Smith       v1       = v2; /* Since the next block to be processed starts there*/
4304c1414c8SBarry Smith       idx += sz;
4314c1414c8SBarry Smith       break;
4324c1414c8SBarry Smith     case 3:
43375567043SBarry Smith       sum1 = 0.;
43475567043SBarry Smith       sum2 = 0.;
43575567043SBarry Smith       sum3 = 0.;
4364c1414c8SBarry Smith       v2   = v1 + n;
4374c1414c8SBarry Smith       v3   = v2 + n;
4384c1414c8SBarry Smith 
4394c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
4404c1414c8SBarry Smith         i1 = idx[0];
4414c1414c8SBarry Smith         i2 = idx[1];
4424c1414c8SBarry Smith         idx += 2;
4434c1414c8SBarry Smith         tmp0 = x[i1];
4444c1414c8SBarry Smith         tmp1 = x[i2];
4459371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
4469371c9d4SSatish Balay         v1 += 2;
4479371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
4489371c9d4SSatish Balay         v2 += 2;
4499371c9d4SSatish Balay         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
4509371c9d4SSatish Balay         v3 += 2;
4514c1414c8SBarry Smith       }
4524c1414c8SBarry Smith       if (n == sz - 1) {
4534c1414c8SBarry Smith         tmp0 = x[*idx++];
4544c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
4554c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
4564c1414c8SBarry Smith         sum3 += *v3++ * tmp0;
4574c1414c8SBarry Smith       }
4584c1414c8SBarry Smith       y[row++] = sum1;
4594c1414c8SBarry Smith       y[row++] = sum2;
4604c1414c8SBarry Smith       y[row++] = sum3;
4614c1414c8SBarry Smith       v1       = v3; /* Since the next block to be processed starts there*/
4624c1414c8SBarry Smith       idx += 2 * sz;
4634c1414c8SBarry Smith       break;
4644c1414c8SBarry Smith     case 4:
46575567043SBarry Smith       sum1 = 0.;
46675567043SBarry Smith       sum2 = 0.;
46775567043SBarry Smith       sum3 = 0.;
46875567043SBarry Smith       sum4 = 0.;
4694c1414c8SBarry Smith       v2   = v1 + n;
4704c1414c8SBarry Smith       v3   = v2 + n;
4714c1414c8SBarry Smith       v4   = v3 + n;
4724c1414c8SBarry Smith 
4734c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
4744c1414c8SBarry Smith         i1 = idx[0];
4754c1414c8SBarry Smith         i2 = idx[1];
4764c1414c8SBarry Smith         idx += 2;
4774c1414c8SBarry Smith         tmp0 = x[i1];
4784c1414c8SBarry Smith         tmp1 = x[i2];
4799371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
4809371c9d4SSatish Balay         v1 += 2;
4819371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
4829371c9d4SSatish Balay         v2 += 2;
4839371c9d4SSatish Balay         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
4849371c9d4SSatish Balay         v3 += 2;
4859371c9d4SSatish Balay         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
4869371c9d4SSatish Balay         v4 += 2;
4874c1414c8SBarry Smith       }
4884c1414c8SBarry Smith       if (n == sz - 1) {
4894c1414c8SBarry Smith         tmp0 = x[*idx++];
4904c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
4914c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
4924c1414c8SBarry Smith         sum3 += *v3++ * tmp0;
4934c1414c8SBarry Smith         sum4 += *v4++ * tmp0;
4944c1414c8SBarry Smith       }
4954c1414c8SBarry Smith       y[row++] = sum1;
4964c1414c8SBarry Smith       y[row++] = sum2;
4974c1414c8SBarry Smith       y[row++] = sum3;
4984c1414c8SBarry Smith       y[row++] = sum4;
4994c1414c8SBarry Smith       v1       = v4; /* Since the next block to be processed starts there*/
5004c1414c8SBarry Smith       idx += 3 * sz;
5014c1414c8SBarry Smith       break;
5024c1414c8SBarry Smith     case 5:
50375567043SBarry Smith       sum1 = 0.;
50475567043SBarry Smith       sum2 = 0.;
50575567043SBarry Smith       sum3 = 0.;
50675567043SBarry Smith       sum4 = 0.;
50775567043SBarry Smith       sum5 = 0.;
5084c1414c8SBarry Smith       v2   = v1 + n;
5094c1414c8SBarry Smith       v3   = v2 + n;
5104c1414c8SBarry Smith       v4   = v3 + n;
5114c1414c8SBarry Smith       v5   = v4 + n;
5124c1414c8SBarry Smith 
5134c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
5144c1414c8SBarry Smith         i1 = idx[0];
5154c1414c8SBarry Smith         i2 = idx[1];
5164c1414c8SBarry Smith         idx += 2;
5174c1414c8SBarry Smith         tmp0 = x[i1];
5184c1414c8SBarry Smith         tmp1 = x[i2];
5199371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
5209371c9d4SSatish Balay         v1 += 2;
5219371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
5229371c9d4SSatish Balay         v2 += 2;
5239371c9d4SSatish Balay         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
5249371c9d4SSatish Balay         v3 += 2;
5259371c9d4SSatish Balay         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
5269371c9d4SSatish Balay         v4 += 2;
5279371c9d4SSatish Balay         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
5289371c9d4SSatish Balay         v5 += 2;
5294c1414c8SBarry Smith       }
5304c1414c8SBarry Smith       if (n == sz - 1) {
5314c1414c8SBarry Smith         tmp0 = x[*idx++];
5324c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
5334c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
5344c1414c8SBarry Smith         sum3 += *v3++ * tmp0;
5354c1414c8SBarry Smith         sum4 += *v4++ * tmp0;
5364c1414c8SBarry Smith         sum5 += *v5++ * tmp0;
5374c1414c8SBarry Smith       }
5384c1414c8SBarry Smith       y[row++] = sum1;
5394c1414c8SBarry Smith       y[row++] = sum2;
5404c1414c8SBarry Smith       y[row++] = sum3;
5414c1414c8SBarry Smith       y[row++] = sum4;
5424c1414c8SBarry Smith       y[row++] = sum5;
5434c1414c8SBarry Smith       v1       = v5; /* Since the next block to be processed starts there */
5444c1414c8SBarry Smith       idx += 4 * sz;
5454c1414c8SBarry Smith       break;
546d71ae5a4SJacob Faibussowitsch     default:
547d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
5484c1414c8SBarry Smith     }
5494c1414c8SBarry Smith   }
5509566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
5519566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(yy, &y));
5529566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow));
5534c1414c8SBarry Smith   PetscFunctionReturn(0);
5544c1414c8SBarry Smith }
5554c1414c8SBarry Smith /* ----------------------------------------------------------- */
5564108e4d5SBarry Smith /* Almost same code as the MatMult_SeqAIJ_Inode() */
/*
  MatMultAdd_SeqAIJ_Inode - Computes yy = zz + A * xx, processing the rows of A one inode at a time.

  Input Parameters:
  A  - the matrix; its inode layout is read from a->inode.size and a->inode.node_count
  xx - the vector that is multiplied by A
  zz - the vector that is added to the product

  Output Parameter:
  yy - receives the result, one entry per matrix row

  Notes:
  An inode is a run of ns[i] consecutive rows. The column indices are read only from the first row of
  the run and applied to every row in it; afterwards idx is advanced past the index lists of the
  remaining rows. Node sizes 1 through 5 are unrolled by hand; any other size raises PETSC_ERR_COR.

  Inside each case the columns are consumed two at a time, and a single left-over column (odd count)
  is handled by the "n == sz - 1" branch.

  Raises PETSC_ERR_COR with "Missing Inode Structure" when a->inode.size is not set.
*/
557d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy)
558d71ae5a4SJacob Faibussowitsch {
5594c1414c8SBarry Smith   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
5604c1414c8SBarry Smith   PetscScalar        sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
5618758e1faSBarry Smith   const MatScalar   *v1, *v2, *v3, *v4, *v5;
5628758e1faSBarry Smith   const PetscScalar *x;
5638758e1faSBarry Smith   PetscScalar       *y, *z, *zt;
5648758e1faSBarry Smith   PetscInt           i1, i2, n, i, row, node_max, nsz, sz;
5658758e1faSBarry Smith   const PetscInt    *idx, *ns, *ii;
5664c1414c8SBarry Smith 
5674c1414c8SBarry Smith   PetscFunctionBegin;
56808401ef6SPierre Jolivet   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
5694c1414c8SBarry Smith   node_max = a->inode.node_count;
5704c1414c8SBarry Smith   ns       = a->inode.size; /* Node Size array */
5712205254eSKarl Rupp 
5729566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(xx, &x));
  /* NOTE(review): presumably VecGetArrayPair() copes with zz and yy being the same vector -- confirm */
5739566063dSJacob Faibussowitsch   PetscCall(VecGetArrayPair(zz, yy, &z, &y));
  /* zt is the moving cursor into z; z itself stays untouched for VecRestoreArrayPair() */
5744c1414c8SBarry Smith   zt = z;
5754c1414c8SBarry Smith 
  /* idx, v1 and ii are cursors into the column-index, value and row-pointer arrays */
5764c1414c8SBarry Smith   idx = a->j;
5774c1414c8SBarry Smith   v1  = a->a;
5784c1414c8SBarry Smith   ii  = a->i;
5794c1414c8SBarry Smith 
5804c1414c8SBarry Smith   for (i = 0, row = 0; i < node_max; ++i) {
5814c1414c8SBarry Smith     nsz = ns[i];
    /* nonzero count of the first row of this node; the other rows of the node are treated as having the same count */
5824c1414c8SBarry Smith     n   = ii[1] - ii[0];
    /* move the row-pointer cursor to the first row of the next node */
5834c1414c8SBarry Smith     ii += nsz;
5844c1414c8SBarry Smith     sz = n;        /* No of non zeros in this row */
5854c1414c8SBarry Smith                    /* Switch on the size of Node */
5864c1414c8SBarry Smith     switch (nsz) { /* Each loop in 'case' is unrolled */
5874c1414c8SBarry Smith     case 1:
      /* start the accumulator from the z entry so that the stored value is z + A*x */
5884c1414c8SBarry Smith       sum1 = *zt++;
5894c1414c8SBarry Smith 
5904c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
5914c1414c8SBarry Smith         i1 = idx[0]; /* The instructions are ordered to */
5924c1414c8SBarry Smith         i2 = idx[1]; /* make the compiler's job easy */
5934c1414c8SBarry Smith         idx += 2;
5944c1414c8SBarry Smith         tmp0 = x[i1];
5954c1414c8SBarry Smith         tmp1 = x[i2];
5969371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
5979371c9d4SSatish Balay         v1 += 2;
5984c1414c8SBarry Smith       }
5994c1414c8SBarry Smith 
6004c1414c8SBarry Smith       if (n == sz - 1) { /* Take care of the last nonzero  */
6014c1414c8SBarry Smith         tmp0 = x[*idx++];
6024c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
6034c1414c8SBarry Smith       }
6044c1414c8SBarry Smith       y[row++] = sum1;
6054c1414c8SBarry Smith       break;
6064c1414c8SBarry Smith     case 2:
6074c1414c8SBarry Smith       sum1 = *zt++;
6084c1414c8SBarry Smith       sum2 = *zt++;
      /* the second row's values are taken to begin n entries after the first row's */
6094c1414c8SBarry Smith       v2   = v1 + n;
6104c1414c8SBarry Smith 
6114c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
6124c1414c8SBarry Smith         i1 = idx[0];
6134c1414c8SBarry Smith         i2 = idx[1];
6144c1414c8SBarry Smith         idx += 2;
6154c1414c8SBarry Smith         tmp0 = x[i1];
6164c1414c8SBarry Smith         tmp1 = x[i2];
6179371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
6189371c9d4SSatish Balay         v1 += 2;
6199371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
6209371c9d4SSatish Balay         v2 += 2;
6214c1414c8SBarry Smith       }
6224c1414c8SBarry Smith       if (n == sz - 1) {
6234c1414c8SBarry Smith         tmp0 = x[*idx++];
6244c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
6254c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
6264c1414c8SBarry Smith       }
6274c1414c8SBarry Smith       y[row++] = sum1;
6284c1414c8SBarry Smith       y[row++] = sum2;
6294c1414c8SBarry Smith       v1       = v2; /* Since the next block to be processed starts there*/
      /* only the first row's index list was walked; skip the list of the second row */
6304c1414c8SBarry Smith       idx += sz;
6314c1414c8SBarry Smith       break;
6324c1414c8SBarry Smith     case 3:
6334c1414c8SBarry Smith       sum1 = *zt++;
6344c1414c8SBarry Smith       sum2 = *zt++;
6354c1414c8SBarry Smith       sum3 = *zt++;
6364c1414c8SBarry Smith       v2   = v1 + n;
6374c1414c8SBarry Smith       v3   = v2 + n;
6384c1414c8SBarry Smith 
6394c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
6404c1414c8SBarry Smith         i1 = idx[0];
6414c1414c8SBarry Smith         i2 = idx[1];
6424c1414c8SBarry Smith         idx += 2;
6434c1414c8SBarry Smith         tmp0 = x[i1];
6444c1414c8SBarry Smith         tmp1 = x[i2];
6459371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
6469371c9d4SSatish Balay         v1 += 2;
6479371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
6489371c9d4SSatish Balay         v2 += 2;
6499371c9d4SSatish Balay         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
6509371c9d4SSatish Balay         v3 += 2;
6514c1414c8SBarry Smith       }
6524c1414c8SBarry Smith       if (n == sz - 1) {
6534c1414c8SBarry Smith         tmp0 = x[*idx++];
6544c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
6554c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
6564c1414c8SBarry Smith         sum3 += *v3++ * tmp0;
6574c1414c8SBarry Smith       }
6584c1414c8SBarry Smith       y[row++] = sum1;
6594c1414c8SBarry Smith       y[row++] = sum2;
6604c1414c8SBarry Smith       y[row++] = sum3;
6614c1414c8SBarry Smith       v1       = v3; /* Since the next block to be processed starts there*/
      /* skip the index lists of rows 2 and 3 of the node */
6624c1414c8SBarry Smith       idx += 2 * sz;
6634c1414c8SBarry Smith       break;
6644c1414c8SBarry Smith     case 4:
6654c1414c8SBarry Smith       sum1 = *zt++;
6664c1414c8SBarry Smith       sum2 = *zt++;
6674c1414c8SBarry Smith       sum3 = *zt++;
6684c1414c8SBarry Smith       sum4 = *zt++;
6694c1414c8SBarry Smith       v2   = v1 + n;
6704c1414c8SBarry Smith       v3   = v2 + n;
6714c1414c8SBarry Smith       v4   = v3 + n;
6724c1414c8SBarry Smith 
6734c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
6744c1414c8SBarry Smith         i1 = idx[0];
6754c1414c8SBarry Smith         i2 = idx[1];
6764c1414c8SBarry Smith         idx += 2;
6774c1414c8SBarry Smith         tmp0 = x[i1];
6784c1414c8SBarry Smith         tmp1 = x[i2];
6799371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
6809371c9d4SSatish Balay         v1 += 2;
6819371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
6829371c9d4SSatish Balay         v2 += 2;
6839371c9d4SSatish Balay         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
6849371c9d4SSatish Balay         v3 += 2;
6859371c9d4SSatish Balay         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
6869371c9d4SSatish Balay         v4 += 2;
6874c1414c8SBarry Smith       }
6884c1414c8SBarry Smith       if (n == sz - 1) {
6894c1414c8SBarry Smith         tmp0 = x[*idx++];
6904c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
6914c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
6924c1414c8SBarry Smith         sum3 += *v3++ * tmp0;
6934c1414c8SBarry Smith         sum4 += *v4++ * tmp0;
6944c1414c8SBarry Smith       }
6954c1414c8SBarry Smith       y[row++] = sum1;
6964c1414c8SBarry Smith       y[row++] = sum2;
6974c1414c8SBarry Smith       y[row++] = sum3;
6984c1414c8SBarry Smith       y[row++] = sum4;
6994c1414c8SBarry Smith       v1       = v4; /* Since the next block to be processed starts there*/
      /* skip the index lists of rows 2 to 4 of the node */
7004c1414c8SBarry Smith       idx += 3 * sz;
7014c1414c8SBarry Smith       break;
7024c1414c8SBarry Smith     case 5:
7034c1414c8SBarry Smith       sum1 = *zt++;
7044c1414c8SBarry Smith       sum2 = *zt++;
7054c1414c8SBarry Smith       sum3 = *zt++;
7064c1414c8SBarry Smith       sum4 = *zt++;
7074c1414c8SBarry Smith       sum5 = *zt++;
7084c1414c8SBarry Smith       v2   = v1 + n;
7094c1414c8SBarry Smith       v3   = v2 + n;
7104c1414c8SBarry Smith       v4   = v3 + n;
7114c1414c8SBarry Smith       v5   = v4 + n;
7124c1414c8SBarry Smith 
7134c1414c8SBarry Smith       for (n = 0; n < sz - 1; n += 2) {
7144c1414c8SBarry Smith         i1 = idx[0];
7154c1414c8SBarry Smith         i2 = idx[1];
7164c1414c8SBarry Smith         idx += 2;
7174c1414c8SBarry Smith         tmp0 = x[i1];
7184c1414c8SBarry Smith         tmp1 = x[i2];
7199371c9d4SSatish Balay         sum1 += v1[0] * tmp0 + v1[1] * tmp1;
7209371c9d4SSatish Balay         v1 += 2;
7219371c9d4SSatish Balay         sum2 += v2[0] * tmp0 + v2[1] * tmp1;
7229371c9d4SSatish Balay         v2 += 2;
7239371c9d4SSatish Balay         sum3 += v3[0] * tmp0 + v3[1] * tmp1;
7249371c9d4SSatish Balay         v3 += 2;
7259371c9d4SSatish Balay         sum4 += v4[0] * tmp0 + v4[1] * tmp1;
7269371c9d4SSatish Balay         v4 += 2;
7279371c9d4SSatish Balay         sum5 += v5[0] * tmp0 + v5[1] * tmp1;
7289371c9d4SSatish Balay         v5 += 2;
7294c1414c8SBarry Smith       }
7304c1414c8SBarry Smith       if (n == sz - 1) {
7314c1414c8SBarry Smith         tmp0 = x[*idx++];
7324c1414c8SBarry Smith         sum1 += *v1++ * tmp0;
7334c1414c8SBarry Smith         sum2 += *v2++ * tmp0;
7344c1414c8SBarry Smith         sum3 += *v3++ * tmp0;
7354c1414c8SBarry Smith         sum4 += *v4++ * tmp0;
7364c1414c8SBarry Smith         sum5 += *v5++ * tmp0;
7374c1414c8SBarry Smith       }
7384c1414c8SBarry Smith       y[row++] = sum1;
7394c1414c8SBarry Smith       y[row++] = sum2;
7404c1414c8SBarry Smith       y[row++] = sum3;
7414c1414c8SBarry Smith       y[row++] = sum4;
7424c1414c8SBarry Smith       y[row++] = sum5;
7434c1414c8SBarry Smith       v1       = v5; /* Since the next block to be processed starts there */
      /* skip the index lists of rows 2 to 5 of the node */
7444c1414c8SBarry Smith       idx += 4 * sz;
7454c1414c8SBarry Smith       break;
746d71ae5a4SJacob Faibussowitsch     default:
      /* NOTE(review): presumably SETERRQ returns at once, leaving the arrays obtained above un-restored -- confirm this is acceptable */
747d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
7484c1414c8SBarry Smith     }
7494c1414c8SBarry Smith   }
7509566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(xx, &x));
7519566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayPair(zz, yy, &z, &y));
  /* logged flop count: 2 per stored nonzero */
7529566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz));
7534c1414c8SBarry Smith   PetscFunctionReturn(0);
7544c1414c8SBarry Smith }
7554c1414c8SBarry Smith 
7564c1414c8SBarry Smith /* ----------------------------------------------------------- */
/*
  MatSolve_SeqAIJ_Inode_inplace - Forward sweep followed by backward sweep over the triangular parts
  stored in A, processing the rows one inode (run of ns[i] consecutive rows) at a time.

  Input Parameters:
  A  - the matrix; reads a->i, a->j, a->a, a->diag, a->row, a->col, a->solve_work and the inode arrays
  bb - right-hand side, read through the indices of a->row

  Output Parameter:
  xx - solution, written through the indices of a->col (walked from the last index to the first)

  Notes:
  a->diag[row] is used as the offset of the diagonal entry of a row inside a->j / a->a. The entries
  before it are consumed by the forward sweep and the entries after it by the backward sweep.

  a->solve_work (tmp, also aliased as tmps) holds the intermediate vector: the forward sweep fills it
  and the backward sweep overwrites it in place while also writing xx.

  Each unknown is finished as sum * a_a[ad[row]], i.e. a multiplication by the stored diagonal entry;
  presumably the factorization stores the inverted pivot there -- TODO confirm.

  Node sizes 1 through 5 are unrolled by hand; any other size raises PETSC_ERR_COR.
  Raises PETSC_ERR_COR with "Missing Inode Structure" when a->inode.size is not set.
*/
757d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx)
758d71ae5a4SJacob Faibussowitsch {
7594c1414c8SBarry Smith   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
7604c1414c8SBarry Smith   IS                 iscol = a->col, isrow = a->row;
7615d0c19d7SBarry Smith   const PetscInt    *r, *c, *rout, *cout;
7628758e1faSBarry Smith   PetscInt           i, j, n = A->rmap->n, nz;
7638758e1faSBarry Smith   PetscInt           node_max, *ns, row, nsz, aii, i0, i1;
7648758e1faSBarry Smith   const PetscInt    *ai = a->i, *a_j = a->j, *vi, *ad, *aj;
765d9fead3dSBarry Smith   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
766d9fead3dSBarry Smith   PetscScalar        sum1, sum2, sum3, sum4, sum5;
767dd6ea824SBarry Smith   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
768dd6ea824SBarry Smith   const PetscScalar *b;
7694c1414c8SBarry Smith 
7704c1414c8SBarry Smith   PetscFunctionBegin;
77108401ef6SPierre Jolivet   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
7724c1414c8SBarry Smith   node_max = a->inode.node_count;
7734c1414c8SBarry Smith   ns       = a->inode.size; /* Node Size array */
7744c1414c8SBarry Smith 
7759566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
7769566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(xx, &x));
7774c1414c8SBarry Smith   tmp = a->solve_work;
7784c1414c8SBarry Smith 
  /* r walks the row indices forward; c starts at the last column index and walks backward */
7799371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
7809371c9d4SSatish Balay   r = rout;
7819371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
7829371c9d4SSatish Balay   c = cout + (n - 1);
7834c1414c8SBarry Smith 
7844c1414c8SBarry Smith   /* forward solve the lower triangular */
7854c1414c8SBarry Smith   tmps = tmp;
7864c1414c8SBarry Smith   aa   = a_a;
7874c1414c8SBarry Smith   aj   = a_j;
7884c1414c8SBarry Smith   ad   = a->diag;
7894c1414c8SBarry Smith 
7904c1414c8SBarry Smith   for (i = 0, row = 0; i < node_max; ++i) {
7914c1414c8SBarry Smith     nsz = ns[i];
7924c1414c8SBarry Smith     aii = ai[row];
7934c1414c8SBarry Smith     v1  = aa + aii;
7944c1414c8SBarry Smith     vi  = aj + aii;
    /* number of entries of the node's first row that lie before its diagonal entry */
7954c1414c8SBarry Smith     nz  = ad[row] - aii;
79626549573SJed Brown     if (i < node_max - 1) {
79726549573SJed Brown       /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
79826549573SJed Brown       * but our indexing to determine its size could. */
79950d8bf02SJed Brown       PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
80026549573SJed Brown       /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
80150d8bf02SJed Brown       PetscPrefetchBlock(aa + ai[row + nsz], ad[row + nsz + ns[i + 1] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
80226549573SJed Brown       /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
80326549573SJed Brown     }
8044c1414c8SBarry Smith 
8054c1414c8SBarry Smith     switch (nsz) { /* Each loop in 'case' is unrolled */
8064c1414c8SBarry Smith     case 1:
8074c1414c8SBarry Smith       sum1 = b[*r++];
8084c1414c8SBarry Smith       for (j = 0; j < nz - 1; j += 2) {
8094c1414c8SBarry Smith         i0 = vi[0];
8104c1414c8SBarry Smith         i1 = vi[1];
8114c1414c8SBarry Smith         vi += 2;
8124c1414c8SBarry Smith         tmp0 = tmps[i0];
8134c1414c8SBarry Smith         tmp1 = tmps[i1];
8149371c9d4SSatish Balay         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
8159371c9d4SSatish Balay         v1 += 2;
8164c1414c8SBarry Smith       }
8174c1414c8SBarry Smith       if (j == nz - 1) {
8184c1414c8SBarry Smith         tmp0 = tmps[*vi++];
8194c1414c8SBarry Smith         sum1 -= *v1++ * tmp0;
8204c1414c8SBarry Smith       }
8214c1414c8SBarry Smith       tmp[row++] = sum1;
8224c1414c8SBarry Smith       break;
8234c1414c8SBarry Smith     case 2:
8244c1414c8SBarry Smith       sum1 = b[*r++];
8254c1414c8SBarry Smith       sum2 = b[*r++];
8264c1414c8SBarry Smith       v2   = aa + ai[row + 1];
8274c1414c8SBarry Smith 
      /* the first nz columns are walked once (through vi) and applied to both rows of the node */
8284c1414c8SBarry Smith       for (j = 0; j < nz - 1; j += 2) {
8294c1414c8SBarry Smith         i0 = vi[0];
8304c1414c8SBarry Smith         i1 = vi[1];
8314c1414c8SBarry Smith         vi += 2;
8324c1414c8SBarry Smith         tmp0 = tmps[i0];
8334c1414c8SBarry Smith         tmp1 = tmps[i1];
8349371c9d4SSatish Balay         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
8359371c9d4SSatish Balay         v1 += 2;
8369371c9d4SSatish Balay         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
8379371c9d4SSatish Balay         v2 += 2;
8384c1414c8SBarry Smith       }
8394c1414c8SBarry Smith       if (j == nz - 1) {
8404c1414c8SBarry Smith         tmp0 = tmps[*vi++];
8414c1414c8SBarry Smith         sum1 -= *v1++ * tmp0;
8424c1414c8SBarry Smith         sum2 -= *v2++ * tmp0;
8434c1414c8SBarry Smith       }
      /* coupling inside the node: the next entry of row 2 multiplies the value just computed for row 1 */
8444c1414c8SBarry Smith       sum2 -= *v2++ * sum1;
8454c1414c8SBarry Smith       tmp[row++] = sum1;
8464c1414c8SBarry Smith       tmp[row++] = sum2;
8474c1414c8SBarry Smith       break;
8484c1414c8SBarry Smith     case 3:
8494c1414c8SBarry Smith       sum1 = b[*r++];
8504c1414c8SBarry Smith       sum2 = b[*r++];
8514c1414c8SBarry Smith       sum3 = b[*r++];
8524c1414c8SBarry Smith       v2   = aa + ai[row + 1];
8534c1414c8SBarry Smith       v3   = aa + ai[row + 2];
8544c1414c8SBarry Smith 
8554c1414c8SBarry Smith       for (j = 0; j < nz - 1; j += 2) {
8564c1414c8SBarry Smith         i0 = vi[0];
8574c1414c8SBarry Smith         i1 = vi[1];
8584c1414c8SBarry Smith         vi += 2;
8594c1414c8SBarry Smith         tmp0 = tmps[i0];
8604c1414c8SBarry Smith         tmp1 = tmps[i1];
8619371c9d4SSatish Balay         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
8629371c9d4SSatish Balay         v1 += 2;
8639371c9d4SSatish Balay         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
8649371c9d4SSatish Balay         v2 += 2;
8659371c9d4SSatish Balay         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
8669371c9d4SSatish Balay         v3 += 2;
8674c1414c8SBarry Smith       }
8684c1414c8SBarry Smith       if (j == nz - 1) {
8694c1414c8SBarry Smith         tmp0 = tmps[*vi++];
8704c1414c8SBarry Smith         sum1 -= *v1++ * tmp0;
8714c1414c8SBarry Smith         sum2 -= *v2++ * tmp0;
8724c1414c8SBarry Smith         sum3 -= *v3++ * tmp0;
8734c1414c8SBarry Smith       }
      /* coupling inside the node: later rows subtract the values already computed for earlier rows */
8744c1414c8SBarry Smith       sum2 -= *v2++ * sum1;
8754c1414c8SBarry Smith       sum3 -= *v3++ * sum1;
8764c1414c8SBarry Smith       sum3 -= *v3++ * sum2;
8772205254eSKarl Rupp 
8784c1414c8SBarry Smith       tmp[row++] = sum1;
8794c1414c8SBarry Smith       tmp[row++] = sum2;
8804c1414c8SBarry Smith       tmp[row++] = sum3;
8814c1414c8SBarry Smith       break;
8824c1414c8SBarry Smith 
8834c1414c8SBarry Smith     case 4:
8844c1414c8SBarry Smith       sum1 = b[*r++];
8854c1414c8SBarry Smith       sum2 = b[*r++];
8864c1414c8SBarry Smith       sum3 = b[*r++];
8874c1414c8SBarry Smith       sum4 = b[*r++];
8884c1414c8SBarry Smith       v2   = aa + ai[row + 1];
8894c1414c8SBarry Smith       v3   = aa + ai[row + 2];
8904c1414c8SBarry Smith       v4   = aa + ai[row + 3];
8914c1414c8SBarry Smith 
8924c1414c8SBarry Smith       for (j = 0; j < nz - 1; j += 2) {
8934c1414c8SBarry Smith         i0 = vi[0];
8944c1414c8SBarry Smith         i1 = vi[1];
8954c1414c8SBarry Smith         vi += 2;
8964c1414c8SBarry Smith         tmp0 = tmps[i0];
8974c1414c8SBarry Smith         tmp1 = tmps[i1];
8989371c9d4SSatish Balay         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
8999371c9d4SSatish Balay         v1 += 2;
9009371c9d4SSatish Balay         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
9019371c9d4SSatish Balay         v2 += 2;
9029371c9d4SSatish Balay         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
9039371c9d4SSatish Balay         v3 += 2;
9049371c9d4SSatish Balay         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
9059371c9d4SSatish Balay         v4 += 2;
9064c1414c8SBarry Smith       }
9074c1414c8SBarry Smith       if (j == nz - 1) {
9084c1414c8SBarry Smith         tmp0 = tmps[*vi++];
9094c1414c8SBarry Smith         sum1 -= *v1++ * tmp0;
9104c1414c8SBarry Smith         sum2 -= *v2++ * tmp0;
9114c1414c8SBarry Smith         sum3 -= *v3++ * tmp0;
9124c1414c8SBarry Smith         sum4 -= *v4++ * tmp0;
9134c1414c8SBarry Smith       }
      /* coupling inside the node: later rows subtract the values already computed for earlier rows */
9144c1414c8SBarry Smith       sum2 -= *v2++ * sum1;
9154c1414c8SBarry Smith       sum3 -= *v3++ * sum1;
9164c1414c8SBarry Smith       sum4 -= *v4++ * sum1;
9174c1414c8SBarry Smith       sum3 -= *v3++ * sum2;
9184c1414c8SBarry Smith       sum4 -= *v4++ * sum2;
9194c1414c8SBarry Smith       sum4 -= *v4++ * sum3;
9204c1414c8SBarry Smith 
9214c1414c8SBarry Smith       tmp[row++] = sum1;
9224c1414c8SBarry Smith       tmp[row++] = sum2;
9234c1414c8SBarry Smith       tmp[row++] = sum3;
9244c1414c8SBarry Smith       tmp[row++] = sum4;
9254c1414c8SBarry Smith       break;
9264c1414c8SBarry Smith     case 5:
9274c1414c8SBarry Smith       sum1 = b[*r++];
9284c1414c8SBarry Smith       sum2 = b[*r++];
9294c1414c8SBarry Smith       sum3 = b[*r++];
9304c1414c8SBarry Smith       sum4 = b[*r++];
9314c1414c8SBarry Smith       sum5 = b[*r++];
9324c1414c8SBarry Smith       v2   = aa + ai[row + 1];
9334c1414c8SBarry Smith       v3   = aa + ai[row + 2];
9344c1414c8SBarry Smith       v4   = aa + ai[row + 3];
9354c1414c8SBarry Smith       v5   = aa + ai[row + 4];
9364c1414c8SBarry Smith 
9374c1414c8SBarry Smith       for (j = 0; j < nz - 1; j += 2) {
9384c1414c8SBarry Smith         i0 = vi[0];
9394c1414c8SBarry Smith         i1 = vi[1];
9404c1414c8SBarry Smith         vi += 2;
9414c1414c8SBarry Smith         tmp0 = tmps[i0];
9424c1414c8SBarry Smith         tmp1 = tmps[i1];
9439371c9d4SSatish Balay         sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
9449371c9d4SSatish Balay         v1 += 2;
9459371c9d4SSatish Balay         sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
9469371c9d4SSatish Balay         v2 += 2;
9479371c9d4SSatish Balay         sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
9489371c9d4SSatish Balay         v3 += 2;
9499371c9d4SSatish Balay         sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
9509371c9d4SSatish Balay         v4 += 2;
9519371c9d4SSatish Balay         sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
9529371c9d4SSatish Balay         v5 += 2;
9534c1414c8SBarry Smith       }
9544c1414c8SBarry Smith       if (j == nz - 1) {
9554c1414c8SBarry Smith         tmp0 = tmps[*vi++];
9564c1414c8SBarry Smith         sum1 -= *v1++ * tmp0;
9574c1414c8SBarry Smith         sum2 -= *v2++ * tmp0;
9584c1414c8SBarry Smith         sum3 -= *v3++ * tmp0;
9594c1414c8SBarry Smith         sum4 -= *v4++ * tmp0;
9604c1414c8SBarry Smith         sum5 -= *v5++ * tmp0;
9614c1414c8SBarry Smith       }
9624c1414c8SBarry Smith 
      /* coupling inside the node: later rows subtract the values already computed for earlier rows */
9634c1414c8SBarry Smith       sum2 -= *v2++ * sum1;
9644c1414c8SBarry Smith       sum3 -= *v3++ * sum1;
9654c1414c8SBarry Smith       sum4 -= *v4++ * sum1;
9664c1414c8SBarry Smith       sum5 -= *v5++ * sum1;
9674c1414c8SBarry Smith       sum3 -= *v3++ * sum2;
9684c1414c8SBarry Smith       sum4 -= *v4++ * sum2;
9694c1414c8SBarry Smith       sum5 -= *v5++ * sum2;
9704c1414c8SBarry Smith       sum4 -= *v4++ * sum3;
9714c1414c8SBarry Smith       sum5 -= *v5++ * sum3;
9724c1414c8SBarry Smith       sum5 -= *v5++ * sum4;
9734c1414c8SBarry Smith 
9744c1414c8SBarry Smith       tmp[row++] = sum1;
9754c1414c8SBarry Smith       tmp[row++] = sum2;
9764c1414c8SBarry Smith       tmp[row++] = sum3;
9774c1414c8SBarry Smith       tmp[row++] = sum4;
9784c1414c8SBarry Smith       tmp[row++] = sum5;
9794c1414c8SBarry Smith       break;
980d71ae5a4SJacob Faibussowitsch     default:
      /* NOTE(review): presumably SETERRQ returns at once, leaving the arrays and indices obtained above un-restored -- confirm this is acceptable */
981d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
9824c1414c8SBarry Smith     }
9834c1414c8SBarry Smith   }
9844c1414c8SBarry Smith   /* backward solve the upper triangular */
  /* nodes are visited last to first; row always names the LAST row of the current node */
9854c1414c8SBarry Smith   for (i = node_max - 1, row = n - 1; i >= 0; i--) {
9864c1414c8SBarry Smith     nsz = ns[i];
    /* aii is the offset of the last stored entry of row; nz counts the entries after row's diagonal */
9874c1414c8SBarry Smith     aii = ai[row + 1] - 1;
9884c1414c8SBarry Smith     v1  = aa + aii;
9894c1414c8SBarry Smith     vi  = aj + aii;
9904c1414c8SBarry Smith     nz  = aii - ad[row];
9914c1414c8SBarry Smith     switch (nsz) { /* Each loop in 'case' is unrolled */
9924c1414c8SBarry Smith     case 1:
9934c1414c8SBarry Smith       sum1 = tmp[row];
9944c1414c8SBarry Smith 
      /* walk the row from its last entry toward the diagonal, two entries per pass */
9954c1414c8SBarry Smith       for (j = nz; j > 1; j -= 2) {
9964c1414c8SBarry Smith         vi -= 2;
9974c1414c8SBarry Smith         i0   = vi[2];
9984c1414c8SBarry Smith         i1   = vi[1];
9994c1414c8SBarry Smith         tmp0 = tmps[i0];
10004c1414c8SBarry Smith         tmp1 = tmps[i1];
10014c1414c8SBarry Smith         v1 -= 2;
10024c1414c8SBarry Smith         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
10034c1414c8SBarry Smith       }
10044c1414c8SBarry Smith       if (j == 1) {
10054c1414c8SBarry Smith         tmp0 = tmps[*vi--];
10064c1414c8SBarry Smith         sum1 -= *v1-- * tmp0;
10074c1414c8SBarry Smith       }
      /* finish the unknown with the stored diagonal entry; keep it in tmp and write it to x through c */
10089371c9d4SSatish Balay       x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
10099371c9d4SSatish Balay       row--;
10104c1414c8SBarry Smith       break;
10114c1414c8SBarry Smith     case 2:
10124c1414c8SBarry Smith       sum1 = tmp[row];
10134c1414c8SBarry Smith       sum2 = tmp[row - 1];
      /* v2 starts at the last stored entry of row - 1 (one before the start of row) */
10144c1414c8SBarry Smith       v2   = aa + ai[row] - 1;
10154c1414c8SBarry Smith       for (j = nz; j > 1; j -= 2) {
10164c1414c8SBarry Smith         vi -= 2;
10174c1414c8SBarry Smith         i0   = vi[2];
10184c1414c8SBarry Smith         i1   = vi[1];
10194c1414c8SBarry Smith         tmp0 = tmps[i0];
10204c1414c8SBarry Smith         tmp1 = tmps[i1];
10214c1414c8SBarry Smith         v1 -= 2;
10224c1414c8SBarry Smith         v2 -= 2;
10234c1414c8SBarry Smith         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
10244c1414c8SBarry Smith         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
10254c1414c8SBarry Smith       }
10264c1414c8SBarry Smith       if (j == 1) {
10274c1414c8SBarry Smith         tmp0 = tmps[*vi--];
10284c1414c8SBarry Smith         sum1 -= *v1-- * tmp0;
10294c1414c8SBarry Smith         sum2 -= *v2-- * tmp0;
10304c1414c8SBarry Smith       }
10314c1414c8SBarry Smith 
      /* finish the last row first, then feed its value (tmp0) into the row above it inside the node */
10329371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
10339371c9d4SSatish Balay       row--;
10344c1414c8SBarry Smith       sum2 -= *v2-- * tmp0;
10359371c9d4SSatish Balay       x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
10369371c9d4SSatish Balay       row--;
10374c1414c8SBarry Smith       break;
10384c1414c8SBarry Smith     case 3:
10394c1414c8SBarry Smith       sum1 = tmp[row];
10404c1414c8SBarry Smith       sum2 = tmp[row - 1];
10414c1414c8SBarry Smith       sum3 = tmp[row - 2];
10424c1414c8SBarry Smith       v2   = aa + ai[row] - 1;
10434c1414c8SBarry Smith       v3   = aa + ai[row - 1] - 1;
10444c1414c8SBarry Smith       for (j = nz; j > 1; j -= 2) {
10454c1414c8SBarry Smith         vi -= 2;
10464c1414c8SBarry Smith         i0   = vi[2];
10474c1414c8SBarry Smith         i1   = vi[1];
10484c1414c8SBarry Smith         tmp0 = tmps[i0];
10494c1414c8SBarry Smith         tmp1 = tmps[i1];
10504c1414c8SBarry Smith         v1 -= 2;
10514c1414c8SBarry Smith         v2 -= 2;
10524c1414c8SBarry Smith         v3 -= 2;
10534c1414c8SBarry Smith         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
10544c1414c8SBarry Smith         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
10554c1414c8SBarry Smith         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
10564c1414c8SBarry Smith       }
10574c1414c8SBarry Smith       if (j == 1) {
10584c1414c8SBarry Smith         tmp0 = tmps[*vi--];
10594c1414c8SBarry Smith         sum1 -= *v1-- * tmp0;
10604c1414c8SBarry Smith         sum2 -= *v2-- * tmp0;
10614c1414c8SBarry Smith         sum3 -= *v3-- * tmp0;
10624c1414c8SBarry Smith       }
      /* rows of the node are finished bottom-up; each finished value is subtracted from the rows above it */
10639371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
10649371c9d4SSatish Balay       row--;
10654c1414c8SBarry Smith       sum2 -= *v2-- * tmp0;
10664c1414c8SBarry Smith       sum3 -= *v3-- * tmp0;
10679371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
10689371c9d4SSatish Balay       row--;
10694c1414c8SBarry Smith       sum3 -= *v3-- * tmp0;
10709371c9d4SSatish Balay       x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
10719371c9d4SSatish Balay       row--;
10724c1414c8SBarry Smith 
10734c1414c8SBarry Smith       break;
10744c1414c8SBarry Smith     case 4:
10754c1414c8SBarry Smith       sum1 = tmp[row];
10764c1414c8SBarry Smith       sum2 = tmp[row - 1];
10774c1414c8SBarry Smith       sum3 = tmp[row - 2];
10784c1414c8SBarry Smith       sum4 = tmp[row - 3];
10794c1414c8SBarry Smith       v2   = aa + ai[row] - 1;
10804c1414c8SBarry Smith       v3   = aa + ai[row - 1] - 1;
10814c1414c8SBarry Smith       v4   = aa + ai[row - 2] - 1;
10824c1414c8SBarry Smith 
10834c1414c8SBarry Smith       for (j = nz; j > 1; j -= 2) {
10844c1414c8SBarry Smith         vi -= 2;
10854c1414c8SBarry Smith         i0   = vi[2];
10864c1414c8SBarry Smith         i1   = vi[1];
10874c1414c8SBarry Smith         tmp0 = tmps[i0];
10884c1414c8SBarry Smith         tmp1 = tmps[i1];
10894c1414c8SBarry Smith         v1 -= 2;
10904c1414c8SBarry Smith         v2 -= 2;
10914c1414c8SBarry Smith         v3 -= 2;
10924c1414c8SBarry Smith         v4 -= 2;
10934c1414c8SBarry Smith         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
10944c1414c8SBarry Smith         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
10954c1414c8SBarry Smith         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
10964c1414c8SBarry Smith         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
10974c1414c8SBarry Smith       }
10984c1414c8SBarry Smith       if (j == 1) {
10994c1414c8SBarry Smith         tmp0 = tmps[*vi--];
11004c1414c8SBarry Smith         sum1 -= *v1-- * tmp0;
11014c1414c8SBarry Smith         sum2 -= *v2-- * tmp0;
11024c1414c8SBarry Smith         sum3 -= *v3-- * tmp0;
11034c1414c8SBarry Smith         sum4 -= *v4-- * tmp0;
11044c1414c8SBarry Smith       }
11054c1414c8SBarry Smith 
      /* rows of the node are finished bottom-up; each finished value is subtracted from the rows above it */
11069371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
11079371c9d4SSatish Balay       row--;
11084c1414c8SBarry Smith       sum2 -= *v2-- * tmp0;
11094c1414c8SBarry Smith       sum3 -= *v3-- * tmp0;
11104c1414c8SBarry Smith       sum4 -= *v4-- * tmp0;
11119371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
11129371c9d4SSatish Balay       row--;
11134c1414c8SBarry Smith       sum3 -= *v3-- * tmp0;
11144c1414c8SBarry Smith       sum4 -= *v4-- * tmp0;
11159371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
11169371c9d4SSatish Balay       row--;
11174c1414c8SBarry Smith       sum4 -= *v4-- * tmp0;
11189371c9d4SSatish Balay       x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
11199371c9d4SSatish Balay       row--;
11204c1414c8SBarry Smith       break;
11214c1414c8SBarry Smith     case 5:
11224c1414c8SBarry Smith       sum1 = tmp[row];
11234c1414c8SBarry Smith       sum2 = tmp[row - 1];
11244c1414c8SBarry Smith       sum3 = tmp[row - 2];
11254c1414c8SBarry Smith       sum4 = tmp[row - 3];
11264c1414c8SBarry Smith       sum5 = tmp[row - 4];
11274c1414c8SBarry Smith       v2   = aa + ai[row] - 1;
11284c1414c8SBarry Smith       v3   = aa + ai[row - 1] - 1;
11294c1414c8SBarry Smith       v4   = aa + ai[row - 2] - 1;
11304c1414c8SBarry Smith       v5   = aa + ai[row - 3] - 1;
11314c1414c8SBarry Smith       for (j = nz; j > 1; j -= 2) {
11324c1414c8SBarry Smith         vi -= 2;
11334c1414c8SBarry Smith         i0   = vi[2];
11344c1414c8SBarry Smith         i1   = vi[1];
11354c1414c8SBarry Smith         tmp0 = tmps[i0];
11364c1414c8SBarry Smith         tmp1 = tmps[i1];
11374c1414c8SBarry Smith         v1 -= 2;
11384c1414c8SBarry Smith         v2 -= 2;
11394c1414c8SBarry Smith         v3 -= 2;
11404c1414c8SBarry Smith         v4 -= 2;
11414c1414c8SBarry Smith         v5 -= 2;
11424c1414c8SBarry Smith         sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
11434c1414c8SBarry Smith         sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
11444c1414c8SBarry Smith         sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
11454c1414c8SBarry Smith         sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
11464c1414c8SBarry Smith         sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
11474c1414c8SBarry Smith       }
11484c1414c8SBarry Smith       if (j == 1) {
11494c1414c8SBarry Smith         tmp0 = tmps[*vi--];
11504c1414c8SBarry Smith         sum1 -= *v1-- * tmp0;
11514c1414c8SBarry Smith         sum2 -= *v2-- * tmp0;
11524c1414c8SBarry Smith         sum3 -= *v3-- * tmp0;
11534c1414c8SBarry Smith         sum4 -= *v4-- * tmp0;
11544c1414c8SBarry Smith         sum5 -= *v5-- * tmp0;
11554c1414c8SBarry Smith       }
11564c1414c8SBarry Smith 
      /* rows of the node are finished bottom-up; each finished value is subtracted from the rows above it */
11579371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
11589371c9d4SSatish Balay       row--;
11594c1414c8SBarry Smith       sum2 -= *v2-- * tmp0;
11604c1414c8SBarry Smith       sum3 -= *v3-- * tmp0;
11614c1414c8SBarry Smith       sum4 -= *v4-- * tmp0;
11624c1414c8SBarry Smith       sum5 -= *v5-- * tmp0;
11639371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
11649371c9d4SSatish Balay       row--;
11654c1414c8SBarry Smith       sum3 -= *v3-- * tmp0;
11664c1414c8SBarry Smith       sum4 -= *v4-- * tmp0;
11674c1414c8SBarry Smith       sum5 -= *v5-- * tmp0;
11689371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
11699371c9d4SSatish Balay       row--;
11704c1414c8SBarry Smith       sum4 -= *v4-- * tmp0;
11714c1414c8SBarry Smith       sum5 -= *v5-- * tmp0;
11729371c9d4SSatish Balay       tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
11739371c9d4SSatish Balay       row--;
11744c1414c8SBarry Smith       sum5 -= *v5-- * tmp0;
11759371c9d4SSatish Balay       x[*c--] = tmp[row] = sum5 * a_a[ad[row]];
11769371c9d4SSatish Balay       row--;
11774c1414c8SBarry Smith       break;
1178d71ae5a4SJacob Faibussowitsch     default:
      /* NOTE(review): presumably SETERRQ returns at once, leaving the arrays and indices obtained above un-restored -- confirm this is acceptable */
1179d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
11804c1414c8SBarry Smith     }
11814c1414c8SBarry Smith   }
11829566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
11839566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
11849566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
11859566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(xx, &x));
  /* logged flop count: 2 per stored nonzero minus one per column */
11869566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
11874c1414c8SBarry Smith   PetscFunctionReturn(0);
11884c1414c8SBarry Smith }
11894c1414c8SBarry Smith 
1190d71ae5a4SJacob Faibussowitsch PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info)
1191d71ae5a4SJacob Faibussowitsch {
119228f1b45aSHong Zhang   Mat              C = B;
119328f1b45aSHong Zhang   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
119428f1b45aSHong Zhang   IS               isrow = b->row, isicol = b->icol;
119528f1b45aSHong Zhang   const PetscInt  *r, *ic, *ics;
119628f1b45aSHong Zhang   const PetscInt   n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
119728f1b45aSHong Zhang   PetscInt         i, j, k, nz, nzL, row, *pj;
119828f1b45aSHong Zhang   const PetscInt  *ajtmp, *bjtmp;
11999877982aSShri Abhyankar   MatScalar       *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4;
12009877982aSShri Abhyankar   const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4;
120128f1b45aSHong Zhang   FactorShiftCtx   sctx;
12024f81c4b7SBarry Smith   const PetscInt  *ddiag;
120328f1b45aSHong Zhang   PetscReal        rs;
120428f1b45aSHong Zhang   MatScalar        d;
12054f81c4b7SBarry Smith   PetscInt         inod, nodesz, node_max, col;
12064f81c4b7SBarry Smith   const PetscInt  *ns;
120707b50cabSHong Zhang   PetscInt        *tmp_vec1, *tmp_vec2, *nsmap;
12080e95ead3SHong Zhang 
120928f1b45aSHong Zhang   PetscFunctionBegin;
121028f1b45aSHong Zhang   /* MatPivotSetUp(): initialize shift context sctx */
12119566063dSJacob Faibussowitsch   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
121228f1b45aSHong Zhang 
1213f4db908eSBarry Smith   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
121428f1b45aSHong Zhang     ddiag          = a->diag;
121528f1b45aSHong Zhang     sctx.shift_top = info->zeropivot;
121628f1b45aSHong Zhang     for (i = 0; i < n; i++) {
121728f1b45aSHong Zhang       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
121828f1b45aSHong Zhang       d  = (aa)[ddiag[i]];
121928f1b45aSHong Zhang       rs = -PetscAbsScalar(d) - PetscRealPart(d);
122028f1b45aSHong Zhang       v  = aa + ai[i];
122128f1b45aSHong Zhang       nz = ai[i + 1] - ai[i];
12222205254eSKarl Rupp       for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
122328f1b45aSHong Zhang       if (rs > sctx.shift_top) sctx.shift_top = rs;
122428f1b45aSHong Zhang     }
122528f1b45aSHong Zhang     sctx.shift_top *= 1.1;
122628f1b45aSHong Zhang     sctx.nshift_max = 5;
122728f1b45aSHong Zhang     sctx.shift_lo   = 0.;
122828f1b45aSHong Zhang     sctx.shift_hi   = 1.;
122928f1b45aSHong Zhang   }
123028f1b45aSHong Zhang 
12319566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow, &r));
12329566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isicol, &ic));
123368785679SHong Zhang 
12349566063dSJacob Faibussowitsch   PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4));
123528f1b45aSHong Zhang   ics = ic;
123628f1b45aSHong Zhang 
123728f1b45aSHong Zhang   node_max = a->inode.node_count;
123828f1b45aSHong Zhang   ns       = a->inode.size;
123928b400f6SJacob Faibussowitsch   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
124028f1b45aSHong Zhang 
12419877982aSShri Abhyankar   /* If max inode size > 4, split it into two inodes.*/
124268785679SHong Zhang   /* also map the inode sizes according to the ordering */
12439566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
124468785679SHong Zhang   for (i = 0, j = 0; i < node_max; ++i, ++j) {
1245b1550197SShri Abhyankar     if (ns[i] > 4) {
1246048b5e81SShri Abhyankar       tmp_vec1[j] = 4;
124768785679SHong Zhang       ++j;
124868785679SHong Zhang       tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
124968785679SHong Zhang     } else {
125068785679SHong Zhang       tmp_vec1[j] = ns[i];
125168785679SHong Zhang     }
125268785679SHong Zhang   }
125368785679SHong Zhang   /* Use the correct node_max */
125468785679SHong Zhang   node_max = j;
125568785679SHong Zhang 
125668785679SHong Zhang   /* Now reorder the inode info based on mat re-ordering info */
125768785679SHong Zhang   /* First create a row -> inode_size_array_index map */
12589566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(n + 1, &nsmap));
12599566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2));
126068785679SHong Zhang   for (i = 0, row = 0; i < node_max; i++) {
126168785679SHong Zhang     nodesz = tmp_vec1[i];
1262ad540459SPierre Jolivet     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
126368785679SHong Zhang   }
126468785679SHong Zhang   /* Using nsmap, create a reordered ns structure */
126568785679SHong Zhang   for (i = 0, j = 0; i < node_max; i++) {
126668785679SHong Zhang     nodesz      = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
126768785679SHong Zhang     tmp_vec2[i] = nodesz;
126868785679SHong Zhang     j += nodesz;
126968785679SHong Zhang   }
12709566063dSJacob Faibussowitsch   PetscCall(PetscFree(nsmap));
12719566063dSJacob Faibussowitsch   PetscCall(PetscFree(tmp_vec1));
1272b89f182dSHong Zhang 
127368785679SHong Zhang   /* Now use the correct ns */
127468785679SHong Zhang   ns = tmp_vec2;
127568785679SHong Zhang 
127628f1b45aSHong Zhang   do {
127707b50cabSHong Zhang     sctx.newshift = PETSC_FALSE;
127828f1b45aSHong Zhang     /* Now loop over each block-row, and do the factorization */
127928f1b45aSHong Zhang     for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */
128028f1b45aSHong Zhang       nodesz = ns[inod];
128128f1b45aSHong Zhang 
128228f1b45aSHong Zhang       switch (nodesz) {
128328f1b45aSHong Zhang       case 1:
128468785679SHong Zhang         /*----------*/
1285b89f182dSHong Zhang         /* zero rtmp1 */
128628f1b45aSHong Zhang         /* L part */
128728f1b45aSHong Zhang         nz    = bi[i + 1] - bi[i];
128828f1b45aSHong Zhang         bjtmp = bj + bi[i];
1289b89f182dSHong Zhang         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
129028f1b45aSHong Zhang 
129128f1b45aSHong Zhang         /* U part */
129228f1b45aSHong Zhang         nz    = bdiag[i] - bdiag[i + 1];
129328f1b45aSHong Zhang         bjtmp = bj + bdiag[i + 1] + 1;
1294b89f182dSHong Zhang         for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
129528f1b45aSHong Zhang 
129628f1b45aSHong Zhang         /* load in initial (unfactored row) */
129728f1b45aSHong Zhang         nz    = ai[r[i] + 1] - ai[r[i]];
129828f1b45aSHong Zhang         ajtmp = aj + ai[r[i]];
129928f1b45aSHong Zhang         v     = aa + ai[r[i]];
13002205254eSKarl Rupp         for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
13012205254eSKarl Rupp 
130228f1b45aSHong Zhang         /* ZeropivotApply() */
1303b89f182dSHong Zhang         rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
130428f1b45aSHong Zhang 
130528f1b45aSHong Zhang         /* elimination */
130628f1b45aSHong Zhang         bjtmp = bj + bi[i];
130728f1b45aSHong Zhang         row   = *bjtmp++;
130828f1b45aSHong Zhang         nzL   = bi[i + 1] - bi[i];
130928f1b45aSHong Zhang         for (k = 0; k < nzL; k++) {
1310b89f182dSHong Zhang           pc = rtmp1 + row;
131128f1b45aSHong Zhang           if (*pc != 0.0) {
131228f1b45aSHong Zhang             pv   = b->a + bdiag[row];
1313b89f182dSHong Zhang             mul1 = *pc * (*pv);
1314b89f182dSHong Zhang             *pc  = mul1;
131528f1b45aSHong Zhang             pj   = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
131628f1b45aSHong Zhang             pv   = b->a + bdiag[row + 1] + 1;
131728f1b45aSHong Zhang             nz   = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1318b89f182dSHong Zhang             for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
13199566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(1 + 2.0 * nz));
132028f1b45aSHong Zhang           }
132128f1b45aSHong Zhang           row = *bjtmp++;
132228f1b45aSHong Zhang         }
132328f1b45aSHong Zhang 
132428f1b45aSHong Zhang         /* finished row so stick it into b->a */
132528f1b45aSHong Zhang         rs = 0.0;
132628f1b45aSHong Zhang         /* L part */
132728f1b45aSHong Zhang         pv = b->a + bi[i];
132828f1b45aSHong Zhang         pj = b->j + bi[i];
132928f1b45aSHong Zhang         nz = bi[i + 1] - bi[i];
133028f1b45aSHong Zhang         for (j = 0; j < nz; j++) {
13319371c9d4SSatish Balay           pv[j] = rtmp1[pj[j]];
13329371c9d4SSatish Balay           rs += PetscAbsScalar(pv[j]);
133328f1b45aSHong Zhang         }
133428f1b45aSHong Zhang 
133528f1b45aSHong Zhang         /* U part */
133628f1b45aSHong Zhang         pv = b->a + bdiag[i + 1] + 1;
133728f1b45aSHong Zhang         pj = b->j + bdiag[i + 1] + 1;
133828f1b45aSHong Zhang         nz = bdiag[i] - bdiag[i + 1] - 1;
133928f1b45aSHong Zhang         for (j = 0; j < nz; j++) {
13409371c9d4SSatish Balay           pv[j] = rtmp1[pj[j]];
13419371c9d4SSatish Balay           rs += PetscAbsScalar(pv[j]);
134228f1b45aSHong Zhang         }
134328f1b45aSHong Zhang 
1344b89f182dSHong Zhang         /* Check zero pivot */
134528f1b45aSHong Zhang         sctx.rs = rs;
1346b89f182dSHong Zhang         sctx.pv = rtmp1[i];
13479566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
134807b50cabSHong Zhang         if (sctx.newshift) break;
134928f1b45aSHong Zhang 
1350a5b23f4aSJose E. Roman         /* Mark diagonal and invert diagonal for simpler triangular solves */
135128f1b45aSHong Zhang         pv  = b->a + bdiag[i];
1352b89f182dSHong Zhang         *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
135328f1b45aSHong Zhang         break;
135428f1b45aSHong Zhang 
135528f1b45aSHong Zhang       case 2:
135628f1b45aSHong Zhang         /*----------*/
1357b89f182dSHong Zhang         /* zero rtmp1 and rtmp2 */
135828f1b45aSHong Zhang         /* L part */
135928f1b45aSHong Zhang         nz    = bi[i + 1] - bi[i];
136028f1b45aSHong Zhang         bjtmp = bj + bi[i];
136128f1b45aSHong Zhang         for (j = 0; j < nz; j++) {
136268785679SHong Zhang           col        = bjtmp[j];
13639371c9d4SSatish Balay           rtmp1[col] = 0.0;
13649371c9d4SSatish Balay           rtmp2[col] = 0.0;
136528f1b45aSHong Zhang         }
136628f1b45aSHong Zhang 
136728f1b45aSHong Zhang         /* U part */
136828f1b45aSHong Zhang         nz    = bdiag[i] - bdiag[i + 1];
136928f1b45aSHong Zhang         bjtmp = bj + bdiag[i + 1] + 1;
137028f1b45aSHong Zhang         for (j = 0; j < nz; j++) {
137168785679SHong Zhang           col        = bjtmp[j];
13729371c9d4SSatish Balay           rtmp1[col] = 0.0;
13739371c9d4SSatish Balay           rtmp2[col] = 0.0;
137428f1b45aSHong Zhang         }
137528f1b45aSHong Zhang 
137628f1b45aSHong Zhang         /* load in initial (unfactored row) */
137728f1b45aSHong Zhang         nz    = ai[r[i] + 1] - ai[r[i]];
137828f1b45aSHong Zhang         ajtmp = aj + ai[r[i]];
13799371c9d4SSatish Balay         v1    = aa + ai[r[i]];
13809371c9d4SSatish Balay         v2    = aa + ai[r[i] + 1];
138128f1b45aSHong Zhang         for (j = 0; j < nz; j++) {
138268785679SHong Zhang           col        = ics[ajtmp[j]];
13839371c9d4SSatish Balay           rtmp1[col] = v1[j];
13849371c9d4SSatish Balay           rtmp2[col] = v2[j];
138528f1b45aSHong Zhang         }
138628f1b45aSHong Zhang         /* ZeropivotApply(): shift the diagonal of the matrix  */
13879371c9d4SSatish Balay         rtmp1[i] += sctx.shift_amount;
13889371c9d4SSatish Balay         rtmp2[i + 1] += sctx.shift_amount;
138928f1b45aSHong Zhang 
139028f1b45aSHong Zhang         /* elimination */
139128f1b45aSHong Zhang         bjtmp = bj + bi[i];
139228f1b45aSHong Zhang         row   = *bjtmp++; /* pivot row */
139328f1b45aSHong Zhang         nzL   = bi[i + 1] - bi[i];
139428f1b45aSHong Zhang         for (k = 0; k < nzL; k++) {
1395b89f182dSHong Zhang           pc1 = rtmp1 + row;
1396b89f182dSHong Zhang           pc2 = rtmp2 + row;
139728f1b45aSHong Zhang           if (*pc1 != 0.0 || *pc2 != 0.0) {
139828f1b45aSHong Zhang             pv   = b->a + bdiag[row];
13999371c9d4SSatish Balay             mul1 = *pc1 * (*pv);
14009371c9d4SSatish Balay             mul2 = *pc2 * (*pv);
14019371c9d4SSatish Balay             *pc1 = mul1;
14029371c9d4SSatish Balay             *pc2 = mul2;
140328f1b45aSHong Zhang 
140428f1b45aSHong Zhang             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
140528f1b45aSHong Zhang             pv = b->a + bdiag[row + 1] + 1;
140628f1b45aSHong Zhang             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
140728f1b45aSHong Zhang             for (j = 0; j < nz; j++) {
140868785679SHong Zhang               col = pj[j];
1409b89f182dSHong Zhang               rtmp1[col] -= mul1 * pv[j];
1410b89f182dSHong Zhang               rtmp2[col] -= mul2 * pv[j];
141128f1b45aSHong Zhang             }
14129566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(2 + 4.0 * nz));
141328f1b45aSHong Zhang           }
141428f1b45aSHong Zhang           row = *bjtmp++;
141528f1b45aSHong Zhang         }
141628f1b45aSHong Zhang 
1417b89f182dSHong Zhang         /* finished row i; check zero pivot, then stick row i into b->a */
141828f1b45aSHong Zhang         rs = 0.0;
141928f1b45aSHong Zhang         /* L part */
1420b89f182dSHong Zhang         pc1 = b->a + bi[i];
142128f1b45aSHong Zhang         pj  = b->j + bi[i];
142228f1b45aSHong Zhang         nz  = bi[i + 1] - bi[i];
142328f1b45aSHong Zhang         for (j = 0; j < nz; j++) {
142468785679SHong Zhang           col    = pj[j];
14259371c9d4SSatish Balay           pc1[j] = rtmp1[col];
14269371c9d4SSatish Balay           rs += PetscAbsScalar(pc1[j]);
142728f1b45aSHong Zhang         }
142828f1b45aSHong Zhang         /* U part */
1429b89f182dSHong Zhang         pc1 = b->a + bdiag[i + 1] + 1;
143028f1b45aSHong Zhang         pj  = b->j + bdiag[i + 1] + 1;
14310e7a5c2bSHong Zhang         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
143228f1b45aSHong Zhang         for (j = 0; j < nz; j++) {
143368785679SHong Zhang           col    = pj[j];
14349371c9d4SSatish Balay           pc1[j] = rtmp1[col];
14359371c9d4SSatish Balay           rs += PetscAbsScalar(pc1[j]);
143628f1b45aSHong Zhang         }
143728f1b45aSHong Zhang 
143828f1b45aSHong Zhang         sctx.rs = rs;
1439b89f182dSHong Zhang         sctx.pv = rtmp1[i];
14409566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
144107b50cabSHong Zhang         if (sctx.newshift) break;
1442b89f182dSHong Zhang         pc1  = b->a + bdiag[i]; /* Mark diagonal */
1443b89f182dSHong Zhang         *pc1 = 1.0 / sctx.pv;
1444b89f182dSHong Zhang 
1445b89f182dSHong Zhang         /* Now take care of diagonal 2x2 block. */
1446b89f182dSHong Zhang         pc2 = rtmp2 + i;
1447b89f182dSHong Zhang         if (*pc2 != 0.0) {
1448b89f182dSHong Zhang           mul1 = (*pc2) * (*pc1);             /* *pc1=diag[i] is inverted! */
1449b89f182dSHong Zhang           *pc2 = mul1;                        /* insert L entry */
1450b89f182dSHong Zhang           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
1451b89f182dSHong Zhang           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1452b89f182dSHong Zhang           for (j = 0; j < nz; j++) {
14539371c9d4SSatish Balay             col = pj[j];
14549371c9d4SSatish Balay             rtmp2[col] -= mul1 * rtmp1[col];
145528f1b45aSHong Zhang           }
14569566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(1 + 2.0 * nz));
1457b89f182dSHong Zhang         }
1458b89f182dSHong Zhang 
1459b89f182dSHong Zhang         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1460b89f182dSHong Zhang         rs = 0.0;
1461b89f182dSHong Zhang         /* L part */
1462b89f182dSHong Zhang         pc2 = b->a + bi[i + 1];
1463b89f182dSHong Zhang         pj  = b->j + bi[i + 1];
1464b89f182dSHong Zhang         nz  = bi[i + 2] - bi[i + 1];
1465b89f182dSHong Zhang         for (j = 0; j < nz; j++) {
1466b89f182dSHong Zhang           col    = pj[j];
14679371c9d4SSatish Balay           pc2[j] = rtmp2[col];
14689371c9d4SSatish Balay           rs += PetscAbsScalar(pc2[j]);
1469b89f182dSHong Zhang         }
1470b89f182dSHong Zhang         /* U part */
1471b89f182dSHong Zhang         pc2 = b->a + bdiag[i + 2] + 1;
14720e7a5c2bSHong Zhang         pj  = b->j + bdiag[i + 2] + 1;
14730e7a5c2bSHong Zhang         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1474b89f182dSHong Zhang         for (j = 0; j < nz; j++) {
1475b89f182dSHong Zhang           col    = pj[j];
14769371c9d4SSatish Balay           pc2[j] = rtmp2[col];
14779371c9d4SSatish Balay           rs += PetscAbsScalar(pc2[j]);
1478b89f182dSHong Zhang         }
1479b89f182dSHong Zhang 
148028f1b45aSHong Zhang         sctx.rs = rs;
1481b89f182dSHong Zhang         sctx.pv = rtmp2[i + 1];
14829566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
148307b50cabSHong Zhang         if (sctx.newshift) break;
148428f1b45aSHong Zhang         pc2  = b->a + bdiag[i + 1];
1485b89f182dSHong Zhang         *pc2 = 1.0 / sctx.pv;
148628f1b45aSHong Zhang         break;
1487b89f182dSHong Zhang 
148868785679SHong Zhang       case 3:
148968785679SHong Zhang         /*----------*/
149068785679SHong Zhang         /* zero rtmp */
149168785679SHong Zhang         /* L part */
149268785679SHong Zhang         nz    = bi[i + 1] - bi[i];
149368785679SHong Zhang         bjtmp = bj + bi[i];
149468785679SHong Zhang         for (j = 0; j < nz; j++) {
149568785679SHong Zhang           col        = bjtmp[j];
14969371c9d4SSatish Balay           rtmp1[col] = 0.0;
14979371c9d4SSatish Balay           rtmp2[col] = 0.0;
14989371c9d4SSatish Balay           rtmp3[col] = 0.0;
149968785679SHong Zhang         }
150068785679SHong Zhang 
150168785679SHong Zhang         /* U part */
150268785679SHong Zhang         nz    = bdiag[i] - bdiag[i + 1];
150368785679SHong Zhang         bjtmp = bj + bdiag[i + 1] + 1;
150468785679SHong Zhang         for (j = 0; j < nz; j++) {
150568785679SHong Zhang           col        = bjtmp[j];
15069371c9d4SSatish Balay           rtmp1[col] = 0.0;
15079371c9d4SSatish Balay           rtmp2[col] = 0.0;
15089371c9d4SSatish Balay           rtmp3[col] = 0.0;
150968785679SHong Zhang         }
151068785679SHong Zhang 
151168785679SHong Zhang         /* load in initial (unfactored row) */
151268785679SHong Zhang         nz    = ai[r[i] + 1] - ai[r[i]];
151368785679SHong Zhang         ajtmp = aj + ai[r[i]];
15149371c9d4SSatish Balay         v1    = aa + ai[r[i]];
15159371c9d4SSatish Balay         v2    = aa + ai[r[i] + 1];
15169371c9d4SSatish Balay         v3    = aa + ai[r[i] + 2];
151768785679SHong Zhang         for (j = 0; j < nz; j++) {
151868785679SHong Zhang           col        = ics[ajtmp[j]];
15199371c9d4SSatish Balay           rtmp1[col] = v1[j];
15209371c9d4SSatish Balay           rtmp2[col] = v2[j];
15219371c9d4SSatish Balay           rtmp3[col] = v3[j];
152268785679SHong Zhang         }
152368785679SHong Zhang         /* ZeropivotApply(): shift the diagonal of the matrix  */
15249371c9d4SSatish Balay         rtmp1[i] += sctx.shift_amount;
15259371c9d4SSatish Balay         rtmp2[i + 1] += sctx.shift_amount;
15269371c9d4SSatish Balay         rtmp3[i + 2] += sctx.shift_amount;
152768785679SHong Zhang 
152868785679SHong Zhang         /* elimination */
152968785679SHong Zhang         bjtmp = bj + bi[i];
153068785679SHong Zhang         row   = *bjtmp++; /* pivot row */
153168785679SHong Zhang         nzL   = bi[i + 1] - bi[i];
153268785679SHong Zhang         for (k = 0; k < nzL; k++) {
1533b89f182dSHong Zhang           pc1 = rtmp1 + row;
1534b89f182dSHong Zhang           pc2 = rtmp2 + row;
1535b89f182dSHong Zhang           pc3 = rtmp3 + row;
153668785679SHong Zhang           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
153768785679SHong Zhang             pv   = b->a + bdiag[row];
15389371c9d4SSatish Balay             mul1 = *pc1 * (*pv);
15399371c9d4SSatish Balay             mul2 = *pc2 * (*pv);
15409371c9d4SSatish Balay             mul3 = *pc3 * (*pv);
15419371c9d4SSatish Balay             *pc1 = mul1;
15429371c9d4SSatish Balay             *pc2 = mul2;
15439371c9d4SSatish Balay             *pc3 = mul3;
154468785679SHong Zhang 
154568785679SHong Zhang             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
154668785679SHong Zhang             pv = b->a + bdiag[row + 1] + 1;
154768785679SHong Zhang             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
154868785679SHong Zhang             for (j = 0; j < nz; j++) {
154968785679SHong Zhang               col = pj[j];
1550b89f182dSHong Zhang               rtmp1[col] -= mul1 * pv[j];
1551b89f182dSHong Zhang               rtmp2[col] -= mul2 * pv[j];
1552b89f182dSHong Zhang               rtmp3[col] -= mul3 * pv[j];
155368785679SHong Zhang             }
15549566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(3 + 6.0 * nz));
155568785679SHong Zhang           }
155668785679SHong Zhang           row = *bjtmp++;
155768785679SHong Zhang         }
155868785679SHong Zhang 
1559b89f182dSHong Zhang         /* finished row i; check zero pivot, then stick row i into b->a */
1560b89f182dSHong Zhang         rs = 0.0;
1561b89f182dSHong Zhang         /* L part */
1562b89f182dSHong Zhang         pc1 = b->a + bi[i];
1563b89f182dSHong Zhang         pj  = b->j + bi[i];
1564b89f182dSHong Zhang         nz  = bi[i + 1] - bi[i];
1565b89f182dSHong Zhang         for (j = 0; j < nz; j++) {
1566b89f182dSHong Zhang           col    = pj[j];
15679371c9d4SSatish Balay           pc1[j] = rtmp1[col];
15689371c9d4SSatish Balay           rs += PetscAbsScalar(pc1[j]);
1569b89f182dSHong Zhang         }
1570b89f182dSHong Zhang         /* U part */
1571b89f182dSHong Zhang         pc1 = b->a + bdiag[i + 1] + 1;
1572b89f182dSHong Zhang         pj  = b->j + bdiag[i + 1] + 1;
15730e7a5c2bSHong Zhang         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1574b89f182dSHong Zhang         for (j = 0; j < nz; j++) {
1575b89f182dSHong Zhang           col    = pj[j];
15769371c9d4SSatish Balay           pc1[j] = rtmp1[col];
15779371c9d4SSatish Balay           rs += PetscAbsScalar(pc1[j]);
1578b89f182dSHong Zhang         }
157968785679SHong Zhang 
1580b89f182dSHong Zhang         sctx.rs = rs;
1581b89f182dSHong Zhang         sctx.pv = rtmp1[i];
15829566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
158307b50cabSHong Zhang         if (sctx.newshift) break;
1584b89f182dSHong Zhang         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
1585b89f182dSHong Zhang         *pc1 = 1.0 / sctx.pv;
1586b89f182dSHong Zhang 
1587b89f182dSHong Zhang         /* Now take care of 1st column of diagonal 3x3 block. */
1588b89f182dSHong Zhang         pc2 = rtmp2 + i;
1589b89f182dSHong Zhang         pc3 = rtmp3 + i;
1590b89f182dSHong Zhang         if (*pc2 != 0.0 || *pc3 != 0.0) {
15919371c9d4SSatish Balay           mul2 = (*pc2) * (*pc1);
15929371c9d4SSatish Balay           *pc2 = mul2;
15939371c9d4SSatish Balay           mul3 = (*pc3) * (*pc1);
15949371c9d4SSatish Balay           *pc3 = mul3;
159568785679SHong Zhang           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
159668785679SHong Zhang           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
159768785679SHong Zhang           for (j = 0; j < nz; j++) {
159868785679SHong Zhang             col = pj[j];
1599b89f182dSHong Zhang             rtmp2[col] -= mul2 * rtmp1[col];
1600b89f182dSHong Zhang             rtmp3[col] -= mul3 * rtmp1[col];
160168785679SHong Zhang           }
16029566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(2 + 4.0 * nz));
160368785679SHong Zhang         }
160468785679SHong Zhang 
1605b89f182dSHong Zhang         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1606b89f182dSHong Zhang         rs = 0.0;
1607b89f182dSHong Zhang         /* L part */
1608b89f182dSHong Zhang         pc2 = b->a + bi[i + 1];
1609b89f182dSHong Zhang         pj  = b->j + bi[i + 1];
1610b89f182dSHong Zhang         nz  = bi[i + 2] - bi[i + 1];
1611b89f182dSHong Zhang         for (j = 0; j < nz; j++) {
1612b89f182dSHong Zhang           col    = pj[j];
16139371c9d4SSatish Balay           pc2[j] = rtmp2[col];
16149371c9d4SSatish Balay           rs += PetscAbsScalar(pc2[j]);
1615b89f182dSHong Zhang         }
1616b89f182dSHong Zhang         /* U part */
1617b89f182dSHong Zhang         pc2 = b->a + bdiag[i + 2] + 1;
16180e7a5c2bSHong Zhang         pj  = b->j + bdiag[i + 2] + 1;
16190e7a5c2bSHong Zhang         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1620b89f182dSHong Zhang         for (j = 0; j < nz; j++) {
1621b89f182dSHong Zhang           col    = pj[j];
16229371c9d4SSatish Balay           pc2[j] = rtmp2[col];
16239371c9d4SSatish Balay           rs += PetscAbsScalar(pc2[j]);
1624b89f182dSHong Zhang         }
1625b89f182dSHong Zhang 
1626b89f182dSHong Zhang         sctx.rs = rs;
1627b89f182dSHong Zhang         sctx.pv = rtmp2[i + 1];
16289566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
162907b50cabSHong Zhang         if (sctx.newshift) break;
1630b89f182dSHong Zhang         pc2  = b->a + bdiag[i + 1];
1631b89f182dSHong Zhang         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1632b89f182dSHong Zhang 
1633b89f182dSHong Zhang         /* Now take care of 2nd column of diagonal 3x3 block. */
1634b89f182dSHong Zhang         pc3 = rtmp3 + i + 1;
163568785679SHong Zhang         if (*pc3 != 0.0) {
16369371c9d4SSatish Balay           mul3 = (*pc3) * (*pc2);
16379371c9d4SSatish Balay           *pc3 = mul3;
163868785679SHong Zhang           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
163968785679SHong Zhang           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
164068785679SHong Zhang           for (j = 0; j < nz; j++) {
164168785679SHong Zhang             col = pj[j];
1642b89f182dSHong Zhang             rtmp3[col] -= mul3 * rtmp2[col];
164368785679SHong Zhang           }
16449566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(1 + 2.0 * nz));
164568785679SHong Zhang         }
164668785679SHong Zhang 
1647b89f182dSHong Zhang         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
164868785679SHong Zhang         rs = 0.0;
164968785679SHong Zhang         /* L part */
1650b89f182dSHong Zhang         pc3 = b->a + bi[i + 2];
1651b89f182dSHong Zhang         pj  = b->j + bi[i + 2];
1652b89f182dSHong Zhang         nz  = bi[i + 3] - bi[i + 2];
165368785679SHong Zhang         for (j = 0; j < nz; j++) {
165468785679SHong Zhang           col    = pj[j];
16559371c9d4SSatish Balay           pc3[j] = rtmp3[col];
16569371c9d4SSatish Balay           rs += PetscAbsScalar(pc3[j]);
165768785679SHong Zhang         }
165868785679SHong Zhang         /* U part */
1659b89f182dSHong Zhang         pc3 = b->a + bdiag[i + 3] + 1;
16600e7a5c2bSHong Zhang         pj  = b->j + bdiag[i + 3] + 1;
16610e7a5c2bSHong Zhang         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
166268785679SHong Zhang         for (j = 0; j < nz; j++) {
166368785679SHong Zhang           col    = pj[j];
16649371c9d4SSatish Balay           pc3[j] = rtmp3[col];
16659371c9d4SSatish Balay           rs += PetscAbsScalar(pc3[j]);
166668785679SHong Zhang         }
166768785679SHong Zhang 
166868785679SHong Zhang         sctx.rs = rs;
1669b89f182dSHong Zhang         sctx.pv = rtmp3[i + 2];
16709566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
167107b50cabSHong Zhang         if (sctx.newshift) break;
167268785679SHong Zhang         pc3  = b->a + bdiag[i + 2];
1673b89f182dSHong Zhang         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
167468785679SHong Zhang         break;
16759877982aSShri Abhyankar       case 4:
16769877982aSShri Abhyankar         /*----------*/
16779877982aSShri Abhyankar         /* zero rtmp */
16789877982aSShri Abhyankar         /* L part */
16799877982aSShri Abhyankar         nz    = bi[i + 1] - bi[i];
16809877982aSShri Abhyankar         bjtmp = bj + bi[i];
16819877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
16829877982aSShri Abhyankar           col        = bjtmp[j];
16839371c9d4SSatish Balay           rtmp1[col] = 0.0;
16849371c9d4SSatish Balay           rtmp2[col] = 0.0;
16859371c9d4SSatish Balay           rtmp3[col] = 0.0;
16869371c9d4SSatish Balay           rtmp4[col] = 0.0;
16879877982aSShri Abhyankar         }
16889877982aSShri Abhyankar 
16899877982aSShri Abhyankar         /* U part */
16909877982aSShri Abhyankar         nz    = bdiag[i] - bdiag[i + 1];
16919877982aSShri Abhyankar         bjtmp = bj + bdiag[i + 1] + 1;
16929877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
16939877982aSShri Abhyankar           col        = bjtmp[j];
16949371c9d4SSatish Balay           rtmp1[col] = 0.0;
16959371c9d4SSatish Balay           rtmp2[col] = 0.0;
16969371c9d4SSatish Balay           rtmp3[col] = 0.0;
16979371c9d4SSatish Balay           rtmp4[col] = 0.0;
16989877982aSShri Abhyankar         }
16999877982aSShri Abhyankar 
17009877982aSShri Abhyankar         /* load in initial (unfactored row) */
17019877982aSShri Abhyankar         nz    = ai[r[i] + 1] - ai[r[i]];
17029877982aSShri Abhyankar         ajtmp = aj + ai[r[i]];
17039371c9d4SSatish Balay         v1    = aa + ai[r[i]];
17049371c9d4SSatish Balay         v2    = aa + ai[r[i] + 1];
17059371c9d4SSatish Balay         v3    = aa + ai[r[i] + 2];
17069371c9d4SSatish Balay         v4    = aa + ai[r[i] + 3];
17079877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
17089877982aSShri Abhyankar           col        = ics[ajtmp[j]];
17099371c9d4SSatish Balay           rtmp1[col] = v1[j];
17109371c9d4SSatish Balay           rtmp2[col] = v2[j];
17119371c9d4SSatish Balay           rtmp3[col] = v3[j];
17129371c9d4SSatish Balay           rtmp4[col] = v4[j];
17139877982aSShri Abhyankar         }
17149877982aSShri Abhyankar         /* ZeropivotApply(): shift the diagonal of the matrix  */
17159371c9d4SSatish Balay         rtmp1[i] += sctx.shift_amount;
17169371c9d4SSatish Balay         rtmp2[i + 1] += sctx.shift_amount;
17179371c9d4SSatish Balay         rtmp3[i + 2] += sctx.shift_amount;
17189371c9d4SSatish Balay         rtmp4[i + 3] += sctx.shift_amount;
17199877982aSShri Abhyankar 
17209877982aSShri Abhyankar         /* elimination */
17219877982aSShri Abhyankar         bjtmp = bj + bi[i];
17229877982aSShri Abhyankar         row   = *bjtmp++; /* pivot row */
17239877982aSShri Abhyankar         nzL   = bi[i + 1] - bi[i];
17249877982aSShri Abhyankar         for (k = 0; k < nzL; k++) {
17259877982aSShri Abhyankar           pc1 = rtmp1 + row;
17269877982aSShri Abhyankar           pc2 = rtmp2 + row;
17279877982aSShri Abhyankar           pc3 = rtmp3 + row;
17289877982aSShri Abhyankar           pc4 = rtmp4 + row;
17299877982aSShri Abhyankar           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
17309877982aSShri Abhyankar             pv   = b->a + bdiag[row];
17319371c9d4SSatish Balay             mul1 = *pc1 * (*pv);
17329371c9d4SSatish Balay             mul2 = *pc2 * (*pv);
17339371c9d4SSatish Balay             mul3 = *pc3 * (*pv);
17349371c9d4SSatish Balay             mul4 = *pc4 * (*pv);
17359371c9d4SSatish Balay             *pc1 = mul1;
17369371c9d4SSatish Balay             *pc2 = mul2;
17379371c9d4SSatish Balay             *pc3 = mul3;
17389371c9d4SSatish Balay             *pc4 = mul4;
17399877982aSShri Abhyankar 
17409877982aSShri Abhyankar             pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
17419877982aSShri Abhyankar             pv = b->a + bdiag[row + 1] + 1;
17429877982aSShri Abhyankar             nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
17439877982aSShri Abhyankar             for (j = 0; j < nz; j++) {
17449877982aSShri Abhyankar               col = pj[j];
17459877982aSShri Abhyankar               rtmp1[col] -= mul1 * pv[j];
17469877982aSShri Abhyankar               rtmp2[col] -= mul2 * pv[j];
17479877982aSShri Abhyankar               rtmp3[col] -= mul3 * pv[j];
17489877982aSShri Abhyankar               rtmp4[col] -= mul4 * pv[j];
17499877982aSShri Abhyankar             }
17509566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(4 + 8.0 * nz));
17519877982aSShri Abhyankar           }
17529877982aSShri Abhyankar           row = *bjtmp++;
17539877982aSShri Abhyankar         }
17549877982aSShri Abhyankar 
17559877982aSShri Abhyankar         /* finished row i; check zero pivot, then stick row i into b->a */
17569877982aSShri Abhyankar         rs = 0.0;
17579877982aSShri Abhyankar         /* L part */
17589877982aSShri Abhyankar         pc1 = b->a + bi[i];
17599877982aSShri Abhyankar         pj  = b->j + bi[i];
17609877982aSShri Abhyankar         nz  = bi[i + 1] - bi[i];
17619877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
17629877982aSShri Abhyankar           col    = pj[j];
17639371c9d4SSatish Balay           pc1[j] = rtmp1[col];
17649371c9d4SSatish Balay           rs += PetscAbsScalar(pc1[j]);
17659877982aSShri Abhyankar         }
17669877982aSShri Abhyankar         /* U part */
17679877982aSShri Abhyankar         pc1 = b->a + bdiag[i + 1] + 1;
17689877982aSShri Abhyankar         pj  = b->j + bdiag[i + 1] + 1;
17699877982aSShri Abhyankar         nz  = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
17709877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
17719877982aSShri Abhyankar           col    = pj[j];
17729371c9d4SSatish Balay           pc1[j] = rtmp1[col];
17739371c9d4SSatish Balay           rs += PetscAbsScalar(pc1[j]);
17749877982aSShri Abhyankar         }
17759877982aSShri Abhyankar 
17769877982aSShri Abhyankar         sctx.rs = rs;
17779877982aSShri Abhyankar         sctx.pv = rtmp1[i];
17789566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i));
177907b50cabSHong Zhang         if (sctx.newshift) break;
17809877982aSShri Abhyankar         pc1  = b->a + bdiag[i]; /* Mark diag[i] */
17819877982aSShri Abhyankar         *pc1 = 1.0 / sctx.pv;
17829877982aSShri Abhyankar 
17839877982aSShri Abhyankar         /* Now take care of 1st column of diagonal 4x4 block. */
17849877982aSShri Abhyankar         pc2 = rtmp2 + i;
17859877982aSShri Abhyankar         pc3 = rtmp3 + i;
17869877982aSShri Abhyankar         pc4 = rtmp4 + i;
17879877982aSShri Abhyankar         if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
17889371c9d4SSatish Balay           mul2 = (*pc2) * (*pc1);
17899371c9d4SSatish Balay           *pc2 = mul2;
17909371c9d4SSatish Balay           mul3 = (*pc3) * (*pc1);
17919371c9d4SSatish Balay           *pc3 = mul3;
17929371c9d4SSatish Balay           mul4 = (*pc4) * (*pc1);
17939371c9d4SSatish Balay           *pc4 = mul4;
17949877982aSShri Abhyankar           pj   = b->j + bdiag[i + 1] + 1;     /* beginning of U(i,:) */
17959877982aSShri Abhyankar           nz   = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
17969877982aSShri Abhyankar           for (j = 0; j < nz; j++) {
17979877982aSShri Abhyankar             col = pj[j];
17989877982aSShri Abhyankar             rtmp2[col] -= mul2 * rtmp1[col];
17999877982aSShri Abhyankar             rtmp3[col] -= mul3 * rtmp1[col];
18009877982aSShri Abhyankar             rtmp4[col] -= mul4 * rtmp1[col];
18019877982aSShri Abhyankar           }
18029566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(3 + 6.0 * nz));
18039877982aSShri Abhyankar         }
18049877982aSShri Abhyankar 
18059877982aSShri Abhyankar         /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
18069877982aSShri Abhyankar         rs = 0.0;
18079877982aSShri Abhyankar         /* L part */
18089877982aSShri Abhyankar         pc2 = b->a + bi[i + 1];
18099877982aSShri Abhyankar         pj  = b->j + bi[i + 1];
18109877982aSShri Abhyankar         nz  = bi[i + 2] - bi[i + 1];
18119877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
18129877982aSShri Abhyankar           col    = pj[j];
18139371c9d4SSatish Balay           pc2[j] = rtmp2[col];
18149371c9d4SSatish Balay           rs += PetscAbsScalar(pc2[j]);
18159877982aSShri Abhyankar         }
18169877982aSShri Abhyankar         /* U part */
18179877982aSShri Abhyankar         pc2 = b->a + bdiag[i + 2] + 1;
18189877982aSShri Abhyankar         pj  = b->j + bdiag[i + 2] + 1;
18199877982aSShri Abhyankar         nz  = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
18209877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
18219877982aSShri Abhyankar           col    = pj[j];
18229371c9d4SSatish Balay           pc2[j] = rtmp2[col];
18239371c9d4SSatish Balay           rs += PetscAbsScalar(pc2[j]);
18249877982aSShri Abhyankar         }
18259877982aSShri Abhyankar 
18269877982aSShri Abhyankar         sctx.rs = rs;
18279877982aSShri Abhyankar         sctx.pv = rtmp2[i + 1];
18289566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
182907b50cabSHong Zhang         if (sctx.newshift) break;
18309877982aSShri Abhyankar         pc2  = b->a + bdiag[i + 1];
18319877982aSShri Abhyankar         *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
18329877982aSShri Abhyankar 
18339877982aSShri Abhyankar         /* Now take care of 2nd column of diagonal 4x4 block. */
18349877982aSShri Abhyankar         pc3 = rtmp3 + i + 1;
18359877982aSShri Abhyankar         pc4 = rtmp4 + i + 1;
18369877982aSShri Abhyankar         if (*pc3 != 0.0 || *pc4 != 0.0) {
18379371c9d4SSatish Balay           mul3 = (*pc3) * (*pc2);
18389371c9d4SSatish Balay           *pc3 = mul3;
18399371c9d4SSatish Balay           mul4 = (*pc4) * (*pc2);
18409371c9d4SSatish Balay           *pc4 = mul4;
18419877982aSShri Abhyankar           pj   = b->j + bdiag[i + 2] + 1;         /* beginning of U(i+1,:) */
18429877982aSShri Abhyankar           nz   = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
18439877982aSShri Abhyankar           for (j = 0; j < nz; j++) {
18449877982aSShri Abhyankar             col = pj[j];
18459877982aSShri Abhyankar             rtmp3[col] -= mul3 * rtmp2[col];
18469877982aSShri Abhyankar             rtmp4[col] -= mul4 * rtmp2[col];
18479877982aSShri Abhyankar           }
18489566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(4.0 * nz));
18499877982aSShri Abhyankar         }
18509877982aSShri Abhyankar 
18519877982aSShri Abhyankar         /* finished i+2; check zero pivot, then stick row i+2 into b->a */
18529877982aSShri Abhyankar         rs = 0.0;
18539877982aSShri Abhyankar         /* L part */
18549877982aSShri Abhyankar         pc3 = b->a + bi[i + 2];
18559877982aSShri Abhyankar         pj  = b->j + bi[i + 2];
18569877982aSShri Abhyankar         nz  = bi[i + 3] - bi[i + 2];
18579877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
18589877982aSShri Abhyankar           col    = pj[j];
18599371c9d4SSatish Balay           pc3[j] = rtmp3[col];
18609371c9d4SSatish Balay           rs += PetscAbsScalar(pc3[j]);
18619877982aSShri Abhyankar         }
18629877982aSShri Abhyankar         /* U part */
18639877982aSShri Abhyankar         pc3 = b->a + bdiag[i + 3] + 1;
18649877982aSShri Abhyankar         pj  = b->j + bdiag[i + 3] + 1;
18659877982aSShri Abhyankar         nz  = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
18669877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
18679877982aSShri Abhyankar           col    = pj[j];
18689371c9d4SSatish Balay           pc3[j] = rtmp3[col];
18699371c9d4SSatish Balay           rs += PetscAbsScalar(pc3[j]);
18709877982aSShri Abhyankar         }
18719877982aSShri Abhyankar 
18729877982aSShri Abhyankar         sctx.rs = rs;
18739877982aSShri Abhyankar         sctx.pv = rtmp3[i + 2];
18749566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
187507b50cabSHong Zhang         if (sctx.newshift) break;
18769877982aSShri Abhyankar         pc3  = b->a + bdiag[i + 2];
18779877982aSShri Abhyankar         *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
18789877982aSShri Abhyankar 
18799877982aSShri Abhyankar         /* Now take care of 3rd column of diagonal 4x4 block. */
18809877982aSShri Abhyankar         pc4 = rtmp4 + i + 2;
18819877982aSShri Abhyankar         if (*pc4 != 0.0) {
18829371c9d4SSatish Balay           mul4 = (*pc4) * (*pc3);
18839371c9d4SSatish Balay           *pc4 = mul4;
18849877982aSShri Abhyankar           pj   = b->j + bdiag[i + 3] + 1;         /* beginning of U(i+2,:) */
18859877982aSShri Abhyankar           nz   = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */
18869877982aSShri Abhyankar           for (j = 0; j < nz; j++) {
18879877982aSShri Abhyankar             col = pj[j];
18889877982aSShri Abhyankar             rtmp4[col] -= mul4 * rtmp3[col];
18899877982aSShri Abhyankar           }
18909566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(1 + 2.0 * nz));
18919877982aSShri Abhyankar         }
18929877982aSShri Abhyankar 
18939877982aSShri Abhyankar         /* finished i+3; check zero pivot, then stick row i+3 into b->a */
18949877982aSShri Abhyankar         rs = 0.0;
18959877982aSShri Abhyankar         /* L part */
18969877982aSShri Abhyankar         pc4 = b->a + bi[i + 3];
18979877982aSShri Abhyankar         pj  = b->j + bi[i + 3];
18989877982aSShri Abhyankar         nz  = bi[i + 4] - bi[i + 3];
18999877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
19009877982aSShri Abhyankar           col    = pj[j];
19019371c9d4SSatish Balay           pc4[j] = rtmp4[col];
19029371c9d4SSatish Balay           rs += PetscAbsScalar(pc4[j]);
19039877982aSShri Abhyankar         }
19049877982aSShri Abhyankar         /* U part */
19059877982aSShri Abhyankar         pc4 = b->a + bdiag[i + 4] + 1;
19069877982aSShri Abhyankar         pj  = b->j + bdiag[i + 4] + 1;
19079877982aSShri Abhyankar         nz  = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */
19089877982aSShri Abhyankar         for (j = 0; j < nz; j++) {
19099877982aSShri Abhyankar           col    = pj[j];
19109371c9d4SSatish Balay           pc4[j] = rtmp4[col];
19119371c9d4SSatish Balay           rs += PetscAbsScalar(pc4[j]);
19129877982aSShri Abhyankar         }
19139877982aSShri Abhyankar 
19149877982aSShri Abhyankar         sctx.rs = rs;
19159877982aSShri Abhyankar         sctx.pv = rtmp4[i + 3];
19169566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3));
191707b50cabSHong Zhang         if (sctx.newshift) break;
19189877982aSShri Abhyankar         pc4  = b->a + bdiag[i + 3];
19199877982aSShri Abhyankar         *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */
19209877982aSShri Abhyankar         break;
192168785679SHong Zhang 
1922d71ae5a4SJacob Faibussowitsch       default:
1923d71ae5a4SJacob Faibussowitsch         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
192428f1b45aSHong Zhang       }
1925c2b86aeeSHong Zhang       if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
192628f1b45aSHong Zhang       i += nodesz;              /* Update the row */
192768785679SHong Zhang     }
192828f1b45aSHong Zhang 
192928f1b45aSHong Zhang     /* MatPivotRefine() */
193007b50cabSHong Zhang     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
193128f1b45aSHong Zhang       /*
193228f1b45aSHong Zhang        * if no shift in this attempt & shifting & started shifting & can refine,
193328f1b45aSHong Zhang        * then try lower shift
193428f1b45aSHong Zhang        */
193528f1b45aSHong Zhang       sctx.shift_hi       = sctx.shift_fraction;
193628f1b45aSHong Zhang       sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
193728f1b45aSHong Zhang       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
193807b50cabSHong Zhang       sctx.newshift       = PETSC_TRUE;
193928f1b45aSHong Zhang       sctx.nshift++;
194028f1b45aSHong Zhang     }
194107b50cabSHong Zhang   } while (sctx.newshift);
194228f1b45aSHong Zhang 
19439566063dSJacob Faibussowitsch   PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4));
19449566063dSJacob Faibussowitsch   PetscCall(PetscFree(tmp_vec2));
19459566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isicol, &ic));
19469566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &r));
194728f1b45aSHong Zhang 
1948abb87a52SBarry Smith   if (b->inode.size) {
1949abb87a52SBarry Smith     C->ops->solve = MatSolve_SeqAIJ_Inode;
1950abb87a52SBarry Smith   } else {
1951d3ac4fa3SBarry Smith     C->ops->solve = MatSolve_SeqAIJ;
1952abb87a52SBarry Smith   }
195328f1b45aSHong Zhang   C->ops->solveadd          = MatSolveAdd_SeqAIJ;
195428f1b45aSHong Zhang   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ;
195528f1b45aSHong Zhang   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
195628f1b45aSHong Zhang   C->ops->matsolve          = MatMatSolve_SeqAIJ;
195728f1b45aSHong Zhang   C->assembled              = PETSC_TRUE;
195828f1b45aSHong Zhang   C->preallocated           = PETSC_TRUE;
19592205254eSKarl Rupp 
19609566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(C->cmap->n));
196128f1b45aSHong Zhang 
196228f1b45aSHong Zhang   /* MatShiftView(A,info,&sctx) */
196328f1b45aSHong Zhang   if (sctx.nshift) {
1964f4db908eSBarry Smith     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
19659566063dSJacob Faibussowitsch       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
1966f4db908eSBarry Smith     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
19679566063dSJacob Faibussowitsch       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1968f4db908eSBarry Smith     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
19699566063dSJacob Faibussowitsch       PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
197028f1b45aSHong Zhang     }
197128f1b45aSHong Zhang   }
197228f1b45aSHong Zhang   PetscFunctionReturn(0);
197328f1b45aSHong Zhang }
1974628f99d7SShri Abhyankar 
1975d71ae5a4SJacob Faibussowitsch PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info)
1976d71ae5a4SJacob Faibussowitsch {
1977628f99d7SShri Abhyankar   Mat              C = B;
1978628f99d7SShri Abhyankar   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
1979628f99d7SShri Abhyankar   IS               iscol = b->col, isrow = b->row, isicol = b->icol;
1980628f99d7SShri Abhyankar   const PetscInt  *r, *ic, *c, *ics;
1981628f99d7SShri Abhyankar   PetscInt         n = A->rmap->n, *bi = b->i;
1982628f99d7SShri Abhyankar   PetscInt        *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow;
19838758e1faSBarry Smith   PetscInt         i, j, idx, *bd = b->diag, node_max, nodesz;
19848758e1faSBarry Smith   PetscInt        *ai = a->i, *aj = a->j;
1985628f99d7SShri Abhyankar   PetscInt        *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj;
1986628f99d7SShri Abhyankar   PetscScalar      mul1, mul2, mul3, tmp;
1987628f99d7SShri Abhyankar   MatScalar       *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33;
1988628f99d7SShri Abhyankar   const MatScalar *v1, *v2, *v3, *aa    = a->a, *rtmp1;
1989628f99d7SShri Abhyankar   PetscReal        rs = 0.0;
1990628f99d7SShri Abhyankar   FactorShiftCtx   sctx;
1991628f99d7SShri Abhyankar 
1992628f99d7SShri Abhyankar   PetscFunctionBegin;
1993628f99d7SShri Abhyankar   sctx.shift_top      = 0;
1994628f99d7SShri Abhyankar   sctx.nshift_max     = 0;
1995628f99d7SShri Abhyankar   sctx.shift_lo       = 0;
1996628f99d7SShri Abhyankar   sctx.shift_hi       = 0;
1997628f99d7SShri Abhyankar   sctx.shift_fraction = 0;
1998628f99d7SShri Abhyankar 
1999628f99d7SShri Abhyankar   /* if both shift schemes are chosen by user, only use info->shiftpd */
2000f4db908eSBarry Smith   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
2001628f99d7SShri Abhyankar     sctx.shift_top = 0;
2002628f99d7SShri Abhyankar     for (i = 0; i < n; i++) {
2003628f99d7SShri Abhyankar       /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */
2004628f99d7SShri Abhyankar       rs    = 0.0;
2005628f99d7SShri Abhyankar       ajtmp = aj + ai[i];
2006628f99d7SShri Abhyankar       rtmp1 = aa + ai[i];
2007628f99d7SShri Abhyankar       nz    = ai[i + 1] - ai[i];
2008628f99d7SShri Abhyankar       for (j = 0; j < nz; j++) {
2009628f99d7SShri Abhyankar         if (*ajtmp != i) {
2010628f99d7SShri Abhyankar           rs += PetscAbsScalar(*rtmp1++);
2011628f99d7SShri Abhyankar         } else {
2012628f99d7SShri Abhyankar           rs -= PetscRealPart(*rtmp1++);
2013628f99d7SShri Abhyankar         }
2014628f99d7SShri Abhyankar         ajtmp++;
2015628f99d7SShri Abhyankar       }
2016628f99d7SShri Abhyankar       if (rs > sctx.shift_top) sctx.shift_top = rs;
2017628f99d7SShri Abhyankar     }
2018628f99d7SShri Abhyankar     if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12;
2019628f99d7SShri Abhyankar     sctx.shift_top *= 1.1;
2020628f99d7SShri Abhyankar     sctx.nshift_max = 5;
2021628f99d7SShri Abhyankar     sctx.shift_lo   = 0.;
2022628f99d7SShri Abhyankar     sctx.shift_hi   = 1.;
2023628f99d7SShri Abhyankar   }
2024628f99d7SShri Abhyankar   sctx.shift_amount = 0;
2025628f99d7SShri Abhyankar   sctx.nshift       = 0;
2026628f99d7SShri Abhyankar 
20279566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow, &r));
20289566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol, &c));
20299566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isicol, &ic));
20309566063dSJacob Faibussowitsch   PetscCall(PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33));
2031628f99d7SShri Abhyankar   ics = ic;
2032628f99d7SShri Abhyankar 
2033628f99d7SShri Abhyankar   node_max = a->inode.node_count;
2034628f99d7SShri Abhyankar   ns       = a->inode.size;
203528b400f6SJacob Faibussowitsch   PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
2036628f99d7SShri Abhyankar 
2037628f99d7SShri Abhyankar   /* If max inode size > 3, split it into two inodes.*/
2038628f99d7SShri Abhyankar   /* also map the inode sizes according to the ordering */
20399566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
2040628f99d7SShri Abhyankar   for (i = 0, j = 0; i < node_max; ++i, ++j) {
2041628f99d7SShri Abhyankar     if (ns[i] > 3) {
2042628f99d7SShri Abhyankar       tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5  */
2043628f99d7SShri Abhyankar       ++j;
2044628f99d7SShri Abhyankar       tmp_vec1[j] = ns[i] - tmp_vec1[j - 1];
2045628f99d7SShri Abhyankar     } else {
2046628f99d7SShri Abhyankar       tmp_vec1[j] = ns[i];
2047628f99d7SShri Abhyankar     }
2048628f99d7SShri Abhyankar   }
2049628f99d7SShri Abhyankar   /* Use the correct node_max */
2050628f99d7SShri Abhyankar   node_max = j;
2051628f99d7SShri Abhyankar 
2052628f99d7SShri Abhyankar   /* Now reorder the inode info based on mat re-ordering info */
2053628f99d7SShri Abhyankar   /* First create a row -> inode_size_array_index map */
20549566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2));
2055628f99d7SShri Abhyankar   for (i = 0, row = 0; i < node_max; i++) {
2056628f99d7SShri Abhyankar     nodesz = tmp_vec1[i];
2057ad540459SPierre Jolivet     for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
2058628f99d7SShri Abhyankar   }
2059628f99d7SShri Abhyankar   /* Using nsmap, create a reordered ns structure */
2060628f99d7SShri Abhyankar   for (i = 0, j = 0; i < node_max; i++) {
2061628f99d7SShri Abhyankar     nodesz      = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
2062628f99d7SShri Abhyankar     tmp_vec2[i] = nodesz;
2063628f99d7SShri Abhyankar     j += nodesz;
2064628f99d7SShri Abhyankar   }
20659566063dSJacob Faibussowitsch   PetscCall(PetscFree2(nsmap, tmp_vec1));
2066628f99d7SShri Abhyankar   /* Now use the correct ns */
2067628f99d7SShri Abhyankar   ns = tmp_vec2;
2068628f99d7SShri Abhyankar 
2069628f99d7SShri Abhyankar   do {
207007b50cabSHong Zhang     sctx.newshift = PETSC_FALSE;
2071628f99d7SShri Abhyankar     /* Now loop over each block-row, and do the factorization */
2072628f99d7SShri Abhyankar     for (i = 0, row = 0; i < node_max; i++) {
2073628f99d7SShri Abhyankar       nodesz = ns[i];
2074628f99d7SShri Abhyankar       nz     = bi[row + 1] - bi[row];
2075628f99d7SShri Abhyankar       bjtmp  = bj + bi[row];
2076628f99d7SShri Abhyankar 
2077628f99d7SShri Abhyankar       switch (nodesz) {
2078628f99d7SShri Abhyankar       case 1:
2079628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2080628f99d7SShri Abhyankar           idx         = bjtmp[j];
2081628f99d7SShri Abhyankar           rtmp11[idx] = 0.0;
2082628f99d7SShri Abhyankar         }
2083628f99d7SShri Abhyankar 
2084628f99d7SShri Abhyankar         /* load in initial (unfactored row) */
2085628f99d7SShri Abhyankar         idx    = r[row];
2086628f99d7SShri Abhyankar         nz_tmp = ai[idx + 1] - ai[idx];
2087628f99d7SShri Abhyankar         ajtmp  = aj + ai[idx];
2088628f99d7SShri Abhyankar         v1     = aa + ai[idx];
2089628f99d7SShri Abhyankar 
2090628f99d7SShri Abhyankar         for (j = 0; j < nz_tmp; j++) {
2091628f99d7SShri Abhyankar           idx         = ics[ajtmp[j]];
2092628f99d7SShri Abhyankar           rtmp11[idx] = v1[j];
2093628f99d7SShri Abhyankar         }
2094628f99d7SShri Abhyankar         rtmp11[ics[r[row]]] += sctx.shift_amount;
2095628f99d7SShri Abhyankar 
2096628f99d7SShri Abhyankar         prow = *bjtmp++;
2097628f99d7SShri Abhyankar         while (prow < row) {
2098628f99d7SShri Abhyankar           pc1 = rtmp11 + prow;
2099628f99d7SShri Abhyankar           if (*pc1 != 0.0) {
2100628f99d7SShri Abhyankar             pv     = ba + bd[prow];
2101628f99d7SShri Abhyankar             pj     = nbj + bd[prow];
2102628f99d7SShri Abhyankar             mul1   = *pc1 * *pv++;
2103628f99d7SShri Abhyankar             *pc1   = mul1;
2104628f99d7SShri Abhyankar             nz_tmp = bi[prow + 1] - bd[prow] - 1;
21059566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2106628f99d7SShri Abhyankar             for (j = 0; j < nz_tmp; j++) {
2107628f99d7SShri Abhyankar               tmp = pv[j];
2108628f99d7SShri Abhyankar               idx = pj[j];
2109628f99d7SShri Abhyankar               rtmp11[idx] -= mul1 * tmp;
2110628f99d7SShri Abhyankar             }
2111628f99d7SShri Abhyankar           }
2112628f99d7SShri Abhyankar           prow = *bjtmp++;
2113628f99d7SShri Abhyankar         }
2114628f99d7SShri Abhyankar         pj  = bj + bi[row];
2115628f99d7SShri Abhyankar         pc1 = ba + bi[row];
2116628f99d7SShri Abhyankar 
2117628f99d7SShri Abhyankar         sctx.pv     = rtmp11[row];
2118628f99d7SShri Abhyankar         rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */
2119628f99d7SShri Abhyankar         rs          = 0.0;
2120628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2121628f99d7SShri Abhyankar           idx    = pj[j];
2122628f99d7SShri Abhyankar           pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */
2123628f99d7SShri Abhyankar           if (idx != row) rs += PetscAbsScalar(pc1[j]);
2124628f99d7SShri Abhyankar         }
2125628f99d7SShri Abhyankar         sctx.rs = rs;
21269566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
212707b50cabSHong Zhang         if (sctx.newshift) goto endofwhile;
2128628f99d7SShri Abhyankar         break;
2129628f99d7SShri Abhyankar 
2130628f99d7SShri Abhyankar       case 2:
2131628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2132628f99d7SShri Abhyankar           idx         = bjtmp[j];
2133628f99d7SShri Abhyankar           rtmp11[idx] = 0.0;
2134628f99d7SShri Abhyankar           rtmp22[idx] = 0.0;
2135628f99d7SShri Abhyankar         }
2136628f99d7SShri Abhyankar 
2137628f99d7SShri Abhyankar         /* load in initial (unfactored row) */
2138628f99d7SShri Abhyankar         idx    = r[row];
2139628f99d7SShri Abhyankar         nz_tmp = ai[idx + 1] - ai[idx];
2140628f99d7SShri Abhyankar         ajtmp  = aj + ai[idx];
2141628f99d7SShri Abhyankar         v1     = aa + ai[idx];
2142628f99d7SShri Abhyankar         v2     = aa + ai[idx + 1];
2143628f99d7SShri Abhyankar         for (j = 0; j < nz_tmp; j++) {
2144628f99d7SShri Abhyankar           idx         = ics[ajtmp[j]];
2145628f99d7SShri Abhyankar           rtmp11[idx] = v1[j];
2146628f99d7SShri Abhyankar           rtmp22[idx] = v2[j];
2147628f99d7SShri Abhyankar         }
2148628f99d7SShri Abhyankar         rtmp11[ics[r[row]]] += sctx.shift_amount;
2149628f99d7SShri Abhyankar         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2150628f99d7SShri Abhyankar 
2151628f99d7SShri Abhyankar         prow = *bjtmp++;
2152628f99d7SShri Abhyankar         while (prow < row) {
2153628f99d7SShri Abhyankar           pc1 = rtmp11 + prow;
2154628f99d7SShri Abhyankar           pc2 = rtmp22 + prow;
2155628f99d7SShri Abhyankar           if (*pc1 != 0.0 || *pc2 != 0.0) {
2156628f99d7SShri Abhyankar             pv   = ba + bd[prow];
2157628f99d7SShri Abhyankar             pj   = nbj + bd[prow];
2158628f99d7SShri Abhyankar             mul1 = *pc1 * *pv;
2159628f99d7SShri Abhyankar             mul2 = *pc2 * *pv;
2160628f99d7SShri Abhyankar             ++pv;
2161628f99d7SShri Abhyankar             *pc1 = mul1;
2162628f99d7SShri Abhyankar             *pc2 = mul2;
2163628f99d7SShri Abhyankar 
2164628f99d7SShri Abhyankar             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2165628f99d7SShri Abhyankar             for (j = 0; j < nz_tmp; j++) {
2166628f99d7SShri Abhyankar               tmp = pv[j];
2167628f99d7SShri Abhyankar               idx = pj[j];
2168628f99d7SShri Abhyankar               rtmp11[idx] -= mul1 * tmp;
2169628f99d7SShri Abhyankar               rtmp22[idx] -= mul2 * tmp;
2170628f99d7SShri Abhyankar             }
21719566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2172628f99d7SShri Abhyankar           }
2173628f99d7SShri Abhyankar           prow = *bjtmp++;
2174628f99d7SShri Abhyankar         }
2175628f99d7SShri Abhyankar 
2176628f99d7SShri Abhyankar         /* Now take care of diagonal 2x2 block. Note: prow = row here */
2177628f99d7SShri Abhyankar         pc1 = rtmp11 + prow;
2178628f99d7SShri Abhyankar         pc2 = rtmp22 + prow;
2179628f99d7SShri Abhyankar 
2180628f99d7SShri Abhyankar         sctx.pv = *pc1;
2181628f99d7SShri Abhyankar         pj      = bj + bi[prow];
2182628f99d7SShri Abhyankar         rs      = 0.0;
2183628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2184628f99d7SShri Abhyankar           idx = pj[j];
2185628f99d7SShri Abhyankar           if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]);
2186628f99d7SShri Abhyankar         }
2187628f99d7SShri Abhyankar         sctx.rs = rs;
21889566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
218907b50cabSHong Zhang         if (sctx.newshift) goto endofwhile;
2190628f99d7SShri Abhyankar 
2191628f99d7SShri Abhyankar         if (*pc2 != 0.0) {
2192628f99d7SShri Abhyankar           pj     = nbj + bd[prow];
2193628f99d7SShri Abhyankar           mul2   = (*pc2) / (*pc1); /* since diag is not yet inverted.*/
2194628f99d7SShri Abhyankar           *pc2   = mul2;
2195628f99d7SShri Abhyankar           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2196628f99d7SShri Abhyankar           for (j = 0; j < nz_tmp; j++) {
2197628f99d7SShri Abhyankar             idx = pj[j];
2198628f99d7SShri Abhyankar             tmp = rtmp11[idx];
2199628f99d7SShri Abhyankar             rtmp22[idx] -= mul2 * tmp;
2200628f99d7SShri Abhyankar           }
22019566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2202628f99d7SShri Abhyankar         }
2203628f99d7SShri Abhyankar 
2204628f99d7SShri Abhyankar         pj  = bj + bi[row];
2205628f99d7SShri Abhyankar         pc1 = ba + bi[row];
2206628f99d7SShri Abhyankar         pc2 = ba + bi[row + 1];
2207628f99d7SShri Abhyankar 
2208628f99d7SShri Abhyankar         sctx.pv         = rtmp22[row + 1];
2209628f99d7SShri Abhyankar         rs              = 0.0;
2210628f99d7SShri Abhyankar         rtmp11[row]     = 1.0 / rtmp11[row];
2211628f99d7SShri Abhyankar         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2212628f99d7SShri Abhyankar         /* copy row entries from dense representation to sparse */
2213628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2214628f99d7SShri Abhyankar           idx    = pj[j];
2215628f99d7SShri Abhyankar           pc1[j] = rtmp11[idx];
2216628f99d7SShri Abhyankar           pc2[j] = rtmp22[idx];
2217628f99d7SShri Abhyankar           if (idx != row + 1) rs += PetscAbsScalar(pc2[j]);
2218628f99d7SShri Abhyankar         }
2219628f99d7SShri Abhyankar         sctx.rs = rs;
22209566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
222107b50cabSHong Zhang         if (sctx.newshift) goto endofwhile;
2222628f99d7SShri Abhyankar         break;
2223628f99d7SShri Abhyankar 
2224628f99d7SShri Abhyankar       case 3:
2225628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2226628f99d7SShri Abhyankar           idx         = bjtmp[j];
2227628f99d7SShri Abhyankar           rtmp11[idx] = 0.0;
2228628f99d7SShri Abhyankar           rtmp22[idx] = 0.0;
2229628f99d7SShri Abhyankar           rtmp33[idx] = 0.0;
2230628f99d7SShri Abhyankar         }
2231628f99d7SShri Abhyankar         /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */
2232628f99d7SShri Abhyankar         idx    = r[row];
2233628f99d7SShri Abhyankar         nz_tmp = ai[idx + 1] - ai[idx];
2234628f99d7SShri Abhyankar         ajtmp  = aj + ai[idx];
2235628f99d7SShri Abhyankar         v1     = aa + ai[idx];
2236628f99d7SShri Abhyankar         v2     = aa + ai[idx + 1];
2237628f99d7SShri Abhyankar         v3     = aa + ai[idx + 2];
2238628f99d7SShri Abhyankar         for (j = 0; j < nz_tmp; j++) {
2239628f99d7SShri Abhyankar           idx         = ics[ajtmp[j]];
2240628f99d7SShri Abhyankar           rtmp11[idx] = v1[j];
2241628f99d7SShri Abhyankar           rtmp22[idx] = v2[j];
2242628f99d7SShri Abhyankar           rtmp33[idx] = v3[j];
2243628f99d7SShri Abhyankar         }
2244628f99d7SShri Abhyankar         rtmp11[ics[r[row]]] += sctx.shift_amount;
2245628f99d7SShri Abhyankar         rtmp22[ics[r[row + 1]]] += sctx.shift_amount;
2246628f99d7SShri Abhyankar         rtmp33[ics[r[row + 2]]] += sctx.shift_amount;
2247628f99d7SShri Abhyankar 
2248628f99d7SShri Abhyankar         /* loop over all pivot row blocks above this row block */
2249628f99d7SShri Abhyankar         prow = *bjtmp++;
2250628f99d7SShri Abhyankar         while (prow < row) {
2251628f99d7SShri Abhyankar           pc1 = rtmp11 + prow;
2252628f99d7SShri Abhyankar           pc2 = rtmp22 + prow;
2253628f99d7SShri Abhyankar           pc3 = rtmp33 + prow;
2254628f99d7SShri Abhyankar           if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
2255628f99d7SShri Abhyankar             pv   = ba + bd[prow];
2256628f99d7SShri Abhyankar             pj   = nbj + bd[prow];
2257628f99d7SShri Abhyankar             mul1 = *pc1 * *pv;
2258628f99d7SShri Abhyankar             mul2 = *pc2 * *pv;
2259628f99d7SShri Abhyankar             mul3 = *pc3 * *pv;
2260628f99d7SShri Abhyankar             ++pv;
2261628f99d7SShri Abhyankar             *pc1 = mul1;
2262628f99d7SShri Abhyankar             *pc2 = mul2;
2263628f99d7SShri Abhyankar             *pc3 = mul3;
2264628f99d7SShri Abhyankar 
2265628f99d7SShri Abhyankar             nz_tmp = bi[prow + 1] - bd[prow] - 1;
2266628f99d7SShri Abhyankar             /* update this row based on pivot row */
2267628f99d7SShri Abhyankar             for (j = 0; j < nz_tmp; j++) {
2268628f99d7SShri Abhyankar               tmp = pv[j];
2269628f99d7SShri Abhyankar               idx = pj[j];
2270628f99d7SShri Abhyankar               rtmp11[idx] -= mul1 * tmp;
2271628f99d7SShri Abhyankar               rtmp22[idx] -= mul2 * tmp;
2272628f99d7SShri Abhyankar               rtmp33[idx] -= mul3 * tmp;
2273628f99d7SShri Abhyankar             }
22749566063dSJacob Faibussowitsch             PetscCall(PetscLogFlops(3 + 6.0 * nz_tmp));
2275628f99d7SShri Abhyankar           }
2276628f99d7SShri Abhyankar           prow = *bjtmp++;
2277628f99d7SShri Abhyankar         }
2278628f99d7SShri Abhyankar 
2279628f99d7SShri Abhyankar         /* Now take care of diagonal 3x3 block in this set of rows */
2280628f99d7SShri Abhyankar         /* note: prow = row here */
2281628f99d7SShri Abhyankar         pc1 = rtmp11 + prow;
2282628f99d7SShri Abhyankar         pc2 = rtmp22 + prow;
2283628f99d7SShri Abhyankar         pc3 = rtmp33 + prow;
2284628f99d7SShri Abhyankar 
2285628f99d7SShri Abhyankar         sctx.pv = *pc1;
2286628f99d7SShri Abhyankar         pj      = bj + bi[prow];
2287628f99d7SShri Abhyankar         rs      = 0.0;
2288628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2289628f99d7SShri Abhyankar           idx = pj[j];
2290628f99d7SShri Abhyankar           if (idx != row) rs += PetscAbsScalar(rtmp11[idx]);
2291628f99d7SShri Abhyankar         }
2292628f99d7SShri Abhyankar         sctx.rs = rs;
22939566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, row));
229407b50cabSHong Zhang         if (sctx.newshift) goto endofwhile;
2295628f99d7SShri Abhyankar 
2296628f99d7SShri Abhyankar         if (*pc2 != 0.0 || *pc3 != 0.0) {
2297628f99d7SShri Abhyankar           mul2   = (*pc2) / (*pc1);
2298628f99d7SShri Abhyankar           mul3   = (*pc3) / (*pc1);
2299628f99d7SShri Abhyankar           *pc2   = mul2;
2300628f99d7SShri Abhyankar           *pc3   = mul3;
2301628f99d7SShri Abhyankar           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2302628f99d7SShri Abhyankar           pj     = nbj + bd[prow];
2303628f99d7SShri Abhyankar           for (j = 0; j < nz_tmp; j++) {
2304628f99d7SShri Abhyankar             idx = pj[j];
2305628f99d7SShri Abhyankar             tmp = rtmp11[idx];
2306628f99d7SShri Abhyankar             rtmp22[idx] -= mul2 * tmp;
2307628f99d7SShri Abhyankar             rtmp33[idx] -= mul3 * tmp;
2308628f99d7SShri Abhyankar           }
23099566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp));
2310628f99d7SShri Abhyankar         }
2311628f99d7SShri Abhyankar         ++prow;
2312628f99d7SShri Abhyankar 
2313628f99d7SShri Abhyankar         pc2     = rtmp22 + prow;
2314628f99d7SShri Abhyankar         pc3     = rtmp33 + prow;
2315628f99d7SShri Abhyankar         sctx.pv = *pc2;
2316628f99d7SShri Abhyankar         pj      = bj + bi[prow];
2317628f99d7SShri Abhyankar         rs      = 0.0;
2318628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2319628f99d7SShri Abhyankar           idx = pj[j];
2320628f99d7SShri Abhyankar           if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]);
2321628f99d7SShri Abhyankar         }
2322628f99d7SShri Abhyankar         sctx.rs = rs;
23239566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1));
232407b50cabSHong Zhang         if (sctx.newshift) goto endofwhile;
2325628f99d7SShri Abhyankar 
2326628f99d7SShri Abhyankar         if (*pc3 != 0.0) {
2327628f99d7SShri Abhyankar           mul3   = (*pc3) / (*pc2);
2328628f99d7SShri Abhyankar           *pc3   = mul3;
2329628f99d7SShri Abhyankar           pj     = nbj + bd[prow];
2330628f99d7SShri Abhyankar           nz_tmp = bi[prow + 1] - bd[prow] - 1;
2331628f99d7SShri Abhyankar           for (j = 0; j < nz_tmp; j++) {
2332628f99d7SShri Abhyankar             idx = pj[j];
2333628f99d7SShri Abhyankar             tmp = rtmp22[idx];
2334628f99d7SShri Abhyankar             rtmp33[idx] -= mul3 * tmp;
2335628f99d7SShri Abhyankar           }
23369566063dSJacob Faibussowitsch           PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp));
2337628f99d7SShri Abhyankar         }
2338628f99d7SShri Abhyankar 
2339628f99d7SShri Abhyankar         pj  = bj + bi[row];
2340628f99d7SShri Abhyankar         pc1 = ba + bi[row];
2341628f99d7SShri Abhyankar         pc2 = ba + bi[row + 1];
2342628f99d7SShri Abhyankar         pc3 = ba + bi[row + 2];
2343628f99d7SShri Abhyankar 
2344628f99d7SShri Abhyankar         sctx.pv         = rtmp33[row + 2];
2345628f99d7SShri Abhyankar         rs              = 0.0;
2346628f99d7SShri Abhyankar         rtmp11[row]     = 1.0 / rtmp11[row];
2347628f99d7SShri Abhyankar         rtmp22[row + 1] = 1.0 / rtmp22[row + 1];
2348628f99d7SShri Abhyankar         rtmp33[row + 2] = 1.0 / rtmp33[row + 2];
2349628f99d7SShri Abhyankar         /* copy row entries from dense representation to sparse */
2350628f99d7SShri Abhyankar         for (j = 0; j < nz; j++) {
2351628f99d7SShri Abhyankar           idx    = pj[j];
2352628f99d7SShri Abhyankar           pc1[j] = rtmp11[idx];
2353628f99d7SShri Abhyankar           pc2[j] = rtmp22[idx];
2354628f99d7SShri Abhyankar           pc3[j] = rtmp33[idx];
2355628f99d7SShri Abhyankar           if (idx != row + 2) rs += PetscAbsScalar(pc3[j]);
2356628f99d7SShri Abhyankar         }
2357628f99d7SShri Abhyankar 
2358628f99d7SShri Abhyankar         sctx.rs = rs;
23599566063dSJacob Faibussowitsch         PetscCall(MatPivotCheck(B, A, info, &sctx, row + 2));
236007b50cabSHong Zhang         if (sctx.newshift) goto endofwhile;
2361628f99d7SShri Abhyankar         break;
2362628f99d7SShri Abhyankar 
2363d71ae5a4SJacob Faibussowitsch       default:
2364d71ae5a4SJacob Faibussowitsch         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
2365628f99d7SShri Abhyankar       }
2366628f99d7SShri Abhyankar       row += nodesz; /* Update the row */
2367628f99d7SShri Abhyankar     }
2368628f99d7SShri Abhyankar   endofwhile:;
236907b50cabSHong Zhang   } while (sctx.newshift);
23709566063dSJacob Faibussowitsch   PetscCall(PetscFree3(rtmp11, rtmp22, rtmp33));
23719566063dSJacob Faibussowitsch   PetscCall(PetscFree(tmp_vec2));
23729566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isicol, &ic));
23739566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &r));
23749566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &c));
23752205254eSKarl Rupp 
2376d3ac4fa3SBarry Smith   (B)->ops->solve = MatSolve_SeqAIJ_inplace;
2377628f99d7SShri Abhyankar   /* do not set solve add, since MatSolve_Inode + Add is faster */
2378628f99d7SShri Abhyankar   C->ops->solvetranspose    = MatSolveTranspose_SeqAIJ_inplace;
2379628f99d7SShri Abhyankar   C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace;
2380628f99d7SShri Abhyankar   C->assembled              = PETSC_TRUE;
2381628f99d7SShri Abhyankar   C->preallocated           = PETSC_TRUE;
2382628f99d7SShri Abhyankar   if (sctx.nshift) {
2383f4db908eSBarry Smith     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
23849566063dSJacob Faibussowitsch       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
2385f4db908eSBarry Smith     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
23869566063dSJacob Faibussowitsch       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
2387628f99d7SShri Abhyankar     }
2388628f99d7SShri Abhyankar   }
23899566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(C->cmap->n));
23909566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCheckInode(C));
2391628f99d7SShri Abhyankar   PetscFunctionReturn(0);
2392628f99d7SShri Abhyankar }
2393628f99d7SShri Abhyankar 
2394019b515eSShri Abhyankar /* ----------------------------------------------------------- */
2395d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
2396d71ae5a4SJacob Faibussowitsch {
2397019b515eSShri Abhyankar   Mat_SeqAIJ        *a     = (Mat_SeqAIJ *)A->data;
2398019b515eSShri Abhyankar   IS                 iscol = a->col, isrow = a->row;
2399019b515eSShri Abhyankar   const PetscInt    *r, *c, *rout, *cout;
24008758e1faSBarry Smith   PetscInt           i, j, n = A->rmap->n;
24018758e1faSBarry Smith   PetscInt           node_max, row, nsz, aii, i0, i1, nz;
24028758e1faSBarry Smith   const PetscInt    *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj;
2403019b515eSShri Abhyankar   PetscScalar       *x, *tmp, *tmps, tmp0, tmp1;
2404019b515eSShri Abhyankar   PetscScalar        sum1, sum2, sum3, sum4, sum5;
2405019b515eSShri Abhyankar   const MatScalar   *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
2406019b515eSShri Abhyankar   const PetscScalar *b;
2407019b515eSShri Abhyankar 
2408019b515eSShri Abhyankar   PetscFunctionBegin;
240908401ef6SPierre Jolivet   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2410019b515eSShri Abhyankar   node_max = a->inode.node_count;
2411019b515eSShri Abhyankar   ns       = a->inode.size; /* Node Size array */
2412019b515eSShri Abhyankar 
24139566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
24149566063dSJacob Faibussowitsch   PetscCall(VecGetArrayWrite(xx, &x));
2415019b515eSShri Abhyankar   tmp = a->solve_work;
2416019b515eSShri Abhyankar 
24179371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
24189371c9d4SSatish Balay   r = rout;
24199371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
24209371c9d4SSatish Balay   c = cout;
2421019b515eSShri Abhyankar 
2422019b515eSShri Abhyankar   /* forward solve the lower triangular */
2423019b515eSShri Abhyankar   tmps = tmp;
2424019b515eSShri Abhyankar   aa   = a_a;
2425019b515eSShri Abhyankar   aj   = a_j;
2426019b515eSShri Abhyankar   ad   = a->diag;
2427019b515eSShri Abhyankar 
2428019b515eSShri Abhyankar   for (i = 0, row = 0; i < node_max; ++i) {
2429019b515eSShri Abhyankar     nsz = ns[i];
2430019b515eSShri Abhyankar     aii = ai[row];
2431019b515eSShri Abhyankar     v1  = aa + aii;
2432019b515eSShri Abhyankar     vi  = aj + aii;
2433019b515eSShri Abhyankar     nz  = ai[row + 1] - ai[row];
2434019b515eSShri Abhyankar 
243598991853SShri Abhyankar     if (i < node_max - 1) {
243698991853SShri Abhyankar       /* Prefetch the indices for the next block */
243750d8bf02SJed Brown       PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
243898991853SShri Abhyankar       /* Prefetch the data for the next block */
243950d8bf02SJed Brown       PetscPrefetchBlock(aa + ai[row + nsz], ai[row + nsz + ns[i + 1]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
244098991853SShri Abhyankar     }
244198991853SShri Abhyankar 
2442019b515eSShri Abhyankar     switch (nsz) { /* Each loop in 'case' is unrolled */
2443019b515eSShri Abhyankar     case 1:
2444019b515eSShri Abhyankar       sum1 = b[r[row]];
2445019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2446019b515eSShri Abhyankar         i0   = vi[j];
2447019b515eSShri Abhyankar         i1   = vi[j + 1];
2448019b515eSShri Abhyankar         tmp0 = tmps[i0];
2449019b515eSShri Abhyankar         tmp1 = tmps[i1];
2450019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2451019b515eSShri Abhyankar       }
2452019b515eSShri Abhyankar       if (j == nz - 1) {
2453019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2454019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2455019b515eSShri Abhyankar       }
2456019b515eSShri Abhyankar       tmp[row++] = sum1;
2457019b515eSShri Abhyankar       break;
2458019b515eSShri Abhyankar     case 2:
2459019b515eSShri Abhyankar       sum1 = b[r[row]];
2460019b515eSShri Abhyankar       sum2 = b[r[row + 1]];
2461019b515eSShri Abhyankar       v2   = aa + ai[row + 1];
2462019b515eSShri Abhyankar 
2463019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2464019b515eSShri Abhyankar         i0   = vi[j];
2465019b515eSShri Abhyankar         i1   = vi[j + 1];
2466019b515eSShri Abhyankar         tmp0 = tmps[i0];
2467019b515eSShri Abhyankar         tmp1 = tmps[i1];
2468019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2469019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2470019b515eSShri Abhyankar       }
2471019b515eSShri Abhyankar       if (j == nz - 1) {
2472019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2473019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2474019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0;
2475019b515eSShri Abhyankar       }
2476019b515eSShri Abhyankar       sum2 -= v2[nz] * sum1;
2477019b515eSShri Abhyankar       tmp[row++] = sum1;
2478019b515eSShri Abhyankar       tmp[row++] = sum2;
2479019b515eSShri Abhyankar       break;
2480019b515eSShri Abhyankar     case 3:
2481019b515eSShri Abhyankar       sum1 = b[r[row]];
2482019b515eSShri Abhyankar       sum2 = b[r[row + 1]];
2483019b515eSShri Abhyankar       sum3 = b[r[row + 2]];
2484019b515eSShri Abhyankar       v2   = aa + ai[row + 1];
2485019b515eSShri Abhyankar       v3   = aa + ai[row + 2];
2486019b515eSShri Abhyankar 
2487019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2488019b515eSShri Abhyankar         i0   = vi[j];
2489019b515eSShri Abhyankar         i1   = vi[j + 1];
2490019b515eSShri Abhyankar         tmp0 = tmps[i0];
2491019b515eSShri Abhyankar         tmp1 = tmps[i1];
2492019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2493019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2494019b515eSShri Abhyankar         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2495019b515eSShri Abhyankar       }
2496019b515eSShri Abhyankar       if (j == nz - 1) {
2497019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2498019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2499019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0;
2500019b515eSShri Abhyankar         sum3 -= v3[j] * tmp0;
2501019b515eSShri Abhyankar       }
2502019b515eSShri Abhyankar       sum2 -= v2[nz] * sum1;
2503019b515eSShri Abhyankar       sum3 -= v3[nz] * sum1;
2504019b515eSShri Abhyankar       sum3 -= v3[nz + 1] * sum2;
2505019b515eSShri Abhyankar       tmp[row++] = sum1;
2506019b515eSShri Abhyankar       tmp[row++] = sum2;
2507019b515eSShri Abhyankar       tmp[row++] = sum3;
2508019b515eSShri Abhyankar       break;
2509019b515eSShri Abhyankar 
2510019b515eSShri Abhyankar     case 4:
2511019b515eSShri Abhyankar       sum1 = b[r[row]];
2512019b515eSShri Abhyankar       sum2 = b[r[row + 1]];
2513019b515eSShri Abhyankar       sum3 = b[r[row + 2]];
2514019b515eSShri Abhyankar       sum4 = b[r[row + 3]];
2515019b515eSShri Abhyankar       v2   = aa + ai[row + 1];
2516019b515eSShri Abhyankar       v3   = aa + ai[row + 2];
2517019b515eSShri Abhyankar       v4   = aa + ai[row + 3];
2518019b515eSShri Abhyankar 
2519019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2520019b515eSShri Abhyankar         i0   = vi[j];
2521019b515eSShri Abhyankar         i1   = vi[j + 1];
2522019b515eSShri Abhyankar         tmp0 = tmps[i0];
2523019b515eSShri Abhyankar         tmp1 = tmps[i1];
2524019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2525019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2526019b515eSShri Abhyankar         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2527019b515eSShri Abhyankar         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2528019b515eSShri Abhyankar       }
2529019b515eSShri Abhyankar       if (j == nz - 1) {
2530019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2531019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2532019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0;
2533019b515eSShri Abhyankar         sum3 -= v3[j] * tmp0;
2534019b515eSShri Abhyankar         sum4 -= v4[j] * tmp0;
2535019b515eSShri Abhyankar       }
2536019b515eSShri Abhyankar       sum2 -= v2[nz] * sum1;
2537019b515eSShri Abhyankar       sum3 -= v3[nz] * sum1;
2538019b515eSShri Abhyankar       sum4 -= v4[nz] * sum1;
2539019b515eSShri Abhyankar       sum3 -= v3[nz + 1] * sum2;
2540019b515eSShri Abhyankar       sum4 -= v4[nz + 1] * sum2;
2541019b515eSShri Abhyankar       sum4 -= v4[nz + 2] * sum3;
2542019b515eSShri Abhyankar 
2543019b515eSShri Abhyankar       tmp[row++] = sum1;
2544019b515eSShri Abhyankar       tmp[row++] = sum2;
2545019b515eSShri Abhyankar       tmp[row++] = sum3;
2546019b515eSShri Abhyankar       tmp[row++] = sum4;
2547019b515eSShri Abhyankar       break;
2548019b515eSShri Abhyankar     case 5:
2549019b515eSShri Abhyankar       sum1 = b[r[row]];
2550019b515eSShri Abhyankar       sum2 = b[r[row + 1]];
2551019b515eSShri Abhyankar       sum3 = b[r[row + 2]];
2552019b515eSShri Abhyankar       sum4 = b[r[row + 3]];
2553019b515eSShri Abhyankar       sum5 = b[r[row + 4]];
2554019b515eSShri Abhyankar       v2   = aa + ai[row + 1];
2555019b515eSShri Abhyankar       v3   = aa + ai[row + 2];
2556019b515eSShri Abhyankar       v4   = aa + ai[row + 3];
2557019b515eSShri Abhyankar       v5   = aa + ai[row + 4];
2558019b515eSShri Abhyankar 
2559019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2560019b515eSShri Abhyankar         i0   = vi[j];
2561019b515eSShri Abhyankar         i1   = vi[j + 1];
2562019b515eSShri Abhyankar         tmp0 = tmps[i0];
2563019b515eSShri Abhyankar         tmp1 = tmps[i1];
2564019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2565019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2566019b515eSShri Abhyankar         sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2567019b515eSShri Abhyankar         sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2568019b515eSShri Abhyankar         sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1;
2569019b515eSShri Abhyankar       }
2570019b515eSShri Abhyankar       if (j == nz - 1) {
2571019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2572019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2573019b515eSShri Abhyankar         sum2 -= v2[j] * tmp0;
2574019b515eSShri Abhyankar         sum3 -= v3[j] * tmp0;
2575019b515eSShri Abhyankar         sum4 -= v4[j] * tmp0;
2576019b515eSShri Abhyankar         sum5 -= v5[j] * tmp0;
2577019b515eSShri Abhyankar       }
2578019b515eSShri Abhyankar 
2579019b515eSShri Abhyankar       sum2 -= v2[nz] * sum1;
2580019b515eSShri Abhyankar       sum3 -= v3[nz] * sum1;
2581019b515eSShri Abhyankar       sum4 -= v4[nz] * sum1;
2582019b515eSShri Abhyankar       sum5 -= v5[nz] * sum1;
2583019b515eSShri Abhyankar       sum3 -= v3[nz + 1] * sum2;
2584019b515eSShri Abhyankar       sum4 -= v4[nz + 1] * sum2;
2585019b515eSShri Abhyankar       sum5 -= v5[nz + 1] * sum2;
2586019b515eSShri Abhyankar       sum4 -= v4[nz + 2] * sum3;
2587019b515eSShri Abhyankar       sum5 -= v5[nz + 2] * sum3;
2588019b515eSShri Abhyankar       sum5 -= v5[nz + 3] * sum4;
2589019b515eSShri Abhyankar 
2590019b515eSShri Abhyankar       tmp[row++] = sum1;
2591019b515eSShri Abhyankar       tmp[row++] = sum2;
2592019b515eSShri Abhyankar       tmp[row++] = sum3;
2593019b515eSShri Abhyankar       tmp[row++] = sum4;
2594019b515eSShri Abhyankar       tmp[row++] = sum5;
2595019b515eSShri Abhyankar       break;
2596d71ae5a4SJacob Faibussowitsch     default:
2597d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2598019b515eSShri Abhyankar     }
2599019b515eSShri Abhyankar   }
2600019b515eSShri Abhyankar   /* backward solve the upper triangular */
2601019b515eSShri Abhyankar   for (i = node_max - 1, row = n - 1; i >= 0; i--) {
2602019b515eSShri Abhyankar     nsz = ns[i];
2603019b515eSShri Abhyankar     aii = ad[row + 1] + 1;
2604019b515eSShri Abhyankar     v1  = aa + aii;
2605019b515eSShri Abhyankar     vi  = aj + aii;
2606019b515eSShri Abhyankar     nz  = ad[row] - ad[row + 1] - 1;
260798991853SShri Abhyankar 
260898991853SShri Abhyankar     if (i > 0) {
260998991853SShri Abhyankar       /* Prefetch the indices for the next block */
261050d8bf02SJed Brown       PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
261198991853SShri Abhyankar       /* Prefetch the data for the next block */
261250d8bf02SJed Brown       PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[row - nsz - ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
261398991853SShri Abhyankar     }
261498991853SShri Abhyankar 
2615019b515eSShri Abhyankar     switch (nsz) { /* Each loop in 'case' is unrolled */
2616019b515eSShri Abhyankar     case 1:
2617019b515eSShri Abhyankar       sum1 = tmp[row];
2618019b515eSShri Abhyankar 
2619019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2620019b515eSShri Abhyankar         i0   = vi[j];
2621019b515eSShri Abhyankar         i1   = vi[j + 1];
2622019b515eSShri Abhyankar         tmp0 = tmps[i0];
2623019b515eSShri Abhyankar         tmp1 = tmps[i1];
2624019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2625019b515eSShri Abhyankar       }
2626019b515eSShri Abhyankar       if (j == nz - 1) {
2627019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2628019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2629019b515eSShri Abhyankar       }
26309371c9d4SSatish Balay       x[c[row]] = tmp[row] = sum1 * v1[nz];
26319371c9d4SSatish Balay       row--;
2632019b515eSShri Abhyankar       break;
2633019b515eSShri Abhyankar     case 2:
2634019b515eSShri Abhyankar       sum1 = tmp[row];
2635019b515eSShri Abhyankar       sum2 = tmp[row - 1];
2636019b515eSShri Abhyankar       v2   = aa + ad[row] + 1;
2637019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2638019b515eSShri Abhyankar         i0   = vi[j];
2639019b515eSShri Abhyankar         i1   = vi[j + 1];
2640019b515eSShri Abhyankar         tmp0 = tmps[i0];
2641019b515eSShri Abhyankar         tmp1 = tmps[i1];
2642019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2643019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2644019b515eSShri Abhyankar       }
2645019b515eSShri Abhyankar       if (j == nz - 1) {
2646019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2647019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2648019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0;
2649019b515eSShri Abhyankar       }
2650019b515eSShri Abhyankar 
26519371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
26529371c9d4SSatish Balay       row--;
2653019b515eSShri Abhyankar       sum2 -= v2[0] * tmp0;
26549371c9d4SSatish Balay       x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
26559371c9d4SSatish Balay       row--;
2656019b515eSShri Abhyankar       break;
2657019b515eSShri Abhyankar     case 3:
2658019b515eSShri Abhyankar       sum1 = tmp[row];
2659019b515eSShri Abhyankar       sum2 = tmp[row - 1];
2660019b515eSShri Abhyankar       sum3 = tmp[row - 2];
2661019b515eSShri Abhyankar       v2   = aa + ad[row] + 1;
2662019b515eSShri Abhyankar       v3   = aa + ad[row - 1] + 1;
2663019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2664019b515eSShri Abhyankar         i0   = vi[j];
2665019b515eSShri Abhyankar         i1   = vi[j + 1];
2666019b515eSShri Abhyankar         tmp0 = tmps[i0];
2667019b515eSShri Abhyankar         tmp1 = tmps[i1];
2668019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2669019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2670019b515eSShri Abhyankar         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2671019b515eSShri Abhyankar       }
2672019b515eSShri Abhyankar       if (j == nz - 1) {
2673019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2674019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2675019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0;
2676019b515eSShri Abhyankar         sum3 -= v3[j + 2] * tmp0;
2677019b515eSShri Abhyankar       }
26789371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
26799371c9d4SSatish Balay       row--;
2680019b515eSShri Abhyankar       sum2 -= v2[0] * tmp0;
2681019b515eSShri Abhyankar       sum3 -= v3[1] * tmp0;
26829371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
26839371c9d4SSatish Balay       row--;
2684019b515eSShri Abhyankar       sum3 -= v3[0] * tmp0;
26859371c9d4SSatish Balay       x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
26869371c9d4SSatish Balay       row--;
2687019b515eSShri Abhyankar 
2688019b515eSShri Abhyankar       break;
2689019b515eSShri Abhyankar     case 4:
2690019b515eSShri Abhyankar       sum1 = tmp[row];
2691019b515eSShri Abhyankar       sum2 = tmp[row - 1];
2692019b515eSShri Abhyankar       sum3 = tmp[row - 2];
2693019b515eSShri Abhyankar       sum4 = tmp[row - 3];
2694019b515eSShri Abhyankar       v2   = aa + ad[row] + 1;
2695019b515eSShri Abhyankar       v3   = aa + ad[row - 1] + 1;
2696019b515eSShri Abhyankar       v4   = aa + ad[row - 2] + 1;
2697019b515eSShri Abhyankar 
2698019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2699019b515eSShri Abhyankar         i0   = vi[j];
2700019b515eSShri Abhyankar         i1   = vi[j + 1];
2701019b515eSShri Abhyankar         tmp0 = tmps[i0];
2702019b515eSShri Abhyankar         tmp1 = tmps[i1];
2703019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2704019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2705019b515eSShri Abhyankar         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2706019b515eSShri Abhyankar         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2707019b515eSShri Abhyankar       }
2708019b515eSShri Abhyankar       if (j == nz - 1) {
2709019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2710019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2711019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0;
2712019b515eSShri Abhyankar         sum3 -= v3[j + 2] * tmp0;
2713019b515eSShri Abhyankar         sum4 -= v4[j + 3] * tmp0;
2714019b515eSShri Abhyankar       }
2715019b515eSShri Abhyankar 
27169371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
27179371c9d4SSatish Balay       row--;
2718019b515eSShri Abhyankar       sum2 -= v2[0] * tmp0;
2719019b515eSShri Abhyankar       sum3 -= v3[1] * tmp0;
2720019b515eSShri Abhyankar       sum4 -= v4[2] * tmp0;
27219371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
27229371c9d4SSatish Balay       row--;
2723019b515eSShri Abhyankar       sum3 -= v3[0] * tmp0;
2724019b515eSShri Abhyankar       sum4 -= v4[1] * tmp0;
27259371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
27269371c9d4SSatish Balay       row--;
2727019b515eSShri Abhyankar       sum4 -= v4[0] * tmp0;
27289371c9d4SSatish Balay       x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
27299371c9d4SSatish Balay       row--;
2730019b515eSShri Abhyankar       break;
2731019b515eSShri Abhyankar     case 5:
2732019b515eSShri Abhyankar       sum1 = tmp[row];
2733019b515eSShri Abhyankar       sum2 = tmp[row - 1];
2734019b515eSShri Abhyankar       sum3 = tmp[row - 2];
2735019b515eSShri Abhyankar       sum4 = tmp[row - 3];
2736019b515eSShri Abhyankar       sum5 = tmp[row - 4];
2737019b515eSShri Abhyankar       v2   = aa + ad[row] + 1;
2738019b515eSShri Abhyankar       v3   = aa + ad[row - 1] + 1;
2739019b515eSShri Abhyankar       v4   = aa + ad[row - 2] + 1;
2740019b515eSShri Abhyankar       v5   = aa + ad[row - 3] + 1;
2741019b515eSShri Abhyankar       for (j = 0; j < nz - 1; j += 2) {
2742019b515eSShri Abhyankar         i0   = vi[j];
2743019b515eSShri Abhyankar         i1   = vi[j + 1];
2744019b515eSShri Abhyankar         tmp0 = tmps[i0];
2745019b515eSShri Abhyankar         tmp1 = tmps[i1];
2746019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2747019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2748019b515eSShri Abhyankar         sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2749019b515eSShri Abhyankar         sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2750019b515eSShri Abhyankar         sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1;
2751019b515eSShri Abhyankar       }
2752019b515eSShri Abhyankar       if (j == nz - 1) {
2753019b515eSShri Abhyankar         tmp0 = tmps[vi[j]];
2754019b515eSShri Abhyankar         sum1 -= v1[j] * tmp0;
2755019b515eSShri Abhyankar         sum2 -= v2[j + 1] * tmp0;
2756019b515eSShri Abhyankar         sum3 -= v3[j + 2] * tmp0;
2757019b515eSShri Abhyankar         sum4 -= v4[j + 3] * tmp0;
2758019b515eSShri Abhyankar         sum5 -= v5[j + 4] * tmp0;
2759019b515eSShri Abhyankar       }
2760019b515eSShri Abhyankar 
27619371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
27629371c9d4SSatish Balay       row--;
2763019b515eSShri Abhyankar       sum2 -= v2[0] * tmp0;
2764019b515eSShri Abhyankar       sum3 -= v3[1] * tmp0;
2765019b515eSShri Abhyankar       sum4 -= v4[2] * tmp0;
2766019b515eSShri Abhyankar       sum5 -= v5[3] * tmp0;
27679371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
27689371c9d4SSatish Balay       row--;
2769019b515eSShri Abhyankar       sum3 -= v3[0] * tmp0;
2770019b515eSShri Abhyankar       sum4 -= v4[1] * tmp0;
2771019b515eSShri Abhyankar       sum5 -= v5[2] * tmp0;
27729371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
27739371c9d4SSatish Balay       row--;
2774019b515eSShri Abhyankar       sum4 -= v4[0] * tmp0;
2775019b515eSShri Abhyankar       sum5 -= v5[1] * tmp0;
27769371c9d4SSatish Balay       tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
27779371c9d4SSatish Balay       row--;
2778019b515eSShri Abhyankar       sum5 -= v5[0] * tmp0;
27799371c9d4SSatish Balay       x[c[row]] = tmp[row] = sum5 * v5[nz + 4];
27809371c9d4SSatish Balay       row--;
2781019b515eSShri Abhyankar       break;
2782d71ae5a4SJacob Faibussowitsch     default:
2783d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2784019b515eSShri Abhyankar     }
2785019b515eSShri Abhyankar   }
27869566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
27879566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
27889566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
27899566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayWrite(xx, &x));
27909566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
2791019b515eSShri Abhyankar   PetscFunctionReturn(0);
2792019b515eSShri Abhyankar }
2793019b515eSShri Abhyankar 
27944c1414c8SBarry Smith /*
27954c1414c8SBarry Smith      Makes a longer coloring[] array and calls the usual code with that
27964c1414c8SBarry Smith */
2797d71ae5a4SJacob Faibussowitsch PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring)
2798d71ae5a4SJacob Faibussowitsch {
27994c1414c8SBarry Smith   Mat_SeqAIJ      *a = (Mat_SeqAIJ *)mat->data;
2800d0f46423SBarry Smith   PetscInt         n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size, row;
28014c1414c8SBarry Smith   PetscInt        *colorused, i;
28024c1414c8SBarry Smith   ISColoringValue *newcolor;
28034c1414c8SBarry Smith 
28044c1414c8SBarry Smith   PetscFunctionBegin;
280508401ef6SPierre Jolivet   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
28069566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(n + 1, &newcolor));
28074c1414c8SBarry Smith   /* loop over inodes, marking a color for each column*/
28084c1414c8SBarry Smith   row = 0;
28094c1414c8SBarry Smith   for (i = 0; i < m; i++) {
2810ad540459SPierre Jolivet     for (j = 0; j < ns[i]; j++) newcolor[row++] = coloring[i] + j * ncolors;
28114c1414c8SBarry Smith   }
28124c1414c8SBarry Smith 
28134c1414c8SBarry Smith   /* eliminate unneeded colors */
28149566063dSJacob Faibussowitsch   PetscCall(PetscCalloc1(5 * ncolors, &colorused));
2815ad540459SPierre Jolivet   for (i = 0; i < n; i++) colorused[newcolor[i]] = 1;
28164c1414c8SBarry Smith 
2817ad540459SPierre Jolivet   for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1];
28184c1414c8SBarry Smith   ncolors = colorused[5 * ncolors - 1];
2819ad540459SPierre Jolivet   for (i = 0; i < n; i++) newcolor[i] = colorused[newcolor[i]] - 1;
28209566063dSJacob Faibussowitsch   PetscCall(PetscFree(colorused));
28219566063dSJacob Faibussowitsch   PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring));
28229566063dSJacob Faibussowitsch   PetscCall(PetscFree(coloring));
28234c1414c8SBarry Smith   PetscFunctionReturn(0);
28244c1414c8SBarry Smith }
28254c1414c8SBarry Smith 
2826af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h>
28272af78befSBarry Smith 
2828d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
2829d71ae5a4SJacob Faibussowitsch {
28302af78befSBarry Smith   Mat_SeqAIJ        *a    = (Mat_SeqAIJ *)A->data;
28317aaeff0aSMatthew G. Knepley   PetscScalar        sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3;
28325850ef23SBarry Smith   MatScalar         *ibdiag, *bdiag, work[25], *t;
2833a8b09249SBarry Smith   PetscScalar       *x, tmp4, tmp5, x1, x2, x3, x4, x5;
28347aaeff0aSMatthew G. Knepley   const MatScalar   *v = a->a, *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL;
28355850ef23SBarry Smith   const PetscScalar *xb, *b;
28367b6c816cSBarry Smith   PetscReal          zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0;
28378758e1faSBarry Smith   PetscInt           n, m = a->inode.node_count, cnt = 0, i, j, row, i1, i2;
28388758e1faSBarry Smith   PetscInt           sz, k, ipvt[5];
28397b6c816cSBarry Smith   PetscBool          allowzeropivot, zeropivotdetected;
28408758e1faSBarry Smith   const PetscInt    *sizes = a->inode.size, *idx, *diag = a->diag, *ii = a->i;
28412af78befSBarry Smith 
28422af78befSBarry Smith   PetscFunctionBegin;
2843a455e926SHong Zhang   allowzeropivot = PetscNot(A->erroriffailure);
284408401ef6SPierre Jolivet   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
284508401ef6SPierre Jolivet   PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode");
284608401ef6SPierre Jolivet   PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode");
28472af78befSBarry Smith 
284871f1c65dSBarry Smith   if (!a->inode.ibdiagvalid) {
28492af78befSBarry Smith     if (!a->inode.ibdiag) {
28502af78befSBarry Smith       /* calculate space needed for diagonal blocks */
2851ad540459SPierre Jolivet       for (i = 0; i < m; i++) cnt += sizes[i] * sizes[i];
2852f0d39aaaSBarry Smith       a->inode.bdiagsize = cnt;
28532205254eSKarl Rupp 
28549566063dSJacob Faibussowitsch       PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work));
285571f1c65dSBarry Smith     }
285671f1c65dSBarry Smith 
285771f1c65dSBarry Smith     /* copy over the diagonal blocks and invert them */
28582af78befSBarry Smith     ibdiag = a->inode.ibdiag;
28592af78befSBarry Smith     bdiag  = a->inode.bdiag;
28602af78befSBarry Smith     cnt    = 0;
28612af78befSBarry Smith     for (i = 0, row = 0; i < m; i++) {
28622af78befSBarry Smith       for (j = 0; j < sizes[i]; j++) {
2863ad540459SPierre Jolivet         for (k = 0; k < sizes[i]; k++) bdiag[cnt + k * sizes[i] + j] = v[diag[row + j] - j + k];
28642af78befSBarry Smith       }
28659566063dSJacob Faibussowitsch       PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, sizes[i] * sizes[i]));
28662af78befSBarry Smith 
28672af78befSBarry Smith       switch (sizes[i]) {
28682af78befSBarry Smith       case 1:
28692af78befSBarry Smith         /* Create matrix data structure */
28708e0e2a9aSHong Zhang         if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
28718e0e2a9aSHong Zhang           if (allowzeropivot) {
28727b6c816cSBarry Smith             A->factorerrortype             = MAT_FACTOR_NUMERIC_ZEROPIVOT;
28737b6c816cSBarry Smith             A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
28747b6c816cSBarry Smith             A->factorerror_zeropivot_row   = row;
28759566063dSJacob Faibussowitsch             PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row));
287698921bdaSJacob Faibussowitsch           } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row);
28778e0e2a9aSHong Zhang         }
287864c62002SMatthew Knepley         ibdiag[cnt] = 1.0 / ibdiag[cnt];
28792af78befSBarry Smith         break;
28802af78befSBarry Smith       case 2:
28819566063dSJacob Faibussowitsch         PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
28827b6c816cSBarry Smith         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
28832af78befSBarry Smith         break;
28842af78befSBarry Smith       case 3:
28859566063dSJacob Faibussowitsch         PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
28867b6c816cSBarry Smith         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
28872af78befSBarry Smith         break;
28882af78befSBarry Smith       case 4:
28899566063dSJacob Faibussowitsch         PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
28907b6c816cSBarry Smith         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
28912af78befSBarry Smith         break;
28922af78befSBarry Smith       case 5:
28939566063dSJacob Faibussowitsch         PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
28947b6c816cSBarry Smith         if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
28952af78befSBarry Smith         break;
2896d71ae5a4SJacob Faibussowitsch       default:
2897d71ae5a4SJacob Faibussowitsch         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
28982af78befSBarry Smith       }
28992af78befSBarry Smith       cnt += sizes[i] * sizes[i];
29002af78befSBarry Smith       row += sizes[i];
29012af78befSBarry Smith     }
290271f1c65dSBarry Smith     a->inode.ibdiagvalid = PETSC_TRUE;
29032af78befSBarry Smith   }
29042af78befSBarry Smith   ibdiag = a->inode.ibdiag;
29052af78befSBarry Smith   bdiag  = a->inode.bdiag;
29065850ef23SBarry Smith   t      = a->inode.ssor_work;
29072af78befSBarry Smith 
29089566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
29099566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
29105850ef23SBarry Smith   /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
29115850ef23SBarry Smith   if (flag & SOR_ZERO_INITIAL_GUESS) {
29122af78befSBarry Smith     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
29138862d2efSBarry Smith       for (i = 0, row = 0; i < m; i++) {
29148862d2efSBarry Smith         sz  = diag[row] - ii[row];
29158862d2efSBarry Smith         v1  = a->a + ii[row];
29168862d2efSBarry Smith         idx = a->j + ii[row];
29178862d2efSBarry Smith 
29184108e4d5SBarry Smith         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
29198862d2efSBarry Smith         switch (sizes[i]) {
29208862d2efSBarry Smith         case 1:
29218862d2efSBarry Smith 
29228862d2efSBarry Smith           sum1 = b[row];
29238862d2efSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
29248862d2efSBarry Smith             i1 = idx[0];
29258862d2efSBarry Smith             i2 = idx[1];
29268862d2efSBarry Smith             idx += 2;
29278862d2efSBarry Smith             tmp0 = x[i1];
29288862d2efSBarry Smith             tmp1 = x[i2];
29299371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
29309371c9d4SSatish Balay             v1 += 2;
29318862d2efSBarry Smith           }
29328862d2efSBarry Smith 
29338862d2efSBarry Smith           if (n == sz - 1) {
2934f0d39aaaSBarry Smith             tmp0 = x[*idx];
2935f0d39aaaSBarry Smith             sum1 -= *v1 * tmp0;
29368862d2efSBarry Smith           }
29375850ef23SBarry Smith           t[row]   = sum1;
29388862d2efSBarry Smith           x[row++] = sum1 * (*ibdiag++);
29398862d2efSBarry Smith           break;
2940f0d39aaaSBarry Smith         case 2:
2941f0d39aaaSBarry Smith           v2   = a->a + ii[row + 1];
2942f0d39aaaSBarry Smith           sum1 = b[row];
2943f0d39aaaSBarry Smith           sum2 = b[row + 1];
2944f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
2945f0d39aaaSBarry Smith             i1 = idx[0];
2946f0d39aaaSBarry Smith             i2 = idx[1];
2947f0d39aaaSBarry Smith             idx += 2;
2948f0d39aaaSBarry Smith             tmp0 = x[i1];
2949f0d39aaaSBarry Smith             tmp1 = x[i2];
29509371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
29519371c9d4SSatish Balay             v1 += 2;
29529371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
29539371c9d4SSatish Balay             v2 += 2;
2954f0d39aaaSBarry Smith           }
2955f0d39aaaSBarry Smith 
2956f0d39aaaSBarry Smith           if (n == sz - 1) {
2957f0d39aaaSBarry Smith             tmp0 = x[*idx];
2958f0d39aaaSBarry Smith             sum1 -= v1[0] * tmp0;
2959f0d39aaaSBarry Smith             sum2 -= v2[0] * tmp0;
2960f0d39aaaSBarry Smith           }
29615850ef23SBarry Smith           t[row]     = sum1;
29625850ef23SBarry Smith           t[row + 1] = sum2;
2963f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[2];
2964f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[3];
2965f0d39aaaSBarry Smith           ibdiag += 4;
2966f0d39aaaSBarry Smith           break;
2967f0d39aaaSBarry Smith         case 3:
2968f0d39aaaSBarry Smith           v2   = a->a + ii[row + 1];
2969f0d39aaaSBarry Smith           v3   = a->a + ii[row + 2];
2970f0d39aaaSBarry Smith           sum1 = b[row];
2971f0d39aaaSBarry Smith           sum2 = b[row + 1];
2972f0d39aaaSBarry Smith           sum3 = b[row + 2];
2973f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
2974f0d39aaaSBarry Smith             i1 = idx[0];
2975f0d39aaaSBarry Smith             i2 = idx[1];
2976f0d39aaaSBarry Smith             idx += 2;
2977f0d39aaaSBarry Smith             tmp0 = x[i1];
2978f0d39aaaSBarry Smith             tmp1 = x[i2];
29799371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
29809371c9d4SSatish Balay             v1 += 2;
29819371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
29829371c9d4SSatish Balay             v2 += 2;
29839371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
29849371c9d4SSatish Balay             v3 += 2;
2985f0d39aaaSBarry Smith           }
2986f0d39aaaSBarry Smith 
2987f0d39aaaSBarry Smith           if (n == sz - 1) {
2988f0d39aaaSBarry Smith             tmp0 = x[*idx];
2989f0d39aaaSBarry Smith             sum1 -= v1[0] * tmp0;
2990f0d39aaaSBarry Smith             sum2 -= v2[0] * tmp0;
2991f0d39aaaSBarry Smith             sum3 -= v3[0] * tmp0;
2992f0d39aaaSBarry Smith           }
29935850ef23SBarry Smith           t[row]     = sum1;
29945850ef23SBarry Smith           t[row + 1] = sum2;
29955850ef23SBarry Smith           t[row + 2] = sum3;
2996f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
2997f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
2998f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
2999f0d39aaaSBarry Smith           ibdiag += 9;
3000f0d39aaaSBarry Smith           break;
3001f0d39aaaSBarry Smith         case 4:
3002f0d39aaaSBarry Smith           v2   = a->a + ii[row + 1];
3003f0d39aaaSBarry Smith           v3   = a->a + ii[row + 2];
3004f0d39aaaSBarry Smith           v4   = a->a + ii[row + 3];
3005f0d39aaaSBarry Smith           sum1 = b[row];
3006f0d39aaaSBarry Smith           sum2 = b[row + 1];
3007f0d39aaaSBarry Smith           sum3 = b[row + 2];
3008f0d39aaaSBarry Smith           sum4 = b[row + 3];
3009f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
3010f0d39aaaSBarry Smith             i1 = idx[0];
3011f0d39aaaSBarry Smith             i2 = idx[1];
3012f0d39aaaSBarry Smith             idx += 2;
3013f0d39aaaSBarry Smith             tmp0 = x[i1];
3014f0d39aaaSBarry Smith             tmp1 = x[i2];
30159371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
30169371c9d4SSatish Balay             v1 += 2;
30179371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
30189371c9d4SSatish Balay             v2 += 2;
30199371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
30209371c9d4SSatish Balay             v3 += 2;
30219371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
30229371c9d4SSatish Balay             v4 += 2;
3023f0d39aaaSBarry Smith           }
3024f0d39aaaSBarry Smith 
3025f0d39aaaSBarry Smith           if (n == sz - 1) {
3026f0d39aaaSBarry Smith             tmp0 = x[*idx];
3027f0d39aaaSBarry Smith             sum1 -= v1[0] * tmp0;
3028f0d39aaaSBarry Smith             sum2 -= v2[0] * tmp0;
3029f0d39aaaSBarry Smith             sum3 -= v3[0] * tmp0;
3030f0d39aaaSBarry Smith             sum4 -= v4[0] * tmp0;
3031f0d39aaaSBarry Smith           }
30325850ef23SBarry Smith           t[row]     = sum1;
30335850ef23SBarry Smith           t[row + 1] = sum2;
30345850ef23SBarry Smith           t[row + 2] = sum3;
30355850ef23SBarry Smith           t[row + 3] = sum4;
3036f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3037f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3038f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3039f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
3040f0d39aaaSBarry Smith           ibdiag += 16;
3041f0d39aaaSBarry Smith           break;
3042f0d39aaaSBarry Smith         case 5:
3043f0d39aaaSBarry Smith           v2   = a->a + ii[row + 1];
3044f0d39aaaSBarry Smith           v3   = a->a + ii[row + 2];
3045f0d39aaaSBarry Smith           v4   = a->a + ii[row + 3];
3046f0d39aaaSBarry Smith           v5   = a->a + ii[row + 4];
3047f0d39aaaSBarry Smith           sum1 = b[row];
3048f0d39aaaSBarry Smith           sum2 = b[row + 1];
3049f0d39aaaSBarry Smith           sum3 = b[row + 2];
3050f0d39aaaSBarry Smith           sum4 = b[row + 3];
3051f0d39aaaSBarry Smith           sum5 = b[row + 4];
3052f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
3053f0d39aaaSBarry Smith             i1 = idx[0];
3054f0d39aaaSBarry Smith             i2 = idx[1];
3055f0d39aaaSBarry Smith             idx += 2;
3056f0d39aaaSBarry Smith             tmp0 = x[i1];
3057f0d39aaaSBarry Smith             tmp1 = x[i2];
30589371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
30599371c9d4SSatish Balay             v1 += 2;
30609371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
30619371c9d4SSatish Balay             v2 += 2;
30629371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
30639371c9d4SSatish Balay             v3 += 2;
30649371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
30659371c9d4SSatish Balay             v4 += 2;
30669371c9d4SSatish Balay             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
30679371c9d4SSatish Balay             v5 += 2;
3068f0d39aaaSBarry Smith           }
3069f0d39aaaSBarry Smith 
3070f0d39aaaSBarry Smith           if (n == sz - 1) {
3071f0d39aaaSBarry Smith             tmp0 = x[*idx];
3072f0d39aaaSBarry Smith             sum1 -= v1[0] * tmp0;
3073f0d39aaaSBarry Smith             sum2 -= v2[0] * tmp0;
3074f0d39aaaSBarry Smith             sum3 -= v3[0] * tmp0;
3075f0d39aaaSBarry Smith             sum4 -= v4[0] * tmp0;
3076f0d39aaaSBarry Smith             sum5 -= v5[0] * tmp0;
3077f0d39aaaSBarry Smith           }
30785850ef23SBarry Smith           t[row]     = sum1;
30795850ef23SBarry Smith           t[row + 1] = sum2;
30805850ef23SBarry Smith           t[row + 2] = sum3;
30815850ef23SBarry Smith           t[row + 3] = sum4;
30825850ef23SBarry Smith           t[row + 4] = sum5;
3083f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3084f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3085f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3086f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3087f0d39aaaSBarry Smith           x[row++]   = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3088f0d39aaaSBarry Smith           ibdiag += 25;
3089f0d39aaaSBarry Smith           break;
3090d71ae5a4SJacob Faibussowitsch         default:
3091d71ae5a4SJacob Faibussowitsch           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
30928862d2efSBarry Smith         }
30932af78befSBarry Smith       }
30942af78befSBarry Smith 
30955850ef23SBarry Smith       xb = t;
30969566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(a->nz));
30972af78befSBarry Smith     } else xb = b;
30982af78befSBarry Smith     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3099f0d39aaaSBarry Smith       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3100d0f46423SBarry Smith       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3101f0d39aaaSBarry Smith         ibdiag -= sizes[i] * sizes[i];
31028862d2efSBarry Smith         sz  = ii[row + 1] - diag[row] - 1;
31038862d2efSBarry Smith         v1  = a->a + diag[row] + 1;
31048862d2efSBarry Smith         idx = a->j + diag[row] + 1;
31052af78befSBarry Smith 
31064108e4d5SBarry Smith         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
31078862d2efSBarry Smith         switch (sizes[i]) {
31088862d2efSBarry Smith         case 1:
31098862d2efSBarry Smith 
31108862d2efSBarry Smith           sum1 = xb[row];
31118862d2efSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
31128862d2efSBarry Smith             i1 = idx[0];
31138862d2efSBarry Smith             i2 = idx[1];
31148862d2efSBarry Smith             idx += 2;
31158862d2efSBarry Smith             tmp0 = x[i1];
31168862d2efSBarry Smith             tmp1 = x[i2];
31179371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
31189371c9d4SSatish Balay             v1 += 2;
31198862d2efSBarry Smith           }
31208862d2efSBarry Smith 
31218862d2efSBarry Smith           if (n == sz - 1) {
3122f0d39aaaSBarry Smith             tmp0 = x[*idx];
3123f0d39aaaSBarry Smith             sum1 -= *v1 * tmp0;
31248862d2efSBarry Smith           }
3125f0d39aaaSBarry Smith           x[row--] = sum1 * (*ibdiag);
3126f0d39aaaSBarry Smith           break;
3127f0d39aaaSBarry Smith 
3128f0d39aaaSBarry Smith         case 2:
3129f0d39aaaSBarry Smith 
3130f0d39aaaSBarry Smith           sum1 = xb[row];
3131f0d39aaaSBarry Smith           sum2 = xb[row - 1];
3132f0d39aaaSBarry Smith           /* note that sum1 is associated with the second of the two rows */
3133f0d39aaaSBarry Smith           v2 = a->a + diag[row - 1] + 2;
3134f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
3135f0d39aaaSBarry Smith             i1 = idx[0];
3136f0d39aaaSBarry Smith             i2 = idx[1];
3137f0d39aaaSBarry Smith             idx += 2;
3138f0d39aaaSBarry Smith             tmp0 = x[i1];
3139f0d39aaaSBarry Smith             tmp1 = x[i2];
31409371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
31419371c9d4SSatish Balay             v1 += 2;
31429371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
31439371c9d4SSatish Balay             v2 += 2;
3144f0d39aaaSBarry Smith           }
3145f0d39aaaSBarry Smith 
3146f0d39aaaSBarry Smith           if (n == sz - 1) {
3147f0d39aaaSBarry Smith             tmp0 = x[*idx];
3148f0d39aaaSBarry Smith             sum1 -= *v1 * tmp0;
3149f0d39aaaSBarry Smith             sum2 -= *v2 * tmp0;
3150f0d39aaaSBarry Smith           }
3151f0d39aaaSBarry Smith           x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3152f0d39aaaSBarry Smith           x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3153f0d39aaaSBarry Smith           break;
3154f0d39aaaSBarry Smith         case 3:
3155f0d39aaaSBarry Smith 
3156f0d39aaaSBarry Smith           sum1 = xb[row];
3157f0d39aaaSBarry Smith           sum2 = xb[row - 1];
3158f0d39aaaSBarry Smith           sum3 = xb[row - 2];
3159f0d39aaaSBarry Smith           v2   = a->a + diag[row - 1] + 2;
3160f0d39aaaSBarry Smith           v3   = a->a + diag[row - 2] + 3;
3161f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
3162f0d39aaaSBarry Smith             i1 = idx[0];
3163f0d39aaaSBarry Smith             i2 = idx[1];
3164f0d39aaaSBarry Smith             idx += 2;
3165f0d39aaaSBarry Smith             tmp0 = x[i1];
3166f0d39aaaSBarry Smith             tmp1 = x[i2];
31679371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
31689371c9d4SSatish Balay             v1 += 2;
31699371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
31709371c9d4SSatish Balay             v2 += 2;
31719371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
31729371c9d4SSatish Balay             v3 += 2;
3173f0d39aaaSBarry Smith           }
3174f0d39aaaSBarry Smith 
3175f0d39aaaSBarry Smith           if (n == sz - 1) {
3176f0d39aaaSBarry Smith             tmp0 = x[*idx];
3177f0d39aaaSBarry Smith             sum1 -= *v1 * tmp0;
3178f0d39aaaSBarry Smith             sum2 -= *v2 * tmp0;
3179f0d39aaaSBarry Smith             sum3 -= *v3 * tmp0;
3180f0d39aaaSBarry Smith           }
3181f0d39aaaSBarry Smith           x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3182f0d39aaaSBarry Smith           x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3183f0d39aaaSBarry Smith           x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3184f0d39aaaSBarry Smith           break;
3185f0d39aaaSBarry Smith         case 4:
3186f0d39aaaSBarry Smith 
3187f0d39aaaSBarry Smith           sum1 = xb[row];
3188f0d39aaaSBarry Smith           sum2 = xb[row - 1];
3189f0d39aaaSBarry Smith           sum3 = xb[row - 2];
3190f0d39aaaSBarry Smith           sum4 = xb[row - 3];
3191f0d39aaaSBarry Smith           v2   = a->a + diag[row - 1] + 2;
3192f0d39aaaSBarry Smith           v3   = a->a + diag[row - 2] + 3;
3193f0d39aaaSBarry Smith           v4   = a->a + diag[row - 3] + 4;
3194f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
3195f0d39aaaSBarry Smith             i1 = idx[0];
3196f0d39aaaSBarry Smith             i2 = idx[1];
3197f0d39aaaSBarry Smith             idx += 2;
3198f0d39aaaSBarry Smith             tmp0 = x[i1];
3199f0d39aaaSBarry Smith             tmp1 = x[i2];
32009371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
32019371c9d4SSatish Balay             v1 += 2;
32029371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
32039371c9d4SSatish Balay             v2 += 2;
32049371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
32059371c9d4SSatish Balay             v3 += 2;
32069371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
32079371c9d4SSatish Balay             v4 += 2;
3208f0d39aaaSBarry Smith           }
3209f0d39aaaSBarry Smith 
3210f0d39aaaSBarry Smith           if (n == sz - 1) {
3211f0d39aaaSBarry Smith             tmp0 = x[*idx];
3212f0d39aaaSBarry Smith             sum1 -= *v1 * tmp0;
3213f0d39aaaSBarry Smith             sum2 -= *v2 * tmp0;
3214f0d39aaaSBarry Smith             sum3 -= *v3 * tmp0;
3215f0d39aaaSBarry Smith             sum4 -= *v4 * tmp0;
3216f0d39aaaSBarry Smith           }
3217f0d39aaaSBarry Smith           x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3218f0d39aaaSBarry Smith           x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3219f0d39aaaSBarry Smith           x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3220f0d39aaaSBarry Smith           x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3221f0d39aaaSBarry Smith           break;
3222f0d39aaaSBarry Smith         case 5:
3223f0d39aaaSBarry Smith 
3224f0d39aaaSBarry Smith           sum1 = xb[row];
3225f0d39aaaSBarry Smith           sum2 = xb[row - 1];
3226f0d39aaaSBarry Smith           sum3 = xb[row - 2];
3227f0d39aaaSBarry Smith           sum4 = xb[row - 3];
3228f0d39aaaSBarry Smith           sum5 = xb[row - 4];
3229f0d39aaaSBarry Smith           v2   = a->a + diag[row - 1] + 2;
3230f0d39aaaSBarry Smith           v3   = a->a + diag[row - 2] + 3;
3231f0d39aaaSBarry Smith           v4   = a->a + diag[row - 3] + 4;
3232f0d39aaaSBarry Smith           v5   = a->a + diag[row - 4] + 5;
3233f0d39aaaSBarry Smith           for (n = 0; n < sz - 1; n += 2) {
3234f0d39aaaSBarry Smith             i1 = idx[0];
3235f0d39aaaSBarry Smith             i2 = idx[1];
3236f0d39aaaSBarry Smith             idx += 2;
3237f0d39aaaSBarry Smith             tmp0 = x[i1];
3238f0d39aaaSBarry Smith             tmp1 = x[i2];
32399371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
32409371c9d4SSatish Balay             v1 += 2;
32419371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
32429371c9d4SSatish Balay             v2 += 2;
32439371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
32449371c9d4SSatish Balay             v3 += 2;
32459371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
32469371c9d4SSatish Balay             v4 += 2;
32479371c9d4SSatish Balay             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
32489371c9d4SSatish Balay             v5 += 2;
3249f0d39aaaSBarry Smith           }
3250f0d39aaaSBarry Smith 
3251f0d39aaaSBarry Smith           if (n == sz - 1) {
3252f0d39aaaSBarry Smith             tmp0 = x[*idx];
3253f0d39aaaSBarry Smith             sum1 -= *v1 * tmp0;
3254f0d39aaaSBarry Smith             sum2 -= *v2 * tmp0;
3255f0d39aaaSBarry Smith             sum3 -= *v3 * tmp0;
3256f0d39aaaSBarry Smith             sum4 -= *v4 * tmp0;
3257f0d39aaaSBarry Smith             sum5 -= *v5 * tmp0;
3258f0d39aaaSBarry Smith           }
3259f0d39aaaSBarry Smith           x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3260f0d39aaaSBarry Smith           x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3261f0d39aaaSBarry Smith           x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3262f0d39aaaSBarry Smith           x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3263f0d39aaaSBarry Smith           x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
32648862d2efSBarry Smith           break;
3265d71ae5a4SJacob Faibussowitsch         default:
3266d71ae5a4SJacob Faibussowitsch           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
32678862d2efSBarry Smith         }
32682af78befSBarry Smith       }
32692af78befSBarry Smith 
32709566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(a->nz));
32712af78befSBarry Smith     }
32722af78befSBarry Smith     its--;
32735850ef23SBarry Smith   }
32745850ef23SBarry Smith   while (its--) {
32755850ef23SBarry Smith     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
32769371c9d4SSatish Balay       for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += sizes[i], ibdiag += sizes[i] * sizes[i], i++) {
3277d876e2b0SMark Adams         sz  = diag[row] - ii[row];
32785850ef23SBarry Smith         v1  = a->a + ii[row];
32795850ef23SBarry Smith         idx = a->j + ii[row];
32805850ef23SBarry Smith         /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
32815850ef23SBarry Smith         switch (sizes[i]) {
32825850ef23SBarry Smith         case 1:
32835850ef23SBarry Smith           sum1 = b[row];
32845850ef23SBarry Smith           for (n = 0; n < sz - 1; n += 2) {
32855850ef23SBarry Smith             i1 = idx[0];
32865850ef23SBarry Smith             i2 = idx[1];
32875850ef23SBarry Smith             idx += 2;
32885850ef23SBarry Smith             tmp0 = x[i1];
32895850ef23SBarry Smith             tmp1 = x[i2];
32909371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
32919371c9d4SSatish Balay             v1 += 2;
32925850ef23SBarry Smith           }
32935850ef23SBarry Smith           if (n == sz - 1) {
3294d876e2b0SMark Adams             tmp0 = x[*idx++];
3295d876e2b0SMark Adams             sum1 -= *v1 * tmp0;
3296d876e2b0SMark Adams             v1++;
3297d876e2b0SMark Adams           }
3298d876e2b0SMark Adams           t[row] = sum1;
3299d876e2b0SMark Adams           sz     = ii[row + 1] - diag[row] - 1;
3300d876e2b0SMark Adams           idx    = a->j + diag[row] + 1;
3301d876e2b0SMark Adams           v1 += 1;
3302d876e2b0SMark Adams           for (n = 0; n < sz - 1; n += 2) {
3303d876e2b0SMark Adams             i1 = idx[0];
3304d876e2b0SMark Adams             i2 = idx[1];
3305d876e2b0SMark Adams             idx += 2;
3306d876e2b0SMark Adams             tmp0 = x[i1];
3307d876e2b0SMark Adams             tmp1 = x[i2];
33089371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
33099371c9d4SSatish Balay             v1 += 2;
3310d876e2b0SMark Adams           }
3311d876e2b0SMark Adams           if (n == sz - 1) {
3312d876e2b0SMark Adams             tmp0 = x[*idx++];
33135850ef23SBarry Smith             sum1 -= *v1 * tmp0;
33145850ef23SBarry Smith           }
33155850ef23SBarry Smith           /* in MatSOR_SeqAIJ this line would be
33165850ef23SBarry Smith            *
33175850ef23SBarry Smith            * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
33185850ef23SBarry Smith            *
33195850ef23SBarry Smith            * but omega == 1, so this becomes
33205850ef23SBarry Smith            *
3321d876e2b0SMark Adams            * x[row] = sum1*(*ibdiag++);
33225850ef23SBarry Smith            *
33235850ef23SBarry Smith            */
3324d876e2b0SMark Adams           x[row] = sum1 * (*ibdiag);
33255850ef23SBarry Smith           break;
33265850ef23SBarry Smith         case 2:
33275850ef23SBarry Smith           v2   = a->a + ii[row + 1];
33285850ef23SBarry Smith           sum1 = b[row];
33295850ef23SBarry Smith           sum2 = b[row + 1];
33305850ef23SBarry Smith           for (n = 0; n < sz - 1; n += 2) {
33315850ef23SBarry Smith             i1 = idx[0];
33325850ef23SBarry Smith             i2 = idx[1];
33335850ef23SBarry Smith             idx += 2;
33345850ef23SBarry Smith             tmp0 = x[i1];
33355850ef23SBarry Smith             tmp1 = x[i2];
33369371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
33379371c9d4SSatish Balay             v1 += 2;
33389371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
33399371c9d4SSatish Balay             v2 += 2;
33405850ef23SBarry Smith           }
3341d876e2b0SMark Adams           if (n == sz - 1) {
3342d876e2b0SMark Adams             tmp0 = x[*idx++];
3343d876e2b0SMark Adams             sum1 -= v1[0] * tmp0;
3344d876e2b0SMark Adams             sum2 -= v2[0] * tmp0;
33459371c9d4SSatish Balay             v1++;
33469371c9d4SSatish Balay             v2++;
3347d876e2b0SMark Adams           }
3348d876e2b0SMark Adams           t[row]     = sum1;
3349d876e2b0SMark Adams           t[row + 1] = sum2;
3350d876e2b0SMark Adams           sz         = ii[row + 1] - diag[row] - 2;
3351d876e2b0SMark Adams           idx        = a->j + diag[row] + 2;
3352d876e2b0SMark Adams           v1 += 2;
3353d876e2b0SMark Adams           v2 += 2;
3354d876e2b0SMark Adams           for (n = 0; n < sz - 1; n += 2) {
3355d876e2b0SMark Adams             i1 = idx[0];
3356d876e2b0SMark Adams             i2 = idx[1];
3357d876e2b0SMark Adams             idx += 2;
3358d876e2b0SMark Adams             tmp0 = x[i1];
3359d876e2b0SMark Adams             tmp1 = x[i2];
33609371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
33619371c9d4SSatish Balay             v1 += 2;
33629371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
33639371c9d4SSatish Balay             v2 += 2;
3364d876e2b0SMark Adams           }
33655850ef23SBarry Smith           if (n == sz - 1) {
33665850ef23SBarry Smith             tmp0 = x[*idx];
33675850ef23SBarry Smith             sum1 -= v1[0] * tmp0;
33685850ef23SBarry Smith             sum2 -= v2[0] * tmp0;
33695850ef23SBarry Smith           }
3370d876e2b0SMark Adams           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[2];
3371d876e2b0SMark Adams           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
33725850ef23SBarry Smith           break;
33735850ef23SBarry Smith         case 3:
33745850ef23SBarry Smith           v2   = a->a + ii[row + 1];
33755850ef23SBarry Smith           v3   = a->a + ii[row + 2];
33765850ef23SBarry Smith           sum1 = b[row];
33775850ef23SBarry Smith           sum2 = b[row + 1];
33785850ef23SBarry Smith           sum3 = b[row + 2];
33795850ef23SBarry Smith           for (n = 0; n < sz - 1; n += 2) {
33805850ef23SBarry Smith             i1 = idx[0];
33815850ef23SBarry Smith             i2 = idx[1];
33825850ef23SBarry Smith             idx += 2;
33835850ef23SBarry Smith             tmp0 = x[i1];
33845850ef23SBarry Smith             tmp1 = x[i2];
33859371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
33869371c9d4SSatish Balay             v1 += 2;
33879371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
33889371c9d4SSatish Balay             v2 += 2;
33899371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
33909371c9d4SSatish Balay             v3 += 2;
33915850ef23SBarry Smith           }
3392d876e2b0SMark Adams           if (n == sz - 1) {
3393d876e2b0SMark Adams             tmp0 = x[*idx++];
3394d876e2b0SMark Adams             sum1 -= v1[0] * tmp0;
3395d876e2b0SMark Adams             sum2 -= v2[0] * tmp0;
3396d876e2b0SMark Adams             sum3 -= v3[0] * tmp0;
33979371c9d4SSatish Balay             v1++;
33989371c9d4SSatish Balay             v2++;
33999371c9d4SSatish Balay             v3++;
3400d876e2b0SMark Adams           }
3401d876e2b0SMark Adams           t[row]     = sum1;
3402d876e2b0SMark Adams           t[row + 1] = sum2;
3403d876e2b0SMark Adams           t[row + 2] = sum3;
3404d876e2b0SMark Adams           sz         = ii[row + 1] - diag[row] - 3;
3405d876e2b0SMark Adams           idx        = a->j + diag[row] + 3;
3406d876e2b0SMark Adams           v1 += 3;
3407d876e2b0SMark Adams           v2 += 3;
3408d876e2b0SMark Adams           v3 += 3;
3409d876e2b0SMark Adams           for (n = 0; n < sz - 1; n += 2) {
3410d876e2b0SMark Adams             i1 = idx[0];
3411d876e2b0SMark Adams             i2 = idx[1];
3412d876e2b0SMark Adams             idx += 2;
3413d876e2b0SMark Adams             tmp0 = x[i1];
3414d876e2b0SMark Adams             tmp1 = x[i2];
34159371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
34169371c9d4SSatish Balay             v1 += 2;
34179371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
34189371c9d4SSatish Balay             v2 += 2;
34199371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
34209371c9d4SSatish Balay             v3 += 2;
3421d876e2b0SMark Adams           }
34225850ef23SBarry Smith           if (n == sz - 1) {
34235850ef23SBarry Smith             tmp0 = x[*idx];
34245850ef23SBarry Smith             sum1 -= v1[0] * tmp0;
34255850ef23SBarry Smith             sum2 -= v2[0] * tmp0;
34265850ef23SBarry Smith             sum3 -= v3[0] * tmp0;
34275850ef23SBarry Smith           }
3428d876e2b0SMark Adams           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3429d876e2b0SMark Adams           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3430d876e2b0SMark Adams           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
34315850ef23SBarry Smith           break;
34325850ef23SBarry Smith         case 4:
34335850ef23SBarry Smith           v2   = a->a + ii[row + 1];
34345850ef23SBarry Smith           v3   = a->a + ii[row + 2];
34355850ef23SBarry Smith           v4   = a->a + ii[row + 3];
34365850ef23SBarry Smith           sum1 = b[row];
34375850ef23SBarry Smith           sum2 = b[row + 1];
34385850ef23SBarry Smith           sum3 = b[row + 2];
34395850ef23SBarry Smith           sum4 = b[row + 3];
34405850ef23SBarry Smith           for (n = 0; n < sz - 1; n += 2) {
34415850ef23SBarry Smith             i1 = idx[0];
34425850ef23SBarry Smith             i2 = idx[1];
34435850ef23SBarry Smith             idx += 2;
34445850ef23SBarry Smith             tmp0 = x[i1];
34455850ef23SBarry Smith             tmp1 = x[i2];
34469371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
34479371c9d4SSatish Balay             v1 += 2;
34489371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
34499371c9d4SSatish Balay             v2 += 2;
34509371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
34519371c9d4SSatish Balay             v3 += 2;
34529371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
34539371c9d4SSatish Balay             v4 += 2;
34545850ef23SBarry Smith           }
3455d876e2b0SMark Adams           if (n == sz - 1) {
3456d876e2b0SMark Adams             tmp0 = x[*idx++];
3457d876e2b0SMark Adams             sum1 -= v1[0] * tmp0;
3458d876e2b0SMark Adams             sum2 -= v2[0] * tmp0;
3459d876e2b0SMark Adams             sum3 -= v3[0] * tmp0;
3460d876e2b0SMark Adams             sum4 -= v4[0] * tmp0;
34619371c9d4SSatish Balay             v1++;
34629371c9d4SSatish Balay             v2++;
34639371c9d4SSatish Balay             v3++;
34649371c9d4SSatish Balay             v4++;
3465d876e2b0SMark Adams           }
3466d876e2b0SMark Adams           t[row]     = sum1;
3467d876e2b0SMark Adams           t[row + 1] = sum2;
3468d876e2b0SMark Adams           t[row + 2] = sum3;
3469d876e2b0SMark Adams           t[row + 3] = sum4;
3470d876e2b0SMark Adams           sz         = ii[row + 1] - diag[row] - 4;
3471d876e2b0SMark Adams           idx        = a->j + diag[row] + 4;
3472d876e2b0SMark Adams           v1 += 4;
3473d876e2b0SMark Adams           v2 += 4;
3474d876e2b0SMark Adams           v3 += 4;
3475d876e2b0SMark Adams           v4 += 4;
3476d876e2b0SMark Adams           for (n = 0; n < sz - 1; n += 2) {
3477d876e2b0SMark Adams             i1 = idx[0];
3478d876e2b0SMark Adams             i2 = idx[1];
3479d876e2b0SMark Adams             idx += 2;
3480d876e2b0SMark Adams             tmp0 = x[i1];
3481d876e2b0SMark Adams             tmp1 = x[i2];
34829371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
34839371c9d4SSatish Balay             v1 += 2;
34849371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
34859371c9d4SSatish Balay             v2 += 2;
34869371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
34879371c9d4SSatish Balay             v3 += 2;
34889371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
34899371c9d4SSatish Balay             v4 += 2;
3490d876e2b0SMark Adams           }
34915850ef23SBarry Smith           if (n == sz - 1) {
34925850ef23SBarry Smith             tmp0 = x[*idx];
34935850ef23SBarry Smith             sum1 -= v1[0] * tmp0;
34945850ef23SBarry Smith             sum2 -= v2[0] * tmp0;
34955850ef23SBarry Smith             sum3 -= v3[0] * tmp0;
34965850ef23SBarry Smith             sum4 -= v4[0] * tmp0;
34975850ef23SBarry Smith           }
3498d876e2b0SMark Adams           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3499d876e2b0SMark Adams           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3500d876e2b0SMark Adams           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3501d876e2b0SMark Adams           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
35025850ef23SBarry Smith           break;
35035850ef23SBarry Smith         case 5:
35045850ef23SBarry Smith           v2   = a->a + ii[row + 1];
35055850ef23SBarry Smith           v3   = a->a + ii[row + 2];
35065850ef23SBarry Smith           v4   = a->a + ii[row + 3];
35075850ef23SBarry Smith           v5   = a->a + ii[row + 4];
35085850ef23SBarry Smith           sum1 = b[row];
35095850ef23SBarry Smith           sum2 = b[row + 1];
35105850ef23SBarry Smith           sum3 = b[row + 2];
35115850ef23SBarry Smith           sum4 = b[row + 3];
35125850ef23SBarry Smith           sum5 = b[row + 4];
35135850ef23SBarry Smith           for (n = 0; n < sz - 1; n += 2) {
35145850ef23SBarry Smith             i1 = idx[0];
35155850ef23SBarry Smith             i2 = idx[1];
35165850ef23SBarry Smith             idx += 2;
35175850ef23SBarry Smith             tmp0 = x[i1];
35185850ef23SBarry Smith             tmp1 = x[i2];
35199371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
35209371c9d4SSatish Balay             v1 += 2;
35219371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
35229371c9d4SSatish Balay             v2 += 2;
35239371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
35249371c9d4SSatish Balay             v3 += 2;
35259371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
35269371c9d4SSatish Balay             v4 += 2;
35279371c9d4SSatish Balay             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
35289371c9d4SSatish Balay             v5 += 2;
35295850ef23SBarry Smith           }
35305850ef23SBarry Smith           if (n == sz - 1) {
3531d876e2b0SMark Adams             tmp0 = x[*idx++];
35325850ef23SBarry Smith             sum1 -= v1[0] * tmp0;
35335850ef23SBarry Smith             sum2 -= v2[0] * tmp0;
35345850ef23SBarry Smith             sum3 -= v3[0] * tmp0;
35355850ef23SBarry Smith             sum4 -= v4[0] * tmp0;
35365850ef23SBarry Smith             sum5 -= v5[0] * tmp0;
35379371c9d4SSatish Balay             v1++;
35389371c9d4SSatish Balay             v2++;
35399371c9d4SSatish Balay             v3++;
35409371c9d4SSatish Balay             v4++;
35419371c9d4SSatish Balay             v5++;
35425850ef23SBarry Smith           }
3543d876e2b0SMark Adams           t[row]     = sum1;
3544d876e2b0SMark Adams           t[row + 1] = sum2;
3545d876e2b0SMark Adams           t[row + 2] = sum3;
3546d876e2b0SMark Adams           t[row + 3] = sum4;
3547d876e2b0SMark Adams           t[row + 4] = sum5;
3548d876e2b0SMark Adams           sz         = ii[row + 1] - diag[row] - 5;
3549d876e2b0SMark Adams           idx        = a->j + diag[row] + 5;
3550d876e2b0SMark Adams           v1 += 5;
3551d876e2b0SMark Adams           v2 += 5;
3552d876e2b0SMark Adams           v3 += 5;
3553d876e2b0SMark Adams           v4 += 5;
3554d876e2b0SMark Adams           v5 += 5;
35555850ef23SBarry Smith           for (n = 0; n < sz - 1; n += 2) {
35565850ef23SBarry Smith             i1 = idx[0];
35575850ef23SBarry Smith             i2 = idx[1];
35585850ef23SBarry Smith             idx += 2;
35595850ef23SBarry Smith             tmp0 = x[i1];
35605850ef23SBarry Smith             tmp1 = x[i2];
35619371c9d4SSatish Balay             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
35629371c9d4SSatish Balay             v1 += 2;
35639371c9d4SSatish Balay             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
35649371c9d4SSatish Balay             v2 += 2;
35659371c9d4SSatish Balay             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
35669371c9d4SSatish Balay             v3 += 2;
35679371c9d4SSatish Balay             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
35689371c9d4SSatish Balay             v4 += 2;
35699371c9d4SSatish Balay             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
35709371c9d4SSatish Balay             v5 += 2;
35715850ef23SBarry Smith           }
35725850ef23SBarry Smith           if (n == sz - 1) {
35735850ef23SBarry Smith             tmp0 = x[*idx];
3574d876e2b0SMark Adams             sum1 -= v1[0] * tmp0;
3575d876e2b0SMark Adams             sum2 -= v2[0] * tmp0;
3576d876e2b0SMark Adams             sum3 -= v3[0] * tmp0;
3577d876e2b0SMark Adams             sum4 -= v4[0] * tmp0;
3578d876e2b0SMark Adams             sum5 -= v5[0] * tmp0;
35795850ef23SBarry Smith           }
3580d876e2b0SMark Adams           x[row]     = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3581d876e2b0SMark Adams           x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3582d876e2b0SMark Adams           x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3583d876e2b0SMark Adams           x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3584d876e2b0SMark Adams           x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3585d876e2b0SMark Adams           break;
3586d71ae5a4SJacob Faibussowitsch         default:
3587d71ae5a4SJacob Faibussowitsch           SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3588d876e2b0SMark Adams         }
3589d876e2b0SMark Adams       }
3590d876e2b0SMark Adams       xb = t;
35919566063dSJacob Faibussowitsch       PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */
3592d876e2b0SMark Adams     } else xb = b;
3593d876e2b0SMark Adams 
3594d876e2b0SMark Adams     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3595d876e2b0SMark Adams       ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3596d876e2b0SMark Adams       for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
3597d876e2b0SMark Adams         ibdiag -= sizes[i] * sizes[i];
3598d876e2b0SMark Adams 
3599d876e2b0SMark Adams         /* set RHS */
3600d876e2b0SMark Adams         if (xb == b) {
3601d876e2b0SMark Adams           /* whole (old way) */
3602d876e2b0SMark Adams           sz  = ii[row + 1] - ii[row];
3603d876e2b0SMark Adams           idx = a->j + ii[row];
3604d876e2b0SMark Adams           switch (sizes[i]) {
3605d71ae5a4SJacob Faibussowitsch           case 5:
3606d71ae5a4SJacob Faibussowitsch             v5 = a->a + ii[row - 4]; /* fall through */
3607d71ae5a4SJacob Faibussowitsch           case 4:
3608d71ae5a4SJacob Faibussowitsch             v4 = a->a + ii[row - 3]; /* fall through */
3609d71ae5a4SJacob Faibussowitsch           case 3:
3610d71ae5a4SJacob Faibussowitsch             v3 = a->a + ii[row - 2]; /* fall through */
3611d71ae5a4SJacob Faibussowitsch           case 2:
3612d71ae5a4SJacob Faibussowitsch             v2 = a->a + ii[row - 1]; /* fall through */
3613d71ae5a4SJacob Faibussowitsch           case 1:
3614d71ae5a4SJacob Faibussowitsch             v1 = a->a + ii[row];
3615d71ae5a4SJacob Faibussowitsch             break;
3616d71ae5a4SJacob Faibussowitsch           default:
3617d71ae5a4SJacob Faibussowitsch             SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
3618d876e2b0SMark Adams           }
3619d876e2b0SMark Adams         } else {
3620d876e2b0SMark Adams           /* upper, no diag */
3621d876e2b0SMark Adams           sz  = ii[row + 1] - diag[row] - 1;
3622d876e2b0SMark Adams           idx = a->j + diag[row] + 1;
3623d876e2b0SMark Adams           switch (sizes[i]) {
3624d71ae5a4SJacob Faibussowitsch           case 5:
3625d71ae5a4SJacob Faibussowitsch             v5 = a->a + diag[row - 4] + 5; /* fall through */
3626d71ae5a4SJacob Faibussowitsch           case 4:
3627d71ae5a4SJacob Faibussowitsch             v4 = a->a + diag[row - 3] + 4; /* fall through */
3628d71ae5a4SJacob Faibussowitsch           case 3:
3629d71ae5a4SJacob Faibussowitsch             v3 = a->a + diag[row - 2] + 3; /* fall through */
3630d71ae5a4SJacob Faibussowitsch           case 2:
3631d71ae5a4SJacob Faibussowitsch             v2 = a->a + diag[row - 1] + 2; /* fall through */
3632d71ae5a4SJacob Faibussowitsch           case 1:
3633d71ae5a4SJacob Faibussowitsch             v1 = a->a + diag[row] + 1;
3634d876e2b0SMark Adams           }
3635d876e2b0SMark Adams         }
3636d876e2b0SMark Adams         /* set sum */
3637d876e2b0SMark Adams         switch (sizes[i]) {
3638d71ae5a4SJacob Faibussowitsch         case 5:
3639d71ae5a4SJacob Faibussowitsch           sum5 = xb[row - 4]; /* fall through */
3640d71ae5a4SJacob Faibussowitsch         case 4:
3641d71ae5a4SJacob Faibussowitsch           sum4 = xb[row - 3]; /* fall through */
3642d71ae5a4SJacob Faibussowitsch         case 3:
3643d71ae5a4SJacob Faibussowitsch           sum3 = xb[row - 2]; /* fall through */
3644d71ae5a4SJacob Faibussowitsch         case 2:
3645d71ae5a4SJacob Faibussowitsch           sum2 = xb[row - 1]; /* fall through */
3646d876e2b0SMark Adams         case 1:
3647d876e2b0SMark Adams           /* note that sum1 is associated with the last row */
3648d876e2b0SMark Adams           sum1 = xb[row];
3649d876e2b0SMark Adams         }
3650d876e2b0SMark Adams         /* do sums */
3651d876e2b0SMark Adams         for (n = 0; n < sz - 1; n += 2) {
3652d876e2b0SMark Adams           i1 = idx[0];
3653d876e2b0SMark Adams           i2 = idx[1];
3654d876e2b0SMark Adams           idx += 2;
3655d876e2b0SMark Adams           tmp0 = x[i1];
3656d876e2b0SMark Adams           tmp1 = x[i2];
3657d876e2b0SMark Adams           switch (sizes[i]) {
3658d71ae5a4SJacob Faibussowitsch           case 5:
3659d71ae5a4SJacob Faibussowitsch             sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3660d71ae5a4SJacob Faibussowitsch             v5 += 2; /* fall through */
3661d71ae5a4SJacob Faibussowitsch           case 4:
3662d71ae5a4SJacob Faibussowitsch             sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3663d71ae5a4SJacob Faibussowitsch             v4 += 2; /* fall through */
3664d71ae5a4SJacob Faibussowitsch           case 3:
3665d71ae5a4SJacob Faibussowitsch             sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3666d71ae5a4SJacob Faibussowitsch             v3 += 2; /* fall through */
3667d71ae5a4SJacob Faibussowitsch           case 2:
3668d71ae5a4SJacob Faibussowitsch             sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3669d71ae5a4SJacob Faibussowitsch             v2 += 2; /* fall through */
3670d71ae5a4SJacob Faibussowitsch           case 1:
3671d71ae5a4SJacob Faibussowitsch             sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3672d71ae5a4SJacob Faibussowitsch             v1 += 2;
3673d876e2b0SMark Adams           }
3674d876e2b0SMark Adams         }
3675d876e2b0SMark Adams         /* ragged edge */
3676d876e2b0SMark Adams         if (n == sz - 1) {
3677d876e2b0SMark Adams           tmp0 = x[*idx];
3678d876e2b0SMark Adams           switch (sizes[i]) {
3679d71ae5a4SJacob Faibussowitsch           case 5:
3680d71ae5a4SJacob Faibussowitsch             sum5 -= *v5 * tmp0; /* fall through */
3681d71ae5a4SJacob Faibussowitsch           case 4:
3682d71ae5a4SJacob Faibussowitsch             sum4 -= *v4 * tmp0; /* fall through */
3683d71ae5a4SJacob Faibussowitsch           case 3:
3684d71ae5a4SJacob Faibussowitsch             sum3 -= *v3 * tmp0; /* fall through */
3685d71ae5a4SJacob Faibussowitsch           case 2:
3686d71ae5a4SJacob Faibussowitsch             sum2 -= *v2 * tmp0; /* fall through */
3687d71ae5a4SJacob Faibussowitsch           case 1:
3688d71ae5a4SJacob Faibussowitsch             sum1 -= *v1 * tmp0;
3689d876e2b0SMark Adams           }
3690d876e2b0SMark Adams         }
3691d876e2b0SMark Adams         /* update */
3692d876e2b0SMark Adams         if (xb == b) {
3693d876e2b0SMark Adams           /* whole (old way) w/ diag */
3694d876e2b0SMark Adams           switch (sizes[i]) {
3695d876e2b0SMark Adams           case 5:
36965850ef23SBarry Smith             x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
36975850ef23SBarry Smith             x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
36985850ef23SBarry Smith             x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
36995850ef23SBarry Smith             x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
37005850ef23SBarry Smith             x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
37015850ef23SBarry Smith             break;
3702d876e2b0SMark Adams           case 4:
3703d876e2b0SMark Adams             x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3704d876e2b0SMark Adams             x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3705d876e2b0SMark Adams             x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3706d876e2b0SMark Adams             x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3707d876e2b0SMark Adams             break;
3708d876e2b0SMark Adams           case 3:
3709d876e2b0SMark Adams             x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3710d876e2b0SMark Adams             x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3711d876e2b0SMark Adams             x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3712d876e2b0SMark Adams             break;
3713d876e2b0SMark Adams           case 2:
3714d876e2b0SMark Adams             x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3];
3715d876e2b0SMark Adams             x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2];
3716d876e2b0SMark Adams             break;
3717d71ae5a4SJacob Faibussowitsch           case 1:
3718d71ae5a4SJacob Faibussowitsch             x[row--] += sum1 * (*ibdiag);
3719d71ae5a4SJacob Faibussowitsch             break;
3720d876e2b0SMark Adams           }
3721d876e2b0SMark Adams         } else {
3722d876e2b0SMark Adams           /* no diag so set =  */
3723d876e2b0SMark Adams           switch (sizes[i]) {
3724d876e2b0SMark Adams           case 5:
3725d876e2b0SMark Adams             x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3726d876e2b0SMark Adams             x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3727d876e2b0SMark Adams             x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3728d876e2b0SMark Adams             x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3729d876e2b0SMark Adams             x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3730d876e2b0SMark Adams             break;
3731d876e2b0SMark Adams           case 4:
3732d876e2b0SMark Adams             x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3733d876e2b0SMark Adams             x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3734d876e2b0SMark Adams             x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3735d876e2b0SMark Adams             x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3736d876e2b0SMark Adams             break;
3737d876e2b0SMark Adams           case 3:
3738d876e2b0SMark Adams             x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3739d876e2b0SMark Adams             x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3740d876e2b0SMark Adams             x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3741d876e2b0SMark Adams             break;
3742d876e2b0SMark Adams           case 2:
3743d876e2b0SMark Adams             x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3744d876e2b0SMark Adams             x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3745d876e2b0SMark Adams             break;
3746d71ae5a4SJacob Faibussowitsch           case 1:
3747d71ae5a4SJacob Faibussowitsch             x[row--] = sum1 * (*ibdiag);
3748d71ae5a4SJacob Faibussowitsch             break;
37495850ef23SBarry Smith           }
37505850ef23SBarry Smith         }
3751d876e2b0SMark Adams       }
3752d876e2b0SMark Adams       if (xb == b) {
37539566063dSJacob Faibussowitsch         PetscCall(PetscLogFlops(2.0 * a->nz));
3754d876e2b0SMark Adams       } else {
37559566063dSJacob Faibussowitsch         PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */
3756d876e2b0SMark Adams       }
37575850ef23SBarry Smith     }
37582af78befSBarry Smith   }
375989c6957cSBarry Smith   if (flag & SOR_EISENSTAT) {
376089c6957cSBarry Smith     /*
376189c6957cSBarry Smith           Apply  (U + D)^-1  where D is now the block diagonal
376289c6957cSBarry Smith     */
376389c6957cSBarry Smith     ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
376489c6957cSBarry Smith     for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
376589c6957cSBarry Smith       ibdiag -= sizes[i] * sizes[i];
376689c6957cSBarry Smith       sz  = ii[row + 1] - diag[row] - 1;
376789c6957cSBarry Smith       v1  = a->a + diag[row] + 1;
376889c6957cSBarry Smith       idx = a->j + diag[row] + 1;
37694108e4d5SBarry Smith       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
377089c6957cSBarry Smith       switch (sizes[i]) {
377189c6957cSBarry Smith       case 1:
377289c6957cSBarry Smith 
377389c6957cSBarry Smith         sum1 = b[row];
377489c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
377589c6957cSBarry Smith           i1 = idx[0];
377689c6957cSBarry Smith           i2 = idx[1];
377789c6957cSBarry Smith           idx += 2;
377889c6957cSBarry Smith           tmp0 = x[i1];
377989c6957cSBarry Smith           tmp1 = x[i2];
37809371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
37819371c9d4SSatish Balay           v1 += 2;
378289c6957cSBarry Smith         }
378389c6957cSBarry Smith 
378489c6957cSBarry Smith         if (n == sz - 1) {
378589c6957cSBarry Smith           tmp0 = x[*idx];
378689c6957cSBarry Smith           sum1 -= *v1 * tmp0;
378789c6957cSBarry Smith         }
37889371c9d4SSatish Balay         x[row] = sum1 * (*ibdiag);
37899371c9d4SSatish Balay         row--;
379089c6957cSBarry Smith         break;
379189c6957cSBarry Smith 
379289c6957cSBarry Smith       case 2:
379389c6957cSBarry Smith 
379489c6957cSBarry Smith         sum1 = b[row];
379589c6957cSBarry Smith         sum2 = b[row - 1];
379689c6957cSBarry Smith         /* note that sum1 is associated with the second of the two rows */
379789c6957cSBarry Smith         v2 = a->a + diag[row - 1] + 2;
379889c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
379989c6957cSBarry Smith           i1 = idx[0];
380089c6957cSBarry Smith           i2 = idx[1];
380189c6957cSBarry Smith           idx += 2;
380289c6957cSBarry Smith           tmp0 = x[i1];
380389c6957cSBarry Smith           tmp1 = x[i2];
38049371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
38059371c9d4SSatish Balay           v1 += 2;
38069371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
38079371c9d4SSatish Balay           v2 += 2;
380889c6957cSBarry Smith         }
380989c6957cSBarry Smith 
381089c6957cSBarry Smith         if (n == sz - 1) {
381189c6957cSBarry Smith           tmp0 = x[*idx];
381289c6957cSBarry Smith           sum1 -= *v1 * tmp0;
381389c6957cSBarry Smith           sum2 -= *v2 * tmp0;
381489c6957cSBarry Smith         }
3815938d4eb3SBarry Smith         x[row]     = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3816938d4eb3SBarry Smith         x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3817938d4eb3SBarry Smith         row -= 2;
381889c6957cSBarry Smith         break;
381989c6957cSBarry Smith       case 3:
382089c6957cSBarry Smith 
382189c6957cSBarry Smith         sum1 = b[row];
382289c6957cSBarry Smith         sum2 = b[row - 1];
382389c6957cSBarry Smith         sum3 = b[row - 2];
382489c6957cSBarry Smith         v2   = a->a + diag[row - 1] + 2;
382589c6957cSBarry Smith         v3   = a->a + diag[row - 2] + 3;
382689c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
382789c6957cSBarry Smith           i1 = idx[0];
382889c6957cSBarry Smith           i2 = idx[1];
382989c6957cSBarry Smith           idx += 2;
383089c6957cSBarry Smith           tmp0 = x[i1];
383189c6957cSBarry Smith           tmp1 = x[i2];
38329371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
38339371c9d4SSatish Balay           v1 += 2;
38349371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
38359371c9d4SSatish Balay           v2 += 2;
38369371c9d4SSatish Balay           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
38379371c9d4SSatish Balay           v3 += 2;
383889c6957cSBarry Smith         }
383989c6957cSBarry Smith 
384089c6957cSBarry Smith         if (n == sz - 1) {
384189c6957cSBarry Smith           tmp0 = x[*idx];
384289c6957cSBarry Smith           sum1 -= *v1 * tmp0;
384389c6957cSBarry Smith           sum2 -= *v2 * tmp0;
384489c6957cSBarry Smith           sum3 -= *v3 * tmp0;
384589c6957cSBarry Smith         }
3846938d4eb3SBarry Smith         x[row]     = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3847938d4eb3SBarry Smith         x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3848938d4eb3SBarry Smith         x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3849938d4eb3SBarry Smith         row -= 3;
385089c6957cSBarry Smith         break;
385189c6957cSBarry Smith       case 4:
385289c6957cSBarry Smith 
385389c6957cSBarry Smith         sum1 = b[row];
385489c6957cSBarry Smith         sum2 = b[row - 1];
385589c6957cSBarry Smith         sum3 = b[row - 2];
385689c6957cSBarry Smith         sum4 = b[row - 3];
385789c6957cSBarry Smith         v2   = a->a + diag[row - 1] + 2;
385889c6957cSBarry Smith         v3   = a->a + diag[row - 2] + 3;
385989c6957cSBarry Smith         v4   = a->a + diag[row - 3] + 4;
386089c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
386189c6957cSBarry Smith           i1 = idx[0];
386289c6957cSBarry Smith           i2 = idx[1];
386389c6957cSBarry Smith           idx += 2;
386489c6957cSBarry Smith           tmp0 = x[i1];
386589c6957cSBarry Smith           tmp1 = x[i2];
38669371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
38679371c9d4SSatish Balay           v1 += 2;
38689371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
38699371c9d4SSatish Balay           v2 += 2;
38709371c9d4SSatish Balay           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
38719371c9d4SSatish Balay           v3 += 2;
38729371c9d4SSatish Balay           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
38739371c9d4SSatish Balay           v4 += 2;
387489c6957cSBarry Smith         }
387589c6957cSBarry Smith 
387689c6957cSBarry Smith         if (n == sz - 1) {
387789c6957cSBarry Smith           tmp0 = x[*idx];
387889c6957cSBarry Smith           sum1 -= *v1 * tmp0;
387989c6957cSBarry Smith           sum2 -= *v2 * tmp0;
388089c6957cSBarry Smith           sum3 -= *v3 * tmp0;
388189c6957cSBarry Smith           sum4 -= *v4 * tmp0;
388289c6957cSBarry Smith         }
3883938d4eb3SBarry Smith         x[row]     = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3884938d4eb3SBarry Smith         x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3885938d4eb3SBarry Smith         x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3886938d4eb3SBarry Smith         x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3887938d4eb3SBarry Smith         row -= 4;
388889c6957cSBarry Smith         break;
388989c6957cSBarry Smith       case 5:
389089c6957cSBarry Smith 
389189c6957cSBarry Smith         sum1 = b[row];
389289c6957cSBarry Smith         sum2 = b[row - 1];
389389c6957cSBarry Smith         sum3 = b[row - 2];
389489c6957cSBarry Smith         sum4 = b[row - 3];
389589c6957cSBarry Smith         sum5 = b[row - 4];
389689c6957cSBarry Smith         v2   = a->a + diag[row - 1] + 2;
389789c6957cSBarry Smith         v3   = a->a + diag[row - 2] + 3;
389889c6957cSBarry Smith         v4   = a->a + diag[row - 3] + 4;
389989c6957cSBarry Smith         v5   = a->a + diag[row - 4] + 5;
390089c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
390189c6957cSBarry Smith           i1 = idx[0];
390289c6957cSBarry Smith           i2 = idx[1];
390389c6957cSBarry Smith           idx += 2;
390489c6957cSBarry Smith           tmp0 = x[i1];
390589c6957cSBarry Smith           tmp1 = x[i2];
39069371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
39079371c9d4SSatish Balay           v1 += 2;
39089371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
39099371c9d4SSatish Balay           v2 += 2;
39109371c9d4SSatish Balay           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
39119371c9d4SSatish Balay           v3 += 2;
39129371c9d4SSatish Balay           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
39139371c9d4SSatish Balay           v4 += 2;
39149371c9d4SSatish Balay           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
39159371c9d4SSatish Balay           v5 += 2;
391689c6957cSBarry Smith         }
391789c6957cSBarry Smith 
391889c6957cSBarry Smith         if (n == sz - 1) {
391989c6957cSBarry Smith           tmp0 = x[*idx];
392089c6957cSBarry Smith           sum1 -= *v1 * tmp0;
392189c6957cSBarry Smith           sum2 -= *v2 * tmp0;
392289c6957cSBarry Smith           sum3 -= *v3 * tmp0;
392389c6957cSBarry Smith           sum4 -= *v4 * tmp0;
392489c6957cSBarry Smith           sum5 -= *v5 * tmp0;
392589c6957cSBarry Smith         }
3926938d4eb3SBarry Smith         x[row]     = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3927938d4eb3SBarry Smith         x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3928938d4eb3SBarry Smith         x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3929938d4eb3SBarry Smith         x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3930938d4eb3SBarry Smith         x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3931938d4eb3SBarry Smith         row -= 5;
393289c6957cSBarry Smith         break;
3933d71ae5a4SJacob Faibussowitsch       default:
3934d71ae5a4SJacob Faibussowitsch         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
393589c6957cSBarry Smith       }
393689c6957cSBarry Smith     }
39379566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(a->nz));
393889c6957cSBarry Smith 
393989c6957cSBarry Smith     /*
394089c6957cSBarry Smith            t = b - D x    where D is the block diagonal
394189c6957cSBarry Smith     */
394289c6957cSBarry Smith     cnt = 0;
394389c6957cSBarry Smith     for (i = 0, row = 0; i < m; i++) {
394489c6957cSBarry Smith       switch (sizes[i]) {
394589c6957cSBarry Smith       case 1:
39469371c9d4SSatish Balay         t[row] = b[row] - bdiag[cnt++] * x[row];
39479371c9d4SSatish Balay         row++;
394889c6957cSBarry Smith         break;
394989c6957cSBarry Smith       case 2:
39509371c9d4SSatish Balay         x1         = x[row];
39519371c9d4SSatish Balay         x2         = x[row + 1];
395289c6957cSBarry Smith         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
395389c6957cSBarry Smith         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
395489c6957cSBarry Smith         t[row]     = b[row] - tmp1;
39559371c9d4SSatish Balay         t[row + 1] = b[row + 1] - tmp2;
39569371c9d4SSatish Balay         row += 2;
395789c6957cSBarry Smith         cnt += 4;
395889c6957cSBarry Smith         break;
395989c6957cSBarry Smith       case 3:
39609371c9d4SSatish Balay         x1         = x[row];
39619371c9d4SSatish Balay         x2         = x[row + 1];
39629371c9d4SSatish Balay         x3         = x[row + 2];
396389c6957cSBarry Smith         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
396489c6957cSBarry Smith         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
396589c6957cSBarry Smith         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
396689c6957cSBarry Smith         t[row]     = b[row] - tmp1;
396789c6957cSBarry Smith         t[row + 1] = b[row + 1] - tmp2;
39689371c9d4SSatish Balay         t[row + 2] = b[row + 2] - tmp3;
39699371c9d4SSatish Balay         row += 3;
397089c6957cSBarry Smith         cnt += 9;
397189c6957cSBarry Smith         break;
397289c6957cSBarry Smith       case 4:
39739371c9d4SSatish Balay         x1         = x[row];
39749371c9d4SSatish Balay         x2         = x[row + 1];
39759371c9d4SSatish Balay         x3         = x[row + 2];
39769371c9d4SSatish Balay         x4         = x[row + 3];
397789c6957cSBarry Smith         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
397889c6957cSBarry Smith         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
397989c6957cSBarry Smith         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
398089c6957cSBarry Smith         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
398189c6957cSBarry Smith         t[row]     = b[row] - tmp1;
398289c6957cSBarry Smith         t[row + 1] = b[row + 1] - tmp2;
398389c6957cSBarry Smith         t[row + 2] = b[row + 2] - tmp3;
39849371c9d4SSatish Balay         t[row + 3] = b[row + 3] - tmp4;
39859371c9d4SSatish Balay         row += 4;
398689c6957cSBarry Smith         cnt += 16;
398789c6957cSBarry Smith         break;
398889c6957cSBarry Smith       case 5:
39899371c9d4SSatish Balay         x1         = x[row];
39909371c9d4SSatish Balay         x2         = x[row + 1];
39919371c9d4SSatish Balay         x3         = x[row + 2];
39929371c9d4SSatish Balay         x4         = x[row + 3];
39939371c9d4SSatish Balay         x5         = x[row + 4];
399489c6957cSBarry Smith         tmp1       = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
399589c6957cSBarry Smith         tmp2       = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
399689c6957cSBarry Smith         tmp3       = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
399789c6957cSBarry Smith         tmp4       = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
399889c6957cSBarry Smith         tmp5       = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
399989c6957cSBarry Smith         t[row]     = b[row] - tmp1;
400089c6957cSBarry Smith         t[row + 1] = b[row + 1] - tmp2;
400189c6957cSBarry Smith         t[row + 2] = b[row + 2] - tmp3;
400289c6957cSBarry Smith         t[row + 3] = b[row + 3] - tmp4;
40039371c9d4SSatish Balay         t[row + 4] = b[row + 4] - tmp5;
40049371c9d4SSatish Balay         row += 5;
400589c6957cSBarry Smith         cnt += 25;
400689c6957cSBarry Smith         break;
4007d71ae5a4SJacob Faibussowitsch       default:
4008d71ae5a4SJacob Faibussowitsch         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
400989c6957cSBarry Smith       }
401089c6957cSBarry Smith     }
40119566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(m));
401289c6957cSBarry Smith 
401389c6957cSBarry Smith     /*
401489c6957cSBarry Smith           Apply (L + D)^-1 where D is the block diagonal
401589c6957cSBarry Smith     */
401689c6957cSBarry Smith     for (i = 0, row = 0; i < m; i++) {
401789c6957cSBarry Smith       sz  = diag[row] - ii[row];
401889c6957cSBarry Smith       v1  = a->a + ii[row];
401989c6957cSBarry Smith       idx = a->j + ii[row];
40204108e4d5SBarry Smith       /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
402189c6957cSBarry Smith       switch (sizes[i]) {
402289c6957cSBarry Smith       case 1:
402389c6957cSBarry Smith 
402489c6957cSBarry Smith         sum1 = t[row];
402589c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
402689c6957cSBarry Smith           i1 = idx[0];
402789c6957cSBarry Smith           i2 = idx[1];
402889c6957cSBarry Smith           idx += 2;
402989c6957cSBarry Smith           tmp0 = t[i1];
403089c6957cSBarry Smith           tmp1 = t[i2];
40319371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
40329371c9d4SSatish Balay           v1 += 2;
403389c6957cSBarry Smith         }
403489c6957cSBarry Smith 
403589c6957cSBarry Smith         if (n == sz - 1) {
403689c6957cSBarry Smith           tmp0 = t[*idx];
403789c6957cSBarry Smith           sum1 -= *v1 * tmp0;
403889c6957cSBarry Smith         }
40399371c9d4SSatish Balay         x[row] += t[row] = sum1 * (*ibdiag++);
40409371c9d4SSatish Balay         row++;
404189c6957cSBarry Smith         break;
404289c6957cSBarry Smith       case 2:
404389c6957cSBarry Smith         v2   = a->a + ii[row + 1];
404489c6957cSBarry Smith         sum1 = t[row];
404589c6957cSBarry Smith         sum2 = t[row + 1];
404689c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
404789c6957cSBarry Smith           i1 = idx[0];
404889c6957cSBarry Smith           i2 = idx[1];
404989c6957cSBarry Smith           idx += 2;
405089c6957cSBarry Smith           tmp0 = t[i1];
405189c6957cSBarry Smith           tmp1 = t[i2];
40529371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
40539371c9d4SSatish Balay           v1 += 2;
40549371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
40559371c9d4SSatish Balay           v2 += 2;
405689c6957cSBarry Smith         }
405789c6957cSBarry Smith 
405889c6957cSBarry Smith         if (n == sz - 1) {
405989c6957cSBarry Smith           tmp0 = t[*idx];
406089c6957cSBarry Smith           sum1 -= v1[0] * tmp0;
406189c6957cSBarry Smith           sum2 -= v2[0] * tmp0;
406289c6957cSBarry Smith         }
406389c6957cSBarry Smith         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[2];
406489c6957cSBarry Smith         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
40659371c9d4SSatish Balay         ibdiag += 4;
40669371c9d4SSatish Balay         row += 2;
406789c6957cSBarry Smith         break;
406889c6957cSBarry Smith       case 3:
406989c6957cSBarry Smith         v2   = a->a + ii[row + 1];
407089c6957cSBarry Smith         v3   = a->a + ii[row + 2];
407189c6957cSBarry Smith         sum1 = t[row];
407289c6957cSBarry Smith         sum2 = t[row + 1];
407389c6957cSBarry Smith         sum3 = t[row + 2];
407489c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
407589c6957cSBarry Smith           i1 = idx[0];
407689c6957cSBarry Smith           i2 = idx[1];
407789c6957cSBarry Smith           idx += 2;
407889c6957cSBarry Smith           tmp0 = t[i1];
407989c6957cSBarry Smith           tmp1 = t[i2];
40809371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
40819371c9d4SSatish Balay           v1 += 2;
40829371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
40839371c9d4SSatish Balay           v2 += 2;
40849371c9d4SSatish Balay           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
40859371c9d4SSatish Balay           v3 += 2;
408689c6957cSBarry Smith         }
408789c6957cSBarry Smith 
408889c6957cSBarry Smith         if (n == sz - 1) {
408989c6957cSBarry Smith           tmp0 = t[*idx];
409089c6957cSBarry Smith           sum1 -= v1[0] * tmp0;
409189c6957cSBarry Smith           sum2 -= v2[0] * tmp0;
409289c6957cSBarry Smith           sum3 -= v3[0] * tmp0;
409389c6957cSBarry Smith         }
409489c6957cSBarry Smith         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
409589c6957cSBarry Smith         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
409689c6957cSBarry Smith         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
40979371c9d4SSatish Balay         ibdiag += 9;
40989371c9d4SSatish Balay         row += 3;
409989c6957cSBarry Smith         break;
410089c6957cSBarry Smith       case 4:
410189c6957cSBarry Smith         v2   = a->a + ii[row + 1];
410289c6957cSBarry Smith         v3   = a->a + ii[row + 2];
410389c6957cSBarry Smith         v4   = a->a + ii[row + 3];
410489c6957cSBarry Smith         sum1 = t[row];
410589c6957cSBarry Smith         sum2 = t[row + 1];
410689c6957cSBarry Smith         sum3 = t[row + 2];
410789c6957cSBarry Smith         sum4 = t[row + 3];
410889c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
410989c6957cSBarry Smith           i1 = idx[0];
411089c6957cSBarry Smith           i2 = idx[1];
411189c6957cSBarry Smith           idx += 2;
411289c6957cSBarry Smith           tmp0 = t[i1];
411389c6957cSBarry Smith           tmp1 = t[i2];
41149371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
41159371c9d4SSatish Balay           v1 += 2;
41169371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
41179371c9d4SSatish Balay           v2 += 2;
41189371c9d4SSatish Balay           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
41199371c9d4SSatish Balay           v3 += 2;
41209371c9d4SSatish Balay           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
41219371c9d4SSatish Balay           v4 += 2;
412289c6957cSBarry Smith         }
412389c6957cSBarry Smith 
412489c6957cSBarry Smith         if (n == sz - 1) {
412589c6957cSBarry Smith           tmp0 = t[*idx];
412689c6957cSBarry Smith           sum1 -= v1[0] * tmp0;
412789c6957cSBarry Smith           sum2 -= v2[0] * tmp0;
412889c6957cSBarry Smith           sum3 -= v3[0] * tmp0;
412989c6957cSBarry Smith           sum4 -= v4[0] * tmp0;
413089c6957cSBarry Smith         }
413189c6957cSBarry Smith         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
413289c6957cSBarry Smith         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
413389c6957cSBarry Smith         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
413489c6957cSBarry Smith         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
41359371c9d4SSatish Balay         ibdiag += 16;
41369371c9d4SSatish Balay         row += 4;
413789c6957cSBarry Smith         break;
413889c6957cSBarry Smith       case 5:
413989c6957cSBarry Smith         v2   = a->a + ii[row + 1];
414089c6957cSBarry Smith         v3   = a->a + ii[row + 2];
414189c6957cSBarry Smith         v4   = a->a + ii[row + 3];
414289c6957cSBarry Smith         v5   = a->a + ii[row + 4];
414389c6957cSBarry Smith         sum1 = t[row];
414489c6957cSBarry Smith         sum2 = t[row + 1];
414589c6957cSBarry Smith         sum3 = t[row + 2];
414689c6957cSBarry Smith         sum4 = t[row + 3];
414789c6957cSBarry Smith         sum5 = t[row + 4];
414889c6957cSBarry Smith         for (n = 0; n < sz - 1; n += 2) {
414989c6957cSBarry Smith           i1 = idx[0];
415089c6957cSBarry Smith           i2 = idx[1];
415189c6957cSBarry Smith           idx += 2;
415289c6957cSBarry Smith           tmp0 = t[i1];
415389c6957cSBarry Smith           tmp1 = t[i2];
41549371c9d4SSatish Balay           sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
41559371c9d4SSatish Balay           v1 += 2;
41569371c9d4SSatish Balay           sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
41579371c9d4SSatish Balay           v2 += 2;
41589371c9d4SSatish Balay           sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
41599371c9d4SSatish Balay           v3 += 2;
41609371c9d4SSatish Balay           sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
41619371c9d4SSatish Balay           v4 += 2;
41629371c9d4SSatish Balay           sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
41639371c9d4SSatish Balay           v5 += 2;
416489c6957cSBarry Smith         }
416589c6957cSBarry Smith 
416689c6957cSBarry Smith         if (n == sz - 1) {
416789c6957cSBarry Smith           tmp0 = t[*idx];
416889c6957cSBarry Smith           sum1 -= v1[0] * tmp0;
416989c6957cSBarry Smith           sum2 -= v2[0] * tmp0;
417089c6957cSBarry Smith           sum3 -= v3[0] * tmp0;
417189c6957cSBarry Smith           sum4 -= v4[0] * tmp0;
417289c6957cSBarry Smith           sum5 -= v5[0] * tmp0;
417389c6957cSBarry Smith         }
417489c6957cSBarry Smith         x[row] += t[row]         = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
417589c6957cSBarry Smith         x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
417689c6957cSBarry Smith         x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
417789c6957cSBarry Smith         x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
417889c6957cSBarry Smith         x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
41799371c9d4SSatish Balay         ibdiag += 25;
41809371c9d4SSatish Balay         row += 5;
418189c6957cSBarry Smith         break;
4182d71ae5a4SJacob Faibussowitsch       default:
4183d71ae5a4SJacob Faibussowitsch         SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
418489c6957cSBarry Smith       }
418589c6957cSBarry Smith     }
41869566063dSJacob Faibussowitsch     PetscCall(PetscLogFlops(a->nz));
41875850ef23SBarry Smith   }
41889566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
41899566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
4190365a8a9eSBarry Smith   PetscFunctionReturn(0);
41912af78befSBarry Smith }
41922af78befSBarry Smith 
4193d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
4194d71ae5a4SJacob Faibussowitsch {
419589c6957cSBarry Smith   Mat_SeqAIJ        *a = (Mat_SeqAIJ *)A->data;
419689c6957cSBarry Smith   PetscScalar       *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5;
419789c6957cSBarry Smith   const MatScalar   *bdiag = a->inode.bdiag;
419889c6957cSBarry Smith   const PetscScalar *b;
419989c6957cSBarry Smith   PetscInt           m = a->inode.node_count, cnt = 0, i, row;
420089c6957cSBarry Smith   const PetscInt    *sizes = a->inode.size;
42012af78befSBarry Smith 
420289c6957cSBarry Smith   PetscFunctionBegin;
420308401ef6SPierre Jolivet   PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
42049566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
42059566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
420689c6957cSBarry Smith   cnt = 0;
420789c6957cSBarry Smith   for (i = 0, row = 0; i < m; i++) {
420889c6957cSBarry Smith     switch (sizes[i]) {
420989c6957cSBarry Smith     case 1:
42109371c9d4SSatish Balay       x[row] = b[row] * bdiag[cnt++];
42119371c9d4SSatish Balay       row++;
421289c6957cSBarry Smith       break;
421389c6957cSBarry Smith     case 2:
42149371c9d4SSatish Balay       x1       = b[row];
42159371c9d4SSatish Balay       x2       = b[row + 1];
421689c6957cSBarry Smith       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
421789c6957cSBarry Smith       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
421889c6957cSBarry Smith       x[row++] = tmp1;
421989c6957cSBarry Smith       x[row++] = tmp2;
422089c6957cSBarry Smith       cnt += 4;
422189c6957cSBarry Smith       break;
422289c6957cSBarry Smith     case 3:
42239371c9d4SSatish Balay       x1       = b[row];
42249371c9d4SSatish Balay       x2       = b[row + 1];
42259371c9d4SSatish Balay       x3       = b[row + 2];
422689c6957cSBarry Smith       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
422789c6957cSBarry Smith       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
422889c6957cSBarry Smith       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
422989c6957cSBarry Smith       x[row++] = tmp1;
423089c6957cSBarry Smith       x[row++] = tmp2;
423189c6957cSBarry Smith       x[row++] = tmp3;
423289c6957cSBarry Smith       cnt += 9;
423389c6957cSBarry Smith       break;
423489c6957cSBarry Smith     case 4:
42359371c9d4SSatish Balay       x1       = b[row];
42369371c9d4SSatish Balay       x2       = b[row + 1];
42379371c9d4SSatish Balay       x3       = b[row + 2];
42389371c9d4SSatish Balay       x4       = b[row + 3];
423989c6957cSBarry Smith       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
424089c6957cSBarry Smith       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
424189c6957cSBarry Smith       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
424289c6957cSBarry Smith       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
424389c6957cSBarry Smith       x[row++] = tmp1;
424489c6957cSBarry Smith       x[row++] = tmp2;
424589c6957cSBarry Smith       x[row++] = tmp3;
424689c6957cSBarry Smith       x[row++] = tmp4;
424789c6957cSBarry Smith       cnt += 16;
424889c6957cSBarry Smith       break;
424989c6957cSBarry Smith     case 5:
42509371c9d4SSatish Balay       x1       = b[row];
42519371c9d4SSatish Balay       x2       = b[row + 1];
42529371c9d4SSatish Balay       x3       = b[row + 2];
42539371c9d4SSatish Balay       x4       = b[row + 3];
42549371c9d4SSatish Balay       x5       = b[row + 4];
425589c6957cSBarry Smith       tmp1     = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
425689c6957cSBarry Smith       tmp2     = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
425789c6957cSBarry Smith       tmp3     = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
425889c6957cSBarry Smith       tmp4     = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
425989c6957cSBarry Smith       tmp5     = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
426089c6957cSBarry Smith       x[row++] = tmp1;
426189c6957cSBarry Smith       x[row++] = tmp2;
426289c6957cSBarry Smith       x[row++] = tmp3;
426389c6957cSBarry Smith       x[row++] = tmp4;
426489c6957cSBarry Smith       x[row++] = tmp5;
426589c6957cSBarry Smith       cnt += 25;
426689c6957cSBarry Smith       break;
4267d71ae5a4SJacob Faibussowitsch     default:
4268d71ae5a4SJacob Faibussowitsch       SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]);
426989c6957cSBarry Smith     }
427089c6957cSBarry Smith   }
42719566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * cnt));
42729566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
42739566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
427489c6957cSBarry Smith   PetscFunctionReturn(0);
427589c6957cSBarry Smith }
427689c6957cSBarry Smith 
4277d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
4278d71ae5a4SJacob Faibussowitsch {
4279b215bc84SStefano Zampini   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4280b215bc84SStefano Zampini 
4281b215bc84SStefano Zampini   PetscFunctionBegin;
4282b215bc84SStefano Zampini   a->inode.node_count       = 0;
4283b215bc84SStefano Zampini   a->inode.use              = PETSC_FALSE;
4284b215bc84SStefano Zampini   a->inode.checked          = PETSC_FALSE;
4285b215bc84SStefano Zampini   a->inode.mat_nonzerostate = -1;
4286b215bc84SStefano Zampini   A->ops->getrowij          = MatGetRowIJ_SeqAIJ;
4287b215bc84SStefano Zampini   A->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ;
4288b215bc84SStefano Zampini   A->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ;
4289b215bc84SStefano Zampini   A->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ;
4290b215bc84SStefano Zampini   A->ops->coloringpatch     = NULL;
4291b215bc84SStefano Zampini   A->ops->multdiagonalblock = NULL;
4292ad540459SPierre Jolivet   if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace;
4293b215bc84SStefano Zampini   PetscFunctionReturn(0);
4294b215bc84SStefano Zampini }
4295b215bc84SStefano Zampini 
42964c1414c8SBarry Smith /*
42974c1414c8SBarry Smith     samestructure indicates that the matrix has not changed its nonzero structure so we
42984c1414c8SBarry Smith     do not need to recompute the inodes
42994c1414c8SBarry Smith */
4300d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCheckInode(Mat A)
4301d71ae5a4SJacob Faibussowitsch {
43024c1414c8SBarry Smith   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
43038758e1faSBarry Smith   PetscInt        i, j, m, nzx, nzy, *ns, node_count, blk_size;
4304ace3abfcSBarry Smith   PetscBool       flag;
43058758e1faSBarry Smith   const PetscInt *idx, *idy, *ii;
43064c1414c8SBarry Smith 
43074c1414c8SBarry Smith   PetscFunctionBegin;
4308b215bc84SStefano Zampini   if (!a->inode.use) {
43099566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
43109566063dSJacob Faibussowitsch     PetscCall(PetscFree(a->inode.size));
4311b215bc84SStefano Zampini     PetscFunctionReturn(0);
4312b215bc84SStefano Zampini   }
4313a02bda8eSBarry Smith   if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(0);
43144c1414c8SBarry Smith 
4315d0f46423SBarry Smith   m = A->rmap->n;
43169566063dSJacob Faibussowitsch   if (!a->inode.size) PetscCall(PetscMalloc1(m + 1, &a->inode.size));
4317b215bc84SStefano Zampini   ns = a->inode.size;
43184c1414c8SBarry Smith 
43194c1414c8SBarry Smith   i          = 0;
43204c1414c8SBarry Smith   node_count = 0;
43214c1414c8SBarry Smith   idx        = a->j;
43224c1414c8SBarry Smith   ii         = a->i;
43234c1414c8SBarry Smith   while (i < m) {            /* For each row */
43244c1414c8SBarry Smith     nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */
43254c1414c8SBarry Smith     /* Limits the number of elements in a node to 'a->inode.limit' */
43264c1414c8SBarry Smith     for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
43274c1414c8SBarry Smith       nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */
43284c1414c8SBarry Smith       if (nzy != nzx) break;
43294c1414c8SBarry Smith       idy += nzx; /* Same nonzero pattern */
43309566063dSJacob Faibussowitsch       PetscCall(PetscArraycmp(idx, idy, nzx, &flag));
43314c1414c8SBarry Smith       if (!flag) break;
43324c1414c8SBarry Smith     }
43334c1414c8SBarry Smith     ns[node_count++] = blk_size;
43344c1414c8SBarry Smith     idx += blk_size * nzx;
43354c1414c8SBarry Smith     i = j;
43364c1414c8SBarry Smith   }
43372cb58ee3SKarl Rupp 
43384c1414c8SBarry Smith   /* If not enough inodes found,, do not use inode version of the routines */
4339be6adb11SBarry Smith   if (!m || node_count > .8 * m) {
43409566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJ_Inode_ResetOps(A));
43419566063dSJacob Faibussowitsch     PetscCall(PetscFree(a->inode.size));
43429566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
43434c1414c8SBarry Smith   } else {
4344d5f3da31SBarry Smith     if (!A->factortype) {
4345375a6242SBarry Smith       A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4346375a6242SBarry Smith       if (A->rmap->n == A->cmap->n) {
43474108e4d5SBarry Smith         A->ops->getrowij        = MatGetRowIJ_SeqAIJ_Inode;
43484108e4d5SBarry Smith         A->ops->restorerowij    = MatRestoreRowIJ_SeqAIJ_Inode;
43494108e4d5SBarry Smith         A->ops->getcolumnij     = MatGetColumnIJ_SeqAIJ_Inode;
43504108e4d5SBarry Smith         A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
43514108e4d5SBarry Smith         A->ops->coloringpatch   = MatColoringPatch_SeqAIJ_Inode;
4352375a6242SBarry Smith       }
4353d3ac4fa3SBarry Smith     } else {
4354d3ac4fa3SBarry Smith       A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4355d3ac4fa3SBarry Smith     }
43564c1414c8SBarry Smith     a->inode.node_count = node_count;
43579566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
43584c1414c8SBarry Smith   }
4359be6adb11SBarry Smith   a->inode.checked          = PETSC_TRUE;
4360a02bda8eSBarry Smith   a->inode.mat_nonzerostate = A->nonzerostate;
43614c1414c8SBarry Smith   PetscFunctionReturn(0);
43624c1414c8SBarry Smith }
43634c1414c8SBarry Smith 
4364d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C)
4365d71ae5a4SJacob Faibussowitsch {
4366150f0143SBarry Smith   Mat         B = *C;
4367150f0143SBarry Smith   Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data;
4368150f0143SBarry Smith   PetscInt    m = A->rmap->n;
4369150f0143SBarry Smith 
4370150f0143SBarry Smith   PetscFunctionBegin;
4371150f0143SBarry Smith   c->inode.use              = a->inode.use;
4372150f0143SBarry Smith   c->inode.limit            = a->inode.limit;
4373150f0143SBarry Smith   c->inode.max_limit        = a->inode.max_limit;
4374ec710b6aSStefano Zampini   c->inode.checked          = PETSC_FALSE;
4375ec710b6aSStefano Zampini   c->inode.size             = NULL;
4376ec710b6aSStefano Zampini   c->inode.node_count       = 0;
4377ec710b6aSStefano Zampini   c->inode.ibdiagvalid      = PETSC_FALSE;
4378ec710b6aSStefano Zampini   c->inode.ibdiag           = NULL;
4379ec710b6aSStefano Zampini   c->inode.bdiag            = NULL;
4380ec710b6aSStefano Zampini   c->inode.mat_nonzerostate = -1;
4381b215bc84SStefano Zampini   if (a->inode.use) {
4382ec710b6aSStefano Zampini     if (a->inode.checked && a->inode.size) {
43839566063dSJacob Faibussowitsch       PetscCall(PetscMalloc1(m + 1, &c->inode.size));
43849566063dSJacob Faibussowitsch       PetscCall(PetscArraycpy(c->inode.size, a->inode.size, m + 1));
4385ec710b6aSStefano Zampini 
4386ec710b6aSStefano Zampini       c->inode.checked          = PETSC_TRUE;
4387ec710b6aSStefano Zampini       c->inode.node_count       = a->inode.node_count;
4388ec710b6aSStefano Zampini       c->inode.mat_nonzerostate = (*C)->nonzerostate;
4389ec710b6aSStefano Zampini     }
4390a02bda8eSBarry Smith     /* note the table of functions below should match that in MatSeqAIJCheckInode() */
43912c451681SBarry Smith     if (!B->factortype) {
43922c451681SBarry Smith       B->ops->getrowij          = MatGetRowIJ_SeqAIJ_Inode;
43932c451681SBarry Smith       B->ops->restorerowij      = MatRestoreRowIJ_SeqAIJ_Inode;
43942c451681SBarry Smith       B->ops->getcolumnij       = MatGetColumnIJ_SeqAIJ_Inode;
43952c451681SBarry Smith       B->ops->restorecolumnij   = MatRestoreColumnIJ_SeqAIJ_Inode;
43962c451681SBarry Smith       B->ops->coloringpatch     = MatColoringPatch_SeqAIJ_Inode;
43972c451681SBarry Smith       B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4398150f0143SBarry Smith     } else {
43992c451681SBarry Smith       B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4400150f0143SBarry Smith     }
4401150f0143SBarry Smith   }
4402150f0143SBarry Smith   PetscFunctionReturn(0);
4403150f0143SBarry Smith }
4404150f0143SBarry Smith 
4405d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row)
4406d71ae5a4SJacob Faibussowitsch {
44078758e1faSBarry Smith   PetscInt        k;
44088758e1faSBarry Smith   const PetscInt *vi;
44096e111a19SKarl Rupp 
441017454e89SShri Abhyankar   PetscFunctionBegin;
441117454e89SShri Abhyankar   vi = aj + ai[row];
441217454e89SShri Abhyankar   for (k = 0; k < nzl; k++) cols[k] = vi[k];
441317454e89SShri Abhyankar   vi        = aj + adiag[row];
441417454e89SShri Abhyankar   cols[nzl] = vi[0];
441517454e89SShri Abhyankar   vi        = aj + adiag[row + 1] + 1;
441617454e89SShri Abhyankar   for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k];
441717454e89SShri Abhyankar   PetscFunctionReturn(0);
441817454e89SShri Abhyankar }
44196936b636SHong Zhang /*
4420a02bda8eSBarry Smith    MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4421a02bda8eSBarry Smith    Modified from MatSeqAIJCheckInode().
44226936b636SHong Zhang 
44236936b636SHong Zhang    Input Parameters:
4424abb87a52SBarry Smith .  Mat A - ILU or LU matrix factor
4425abb87a52SBarry Smith 
44266936b636SHong Zhang */
4427d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4428d71ae5a4SJacob Faibussowitsch {
4429019b515eSShri Abhyankar   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
4430019b515eSShri Abhyankar   PetscInt        i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size;
44318758e1faSBarry Smith   PetscInt       *cols1, *cols2, *ns;
44328758e1faSBarry Smith   const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag;
4433ace3abfcSBarry Smith   PetscBool       flag;
4434019b515eSShri Abhyankar 
4435019b515eSShri Abhyankar   PetscFunctionBegin;
4436019b515eSShri Abhyankar   if (!a->inode.use) PetscFunctionReturn(0);
4437abb87a52SBarry Smith   if (a->inode.checked) PetscFunctionReturn(0);
4438019b515eSShri Abhyankar 
4439019b515eSShri Abhyankar   m = A->rmap->n;
44402205254eSKarl Rupp   if (a->inode.size) ns = a->inode.size;
444148a46eb9SPierre Jolivet   else PetscCall(PetscMalloc1(m + 1, &ns));
4442019b515eSShri Abhyankar 
4443019b515eSShri Abhyankar   i          = 0;
4444019b515eSShri Abhyankar   node_count = 0;
44459566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(m, &cols1, m, &cols2));
4446019b515eSShri Abhyankar   while (i < m) {                       /* For each row */
4447019b515eSShri Abhyankar     nzl1 = ai[i + 1] - ai[i];           /* Number of nonzeros in L */
4448019b515eSShri Abhyankar     nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/
4449019b515eSShri Abhyankar     nzx  = nzl1 + nzu1 + 1;
4450019b515eSShri Abhyankar     MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i);
4451019b515eSShri Abhyankar 
4452019b515eSShri Abhyankar     /* Limits the number of elements in a node to 'a->inode.limit' */
4453019b515eSShri Abhyankar     for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4454019b515eSShri Abhyankar       nzl2 = ai[j + 1] - ai[j];
4455019b515eSShri Abhyankar       nzu2 = adiag[j] - adiag[j + 1] - 1;
4456019b515eSShri Abhyankar       nzy  = nzl2 + nzu2 + 1;
4457019b515eSShri Abhyankar       if (nzy != nzx) break;
44589566063dSJacob Faibussowitsch       PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j));
44599566063dSJacob Faibussowitsch       PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag));
44608758e1faSBarry Smith       if (!flag) break;
4461019b515eSShri Abhyankar     }
4462019b515eSShri Abhyankar     ns[node_count++] = blk_size;
4463019b515eSShri Abhyankar     i                = j;
4464019b515eSShri Abhyankar   }
44659566063dSJacob Faibussowitsch   PetscCall(PetscFree2(cols1, cols2));
4466019b515eSShri Abhyankar   /* If not enough inodes found,, do not use inode version of the routines */
4467be6adb11SBarry Smith   if (!m || node_count > .8 * m) {
44689566063dSJacob Faibussowitsch     PetscCall(PetscFree(ns));
44692205254eSKarl Rupp 
4470019b515eSShri Abhyankar     a->inode.node_count = 0;
44710298fd71SBarry Smith     a->inode.size       = NULL;
4472019b515eSShri Abhyankar     a->inode.use        = PETSC_FALSE;
44732205254eSKarl Rupp 
44749566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4475019b515eSShri Abhyankar   } else {
4476f4259b30SLisandro Dalcin     A->ops->mult              = NULL;
4477f4259b30SLisandro Dalcin     A->ops->sor               = NULL;
4478f4259b30SLisandro Dalcin     A->ops->multadd           = NULL;
4479f4259b30SLisandro Dalcin     A->ops->getrowij          = NULL;
4480f4259b30SLisandro Dalcin     A->ops->restorerowij      = NULL;
4481f4259b30SLisandro Dalcin     A->ops->getcolumnij       = NULL;
4482f4259b30SLisandro Dalcin     A->ops->restorecolumnij   = NULL;
4483f4259b30SLisandro Dalcin     A->ops->coloringpatch     = NULL;
4484f4259b30SLisandro Dalcin     A->ops->multdiagonalblock = NULL;
4485019b515eSShri Abhyankar     a->inode.node_count       = node_count;
4486019b515eSShri Abhyankar     a->inode.size             = ns;
44872205254eSKarl Rupp 
44889566063dSJacob Faibussowitsch     PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4489019b515eSShri Abhyankar   }
4490be6adb11SBarry Smith   a->inode.checked = PETSC_TRUE;
4491019b515eSShri Abhyankar   PetscFunctionReturn(0);
4492019b515eSShri Abhyankar }
4493019b515eSShri Abhyankar 
4494d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A)
4495d71ae5a4SJacob Faibussowitsch {
4496acf2f550SJed Brown   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4497acf2f550SJed Brown 
4498acf2f550SJed Brown   PetscFunctionBegin;
4499acf2f550SJed Brown   a->inode.ibdiagvalid = PETSC_FALSE;
4500acf2f550SJed Brown   PetscFunctionReturn(0);
4501acf2f550SJed Brown }
4502acf2f550SJed Brown 
45034c1414c8SBarry Smith /*
45044c1414c8SBarry Smith      This is really ugly. if inodes are used this replaces the
45054c1414c8SBarry Smith   permutations with ones that correspond to rows/cols of the matrix
45064c1414c8SBarry Smith   rather then inode blocks
45074c1414c8SBarry Smith */
4508d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm)
4509d71ae5a4SJacob Faibussowitsch {
45104c1414c8SBarry Smith   PetscFunctionBegin;
4511cac4c232SBarry Smith   PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm));
45124c1414c8SBarry Smith   PetscFunctionReturn(0);
45134c1414c8SBarry Smith }
45144c1414c8SBarry Smith 
4515d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm)
4516d71ae5a4SJacob Faibussowitsch {
45174c1414c8SBarry Smith   Mat_SeqAIJ     *a = (Mat_SeqAIJ *)A->data;
45185d0c19d7SBarry Smith   PetscInt        m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count;
45195d0c19d7SBarry Smith   const PetscInt *ridx, *cidx;
45204c1414c8SBarry Smith   PetscInt        row, col, *permr, *permc, *ns_row = a->inode.size, *tns, start_val, end_val, indx;
45214c1414c8SBarry Smith   PetscInt        nslim_col, *ns_col;
45224c1414c8SBarry Smith   IS              ris = *rperm, cis = *cperm;
45234c1414c8SBarry Smith 
45244c1414c8SBarry Smith   PetscFunctionBegin;
45254c1414c8SBarry Smith   if (!a->inode.size) PetscFunctionReturn(0);           /* no inodes so return */
45264c1414c8SBarry Smith   if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */
45274c1414c8SBarry Smith 
45289566063dSJacob Faibussowitsch   PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
45299566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(((nslim_row > nslim_col) ? nslim_row : nslim_col) + 1, &tns));
45309566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(m, &permr, n, &permc));
45314c1414c8SBarry Smith 
45329566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(ris, &ridx));
45339566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(cis, &cidx));
45344c1414c8SBarry Smith 
45354c1414c8SBarry Smith   /* Form the inode structure for the rows of permuted matric using inv perm*/
45364c1414c8SBarry Smith   for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + ns_row[i];
45374c1414c8SBarry Smith 
45384c1414c8SBarry Smith   /* Construct the permutations for rows*/
45394c1414c8SBarry Smith   for (i = 0, row = 0; i < nslim_row; ++i) {
45404c1414c8SBarry Smith     indx      = ridx[i];
45414c1414c8SBarry Smith     start_val = tns[indx];
45424c1414c8SBarry Smith     end_val   = tns[indx + 1];
45434c1414c8SBarry Smith     for (j = start_val; j < end_val; ++j, ++row) permr[row] = j;
45444c1414c8SBarry Smith   }
45454c1414c8SBarry Smith 
45464c1414c8SBarry Smith   /* Form the inode structure for the columns of permuted matrix using inv perm*/
45474c1414c8SBarry Smith   for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + ns_col[i];
45484c1414c8SBarry Smith 
45494c1414c8SBarry Smith   /* Construct permutations for columns */
45504c1414c8SBarry Smith   for (i = 0, col = 0; i < nslim_col; ++i) {
45514c1414c8SBarry Smith     indx      = cidx[i];
45524c1414c8SBarry Smith     start_val = tns[indx];
45534c1414c8SBarry Smith     end_val   = tns[indx + 1];
45544c1414c8SBarry Smith     for (j = start_val; j < end_val; ++j, ++col) permc[col] = j;
45554c1414c8SBarry Smith   }
45564c1414c8SBarry Smith 
45579566063dSJacob Faibussowitsch   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm));
45589566063dSJacob Faibussowitsch   PetscCall(ISSetPermutation(*rperm));
45599566063dSJacob Faibussowitsch   PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm));
45609566063dSJacob Faibussowitsch   PetscCall(ISSetPermutation(*cperm));
45614c1414c8SBarry Smith 
45629566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(ris, &ridx));
45639566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(cis, &cidx));
45644c1414c8SBarry Smith 
45659566063dSJacob Faibussowitsch   PetscCall(PetscFree(ns_col));
45669566063dSJacob Faibussowitsch   PetscCall(PetscFree2(permr, permc));
45679566063dSJacob Faibussowitsch   PetscCall(ISDestroy(&cis));
45689566063dSJacob Faibussowitsch   PetscCall(ISDestroy(&ris));
45699566063dSJacob Faibussowitsch   PetscCall(PetscFree(tns));
45704c1414c8SBarry Smith   PetscFunctionReturn(0);
45714c1414c8SBarry Smith }
45724c1414c8SBarry Smith 
45734c1414c8SBarry Smith /*@C
457411a5261eSBarry Smith    MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes
45754c1414c8SBarry Smith 
45763f9fe445SBarry Smith    Not Collective
45774c1414c8SBarry Smith 
45784c1414c8SBarry Smith    Input Parameter:
457911a5261eSBarry Smith .  A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ`
45804c1414c8SBarry Smith 
4581d8d19677SJose E. Roman    Output Parameters:
45824c1414c8SBarry Smith +  node_count - no of inodes present in the matrix.
45834c1414c8SBarry Smith .  sizes      - an array of size node_count,with sizes of each inode.
45844c1414c8SBarry Smith -  limit      - the max size used to generate the inodes.
45854c1414c8SBarry Smith 
45864c1414c8SBarry Smith    Level: advanced
45874c1414c8SBarry Smith 
458811a5261eSBarry Smith    Note:
458995452b02SPatrick Sanan     This routine returns some internal storage information
45904c1414c8SBarry Smith    of the matrix, it is intended to be used by advanced users.
45914c1414c8SBarry Smith    It should be called after the matrix is assembled.
45924c1414c8SBarry Smith    The contents of the sizes[] array should not be changed.
45930298fd71SBarry Smith    NULL may be passed for information not requested.
45944c1414c8SBarry Smith 
4595db781477SPatrick Sanan .seealso: `MatGetInfo()`
45964c1414c8SBarry Smith @*/
4597d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4598d71ae5a4SJacob Faibussowitsch {
45995f80ce2aSJacob Faibussowitsch   PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *);
46004c1414c8SBarry Smith 
46014c1414c8SBarry Smith   PetscFunctionBegin;
46025f80ce2aSJacob Faibussowitsch   PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
46039566063dSJacob Faibussowitsch   PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f));
46049566063dSJacob Faibussowitsch   if (f) PetscCall((*f)(A, node_count, sizes, limit));
46054c1414c8SBarry Smith   PetscFunctionReturn(0);
46064c1414c8SBarry Smith }
46074c1414c8SBarry Smith 
4608d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4609d71ae5a4SJacob Faibussowitsch {
46104c1414c8SBarry Smith   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
46114c1414c8SBarry Smith 
46124c1414c8SBarry Smith   PetscFunctionBegin;
46134c1414c8SBarry Smith   if (node_count) *node_count = a->inode.node_count;
46144c1414c8SBarry Smith   if (sizes) *sizes = a->inode.size;
46154c1414c8SBarry Smith   if (limit) *limit = a->inode.limit;
46164c1414c8SBarry Smith   PetscFunctionReturn(0);
46174c1414c8SBarry Smith }
4618