12593348eSBarry Smith 235aab85fSBarry Smith #if !defined(__BAIJ_H) 335aab85fSBarry Smith #define __BAIJ_H 47c4f633dSBarry Smith #include "private/matimpl.h" 57c4f633dSBarry Smith #include "../src/mat/impls/aij/seq/aij.h" 6a4005a5dSBarry Smith #include "../src/mat/impls/baij/seq/ftn-kernels/fsolvebaij.h" 72593348eSBarry Smith 82593348eSBarry Smith /* 935aab85fSBarry Smith MATSEQBAIJ format - Block compressed row storage. The i[] and j[] 10e24b481bSBarry Smith arrays start at 0. 112593348eSBarry Smith */ 122593348eSBarry Smith 136c6c5352SBarry Smith /* This header is shared by the SeqSBAIJ matrix */ 146c6c5352SBarry Smith #define SEQBAIJHEADER \ 15521d7252SBarry Smith PetscInt bs2; /* square of block size */ \ 16690b6cddSBarry Smith PetscInt mbs,nbs; /* rows/bs, columns/bs */ \ 176c6c5352SBarry Smith PetscScalar *mult_work; /* work array for matrix vector product*/ \ 18dd6ea824SBarry Smith MatScalar *saved_values; \ 196c6c5352SBarry Smith \ 206ad2eaddSHong Zhang Mat sbaijMat; /* mat in sbaij format */ \ 216c6c5352SBarry Smith \ 226c6c5352SBarry Smith PetscTruth pivotinblocks; /* pivot inside factorization of each diagonal block */ \ 236c6c5352SBarry Smith \ 24dd6ea824SBarry Smith MatScalar *idiag; /* inverse of block diagonal */ \ 25e6b907acSBarry Smith PetscTruth idiagvalid /* if above has correct/current values */ 26e6b907acSBarry Smith 276c6c5352SBarry Smith 286c6c5352SBarry Smith typedef struct { 29dd6ea824SBarry Smith SEQAIJHEADER(MatScalar); 30e6b907acSBarry Smith SEQBAIJHEADER; 3135aab85fSBarry Smith } Mat_SeqBAIJ; 322593348eSBarry Smith 33f349cc81SSatish Balay EXTERN_C_BEGIN 34ab93d7beSBarry Smith EXTERN PetscErrorCode MatSeqBAIJSetPreallocation_SeqBAIJ(Mat,PetscInt,PetscInt,PetscInt*); 35f349cc81SSatish Balay EXTERN_C_END 360481f469SBarry Smith EXTERN PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat,Mat,IS,IS,const MatFactorInfo*); 370481f469SBarry Smith EXTERN PetscErrorCode MatICCFactorSymbolic_SeqBAIJ(Mat,Mat,IS,const MatFactorInfo*); 380481f469SBarry Smith EXTERN PetscErrorCode MatCholeskyFactorSymbolic_SeqBAIJ(Mat,Mat,IS,const MatFactorInfo*); 390481f469SBarry Smith EXTERN PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N(Mat,Mat,const MatFactorInfo*); 400481f469SBarry Smith EXTERN PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering(Mat,Mat,const MatFactorInfo*); 41dfbe8321SBarry Smith EXTERN PetscErrorCode MatDuplicate_SeqBAIJ(Mat,MatDuplicateOption,Mat*); 422af78befSBarry Smith EXTERN PetscErrorCode MatMissingDiagonal_SeqBAIJ(Mat,PetscTruth*,PetscInt*); 43dfbe8321SBarry Smith EXTERN PetscErrorCode MatMarkDiagonal_SeqBAIJ(Mat); 44c8342467SHong Zhang EXTERN PetscErrorCode MatILUDTFactor_SeqBAIJ(Mat,IS,IS,const MatFactorInfo*,Mat*); 45be3590efSBarry Smith 460481f469SBarry Smith EXTERN PetscErrorCode MatLUFactorSymbolic_SeqBAIJ(Mat,Mat,IS,IS,const MatFactorInfo*); 470481f469SBarry Smith EXTERN PetscErrorCode MatLUFactor_SeqBAIJ(Mat,IS,IS,const MatFactorInfo*); 48690b6cddSBarry Smith EXTERN PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat,PetscInt,IS*,PetscInt); 494aa3045dSJed Brown EXTERN PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat,IS,IS,MatReuse,Mat*); 50690b6cddSBarry Smith EXTERN PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat,PetscInt,const IS[],const IS[],MatReuse,Mat*[]); 51dfbe8321SBarry Smith EXTERN PetscErrorCode MatMultTranspose_SeqBAIJ(Mat,Vec,Vec); 52dfbe8321SBarry Smith EXTERN PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat,Vec,Vec,Vec); 53f4df32b1SMatthew Knepley EXTERN PetscErrorCode MatScale_SeqBAIJ(Mat,PetscScalar); 54dfbe8321SBarry Smith EXTERN PetscErrorCode MatNorm_SeqBAIJ(Mat,NormType,PetscReal *); 55dfbe8321SBarry Smith EXTERN PetscErrorCode MatEqual_SeqBAIJ(Mat,Mat,PetscTruth*); 56dfbe8321SBarry Smith EXTERN PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat,Vec); 57dfbe8321SBarry Smith EXTERN PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat,Vec,Vec); 58dfbe8321SBarry Smith EXTERN PetscErrorCode MatGetInfo_SeqBAIJ(Mat,MatInfoType,MatInfo *); 59dfbe8321SBarry Smith EXTERN PetscErrorCode MatZeroEntries_SeqBAIJ(Mat); 6054138f6bSKris Buschelman 61dfbe8321SBarry Smith EXTERN PetscErrorCode MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat); 6254138f6bSKris Buschelman 63dfbe8321SBarry Smith EXTERN PetscErrorCode MatSolve_SeqBAIJ_1(Mat,Vec,Vec); 64dfbe8321SBarry Smith EXTERN PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat,Vec,Vec); 65dfbe8321SBarry Smith EXTERN PetscErrorCode MatSolve_SeqBAIJ_2(Mat,Vec,Vec); 668bfb2e2bSShri Abhyankar EXTERN PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat,Vec,Vec); 67dfbe8321SBarry Smith EXTERN PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat,Vec,Vec); 688bfb2e2bSShri Abhyankar EXTERN PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat,Vec,Vec); 698bfb2e2bSShri Abhyankar EXTERN PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat,Vec,Vec); 70dfbe8321SBarry Smith EXTERN PetscErrorCode MatSolve_SeqBAIJ_3(Mat,Vec,Vec); 718bfb2e2bSShri Abhyankar EXTERN PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat,Vec,Vec); 72dfbe8321SBarry Smith EXTERN PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat,Vec,Vec); 738bfb2e2bSShri Abhyankar EXTERN PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat,Vec,Vec); 74dfbe8321SBarry Smith EXTERN PetscErrorCode MatSolve_SeqBAIJ_4(Mat,Vec,Vec); 758bfb2e2bSShri Abhyankar EXTERN PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat,Vec,Vec); 76dfbe8321SBarry Smith EXTERN PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat,Vec,Vec); 778bfb2e2bSShri Abhyankar EXTERN PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat,Vec,Vec); 7854138f6bSKris Buschelman #if defined(PETSC_HAVE_SSE) 79dfbe8321SBarry Smith EXTERN PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat,Vec,Vec); 80dfbe8321SBarry Smith EXTERN PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat,Vec,Vec); 81dfbe8321SBarry Smith EXTERN PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat,Vec,Vec); 824110d75bSKris Buschelman #endif 83dfbe8321SBarry Smith EXTERN PetscErrorCode MatSolve_SeqBAIJ_5(Mat,Vec,Vec); 848bfb2e2bSShri Abhyankar EXTERN PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat,Vec,Vec); 85dfbe8321SBarry Smith EXTERN PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat,Vec,Vec); 868bfb2e2bSShri Abhyankar EXTERN PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat,Vec,Vec); 87dfbe8321SBarry Smith EXTERN PetscErrorCode MatSolve_SeqBAIJ_6(Mat,Vec,Vec); 888bfb2e2bSShri Abhyankar EXTERN PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat,Vec,Vec); 89dfbe8321SBarry Smith EXTERN PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat,Vec,Vec); 908bfb2e2bSShri Abhyankar EXTERN PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat,Vec,Vec); 91dfbe8321SBarry Smith EXTERN PetscErrorCode MatSolve_SeqBAIJ_7(Mat,Vec,Vec); 928bfb2e2bSShri Abhyankar EXTERN PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat,Vec,Vec); 93dfbe8321SBarry Smith EXTERN PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat,Vec,Vec); 948bfb2e2bSShri Abhyankar EXTERN PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat,Vec,Vec); 95dfbe8321SBarry Smith EXTERN PetscErrorCode MatSolve_SeqBAIJ_N(Mat,Vec,Vec); 968bfb2e2bSShri Abhyankar EXTERN PetscErrorCode MatSolve_SeqBAIJ_N_newdatastruct(Mat,Vec,Vec); 978bfb2e2bSShri Abhyankar EXTERN PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct(Mat,Vec,Vec); 98dfbe8321SBarry Smith EXTERN PetscErrorCode MatSolveTranspose_SeqBAIJ_Update(Mat,Vec,Vec); 99dfbe8321SBarry Smith EXTERN PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat,Vec,Vec); 100dfbe8321SBarry Smith EXTERN PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat,Vec,Vec); 101dfbe8321SBarry Smith EXTERN PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat,Vec,Vec); 102dfbe8321SBarry Smith EXTERN PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat,Vec,Vec); 103dfbe8321SBarry Smith EXTERN PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat,Vec,Vec); 104dfbe8321SBarry Smith EXTERN PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat,Vec,Vec); 105dfbe8321SBarry Smith EXTERN PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat,Vec,Vec); 106dfbe8321SBarry Smith EXTERN PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat,Vec,Vec); 107dfbe8321SBarry Smith EXTERN PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat,Vec,Vec); 108dfbe8321SBarry Smith EXTERN PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat,Vec,Vec); 109dfbe8321SBarry Smith EXTERN PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat,Vec,Vec); 110dfbe8321SBarry Smith EXTERN PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat,Vec,Vec); 111dfbe8321SBarry Smith EXTERN PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat,Vec,Vec); 112dfbe8321SBarry Smith EXTERN PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat,Vec,Vec); 11354138f6bSKris Buschelman 114faca2338SShri Abhyankar EXTERN PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat,Mat,const MatFactorInfo*); 115faca2338SShri Abhyankar 1160481f469SBarry Smith EXTERN PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1(Mat,Mat,const MatFactorInfo*); 1170481f469SBarry Smith EXTERN PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2(Mat,Mat,const MatFactorInfo*); 118b588c5a2SHong Zhang EXTERN PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_newdatastruct(Mat,Mat,const MatFactorInfo*); 1190481f469SBarry Smith EXTERN PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat,Mat,const MatFactorInfo*); 1204c000e2eSHong Zhang EXTERN PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat,Mat,const MatFactorInfo*); 1214c000e2eSHong Zhang 1220481f469SBarry Smith EXTERN PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3(Mat,Mat,const MatFactorInfo*); 12317542729SShri Abhyankar EXTERN PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3_newdatastruct(Mat,Mat,const MatFactorInfo*); 1240481f469SBarry Smith EXTERN PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering(Mat,Mat,const MatFactorInfo*); 12517542729SShri Abhyankar EXTERN PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat,Mat,const MatFactorInfo*); 1260481f469SBarry Smith EXTERN PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat,Mat,const MatFactorInfo*); 127209027a4SShri Abhyankar EXTERN PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_newdatastruct(Mat,Mat,const MatFactorInfo*); 1280481f469SBarry Smith EXTERN PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat,Mat,const MatFactorInfo*); 129209027a4SShri Abhyankar EXTERN PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat,Mat,const MatFactorInfo*); 13054138f6bSKris Buschelman #if defined(PETSC_HAVE_SSE) 1310481f469SBarry Smith EXTERN PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat,Mat,const MatFactorInfo*); 1320481f469SBarry Smith EXTERN PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat,Mat,const MatFactorInfo*); 13354138f6bSKris Buschelman #else 13454138f6bSKris Buschelman #endif 1350481f469SBarry Smith EXTERN PetscErrorCode MatLUFactorNumeric_SeqBAIJ_5(Mat,Mat,const MatFactorInfo*); 136*0deeaf61SShri Abhyankar EXTERN PetscErrorCode MatLUFactorNumeric_SeqBAIJ_5_newdatastruct(Mat,Mat,const MatFactorInfo*); 1370481f469SBarry Smith EXTERN PetscErrorCode MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering(Mat,Mat,const MatFactorInfo*); 138*0deeaf61SShri Abhyankar EXTERN PetscErrorCode MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat,Mat,const MatFactorInfo*); 1390481f469SBarry Smith EXTERN PetscErrorCode MatLUFactorNumeric_SeqBAIJ_6(Mat,Mat,const MatFactorInfo*); 1400481f469SBarry Smith EXTERN PetscErrorCode MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering(Mat,Mat,const MatFactorInfo*); 1410481f469SBarry Smith EXTERN PetscErrorCode MatLUFactorNumeric_SeqBAIJ_7(Mat,Mat,const MatFactorInfo*); 1420481f469SBarry Smith EXTERN PetscErrorCode MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering(Mat,Mat,const MatFactorInfo*); 1430481f469SBarry Smith EXTERN PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat,Mat,const MatFactorInfo*); 14454138f6bSKris Buschelman 145dfbe8321SBarry Smith EXTERN PetscErrorCode MatMult_SeqBAIJ_1(Mat,Vec,Vec); 146dfbe8321SBarry Smith EXTERN PetscErrorCode MatMult_SeqBAIJ_2(Mat,Vec,Vec); 147dfbe8321SBarry Smith EXTERN PetscErrorCode MatMult_SeqBAIJ_3(Mat,Vec,Vec); 148dfbe8321SBarry Smith EXTERN PetscErrorCode MatMult_SeqBAIJ_4(Mat,Vec,Vec); 149dfbe8321SBarry Smith EXTERN PetscErrorCode MatMult_SeqBAIJ_5(Mat,Vec,Vec); 150dfbe8321SBarry Smith EXTERN PetscErrorCode MatMult_SeqBAIJ_6(Mat,Vec,Vec); 151dfbe8321SBarry Smith EXTERN PetscErrorCode MatMult_SeqBAIJ_7(Mat,Vec,Vec); 152dfbe8321SBarry Smith EXTERN PetscErrorCode MatMult_SeqBAIJ_N(Mat,Vec,Vec); 15354138f6bSKris Buschelman 154dfbe8321SBarry Smith EXTERN PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat,Vec,Vec,Vec); 155dfbe8321SBarry Smith EXTERN PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat,Vec,Vec,Vec); 156dfbe8321SBarry Smith EXTERN PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat,Vec,Vec,Vec); 157dfbe8321SBarry Smith EXTERN PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat,Vec,Vec,Vec); 158dfbe8321SBarry Smith EXTERN PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat,Vec,Vec,Vec); 159dfbe8321SBarry Smith EXTERN PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat,Vec,Vec,Vec); 160dfbe8321SBarry Smith EXTERN PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat,Vec,Vec,Vec); 161dfbe8321SBarry Smith EXTERN PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat,Vec,Vec,Vec); 162a313700dSBarry Smith EXTERN PetscErrorCode MatLoad_SeqBAIJ(PetscViewer, const MatType,Mat*); 1632593348eSBarry Smith 1644c000e2eSHong Zhang /* 1654c000e2eSHong Zhang Kernel_A_gets_A_times_B_2: A = A * B with size bs=2 1664c000e2eSHong Zhang 1674c000e2eSHong Zhang Input Parameters: 1684c000e2eSHong Zhang + A,B - square bs by bs arrays stored in column major order 1694c000e2eSHong Zhang - W - bs*bs work arrary 1704c000e2eSHong Zhang 1714c000e2eSHong Zhang Output Parameter: 1724c000e2eSHong Zhang . A = A * B 1734c000e2eSHong Zhang */ 1744c000e2eSHong Zhang 1754c000e2eSHong Zhang #define Kernel_A_gets_A_times_B_2(A,B,W) 0;\ 1764c000e2eSHong Zhang {\ 1774c000e2eSHong Zhang PetscMemcpy(W,A,4*sizeof(MatScalar));\ 1784c000e2eSHong Zhang A[0] = W[0]*B[0] + W[2]*B[1];\ 1794c000e2eSHong Zhang A[1] = W[1]*B[0] + W[3]*B[1];\ 1804c000e2eSHong Zhang A[2] = W[0]*B[2] + W[2]*B[3];\ 1814c000e2eSHong Zhang A[3] = W[1]*B[2] + W[3]*B[3];\ 1824c000e2eSHong Zhang } 1834c000e2eSHong Zhang 1844c000e2eSHong Zhang /* 1854c000e2eSHong Zhang Kernel_A_gets_A_minus_B_times_C_2: A = A - B * C with size bs=2 1864c000e2eSHong Zhang 1874c000e2eSHong Zhang Input Parameters: 1884c000e2eSHong Zhang + A,B,C - square bs by bs arrays stored in column major order 1894c000e2eSHong Zhang 1904c000e2eSHong Zhang Output Parameter: 1914c000e2eSHong Zhang . A = A - B*C 1924c000e2eSHong Zhang */ 1934c000e2eSHong Zhang 1944c000e2eSHong Zhang #define Kernel_A_gets_A_minus_B_times_C_2(A,B,C) 0;\ 1954c000e2eSHong Zhang {\ 1964c000e2eSHong Zhang A[0] -= B[0]*C[0] + B[2]*C[1];\ 1974c000e2eSHong Zhang A[1] -= B[1]*C[0] + B[3]*C[1];\ 1984c000e2eSHong Zhang A[2] -= B[0]*C[2] + B[2]*C[3];\ 1994c000e2eSHong Zhang A[3] -= B[1]*C[2] + B[3]*C[3];\ 2004c000e2eSHong Zhang } 2014c000e2eSHong Zhang 20217542729SShri Abhyankar /* 20317542729SShri Abhyankar Kernel_A_gets_A_times_B_3: A = A * B with size bs=3 20417542729SShri Abhyankar 20517542729SShri Abhyankar Input Parameters: 20617542729SShri Abhyankar + A,B - square bs by bs arrays stored in column major order 20717542729SShri Abhyankar - W - bs*bs work arrary 20817542729SShri Abhyankar 20917542729SShri Abhyankar Output Parameter: 21017542729SShri Abhyankar . A = A * B 21117542729SShri Abhyankar */ 21217542729SShri Abhyankar 21317542729SShri Abhyankar #define Kernel_A_gets_A_times_B_3(A,B,W) 0;\ 21417542729SShri Abhyankar {\ 21517542729SShri Abhyankar PetscMemcpy(W,A,9*sizeof(MatScalar));\ 21617542729SShri Abhyankar A[0] = W[0]*B[0] + W[3]*B[1] + W[6]*B[2];\ 21717542729SShri Abhyankar A[1] = W[1]*B[0] + W[4]*B[1] + W[7]*B[2];\ 21817542729SShri Abhyankar A[2] = W[2]*B[0] + W[5]*B[1] + W[8]*B[2];\ 21917542729SShri Abhyankar A[3] = W[0]*B[3] + W[3]*B[4] + W[6]*B[5];\ 22017542729SShri Abhyankar A[4] = W[1]*B[3] + W[4]*B[4] + W[7]*B[5];\ 22117542729SShri Abhyankar A[5] = W[2]*B[3] + W[5]*B[4] + W[8]*B[5];\ 22217542729SShri Abhyankar A[6] = W[0]*B[6] + W[3]*B[7] + W[6]*B[8];\ 22317542729SShri Abhyankar A[7] = W[1]*B[6] + W[4]*B[7] + W[7]*B[8];\ 22417542729SShri Abhyankar A[8] = W[2]*B[6] + W[5]*B[7] + W[8]*B[8];\ 22517542729SShri Abhyankar } 22617542729SShri Abhyankar 22717542729SShri Abhyankar /* 22817542729SShri Abhyankar Kernel_A_gets_A_minus_B_times_C_3: A = A - B * C with size bs=3 22917542729SShri Abhyankar 23017542729SShri Abhyankar Input Parameters: 23117542729SShri Abhyankar + A,B,C - square bs by bs arrays stored in column major order 23217542729SShri Abhyankar 23317542729SShri Abhyankar Output Parameter: 23417542729SShri Abhyankar . A = A - B*C 23517542729SShri Abhyankar */ 23617542729SShri Abhyankar 23717542729SShri Abhyankar #define Kernel_A_gets_A_minus_B_times_C_3(A,B,C) 0;\ 23817542729SShri Abhyankar {\ 23917542729SShri Abhyankar A[0] -= B[0]*C[0] + B[3]*C[1] + B[6]*C[2];\ 24017542729SShri Abhyankar A[1] -= B[1]*C[0] + B[4]*C[1] + B[7]*C[2];\ 24117542729SShri Abhyankar A[2] -= B[2]*C[0] + B[5]*C[1] + B[8]*C[2];\ 24217542729SShri Abhyankar A[3] -= B[0]*C[3] + B[3]*C[4] + B[6]*C[5];\ 24317542729SShri Abhyankar A[4] -= B[1]*C[3] + B[4]*C[4] + B[7]*C[5];\ 24417542729SShri Abhyankar A[5] -= B[2]*C[3] + B[5]*C[4] + B[8]*C[5];\ 24517542729SShri Abhyankar A[6] -= B[0]*C[6] + B[3]*C[7] + B[6]*C[8];\ 24617542729SShri Abhyankar A[7] -= B[1]*C[6] + B[4]*C[7] + B[7]*C[8];\ 24717542729SShri Abhyankar A[8] -= B[2]*C[6] + B[5]*C[7] + B[8]*C[8];\ 24817542729SShri Abhyankar } 24917542729SShri Abhyankar 250209027a4SShri Abhyankar /* 251209027a4SShri Abhyankar Kernel_A_gets_A_times_B_4: A = A * B with size bs=4 252209027a4SShri Abhyankar 253209027a4SShri Abhyankar Input Parameters: 254209027a4SShri Abhyankar + A,B - square bs by bs arrays stored in column major order 255209027a4SShri Abhyankar - W - bs*bs work arrary 256209027a4SShri Abhyankar 257209027a4SShri Abhyankar Output Parameter: 258209027a4SShri Abhyankar . A = A * B 259209027a4SShri Abhyankar */ 260209027a4SShri Abhyankar 261209027a4SShri Abhyankar #define Kernel_A_gets_A_times_B_4(A,B,W) 0;\ 262209027a4SShri Abhyankar {\ 263209027a4SShri Abhyankar PetscMemcpy(W,A,16*sizeof(MatScalar));\ 264209027a4SShri Abhyankar A[0] = W[0]*B[0] + W[4]*B[1] + W[8]*B[2] + W[12]*B[3];\ 265209027a4SShri Abhyankar A[1] = W[1]*B[0] + W[5]*B[1] + W[9]*B[2] + W[13]*B[3];\ 266209027a4SShri Abhyankar A[2] = W[2]*B[0] + W[6]*B[1] + W[10]*B[2] + W[14]*B[3];\ 267209027a4SShri Abhyankar A[3] = W[3]*B[0] + W[7]*B[1] + W[11]*B[2] + W[15]*B[3];\ 268209027a4SShri Abhyankar A[4] = W[0]*B[4] + W[4]*B[5] + W[8]*B[6] + W[12]*B[7];\ 269209027a4SShri Abhyankar A[5] = W[1]*B[4] + W[5]*B[5] + W[9]*B[6] + W[13]*B[7];\ 270209027a4SShri Abhyankar A[6] = W[2]*B[4] + W[6]*B[5] + W[10]*B[6] + W[14]*B[7];\ 271209027a4SShri Abhyankar A[7] = W[3]*B[4] + W[7]*B[5] + W[11]*B[6] + W[15]*B[7];\ 272209027a4SShri Abhyankar A[8] = W[0]*B[8] + W[4]*B[9] + W[8]*B[10] + W[12]*B[11];\ 273209027a4SShri Abhyankar A[9] = W[1]*B[8] + W[5]*B[9] + W[9]*B[10] + W[13]*B[11];\ 274209027a4SShri Abhyankar A[10] = W[2]*B[8] + W[6]*B[9] + W[10]*B[10] + W[14]*B[11];\ 275209027a4SShri Abhyankar A[11] = W[3]*B[8] + W[7]*B[9] + W[11]*B[10] + W[15]*B[11];\ 276209027a4SShri Abhyankar A[12] = W[0]*B[12] + W[4]*B[13] + W[8]*B[14] + W[12]*B[15];\ 277209027a4SShri Abhyankar A[13] = W[1]*B[12] + W[5]*B[13] + W[9]*B[14] + W[13]*B[15];\ 278209027a4SShri Abhyankar A[14] = W[2]*B[12] + W[6]*B[13] + W[10]*B[14] + W[14]*B[15];\ 279209027a4SShri Abhyankar A[15] = W[3]*B[12] + W[7]*B[13] + W[11]*B[14] + W[15]*B[15];\ 280209027a4SShri Abhyankar } 281209027a4SShri Abhyankar 282209027a4SShri Abhyankar /* 283209027a4SShri Abhyankar Kernel_A_gets_A_minus_B_times_C_4: A = A - B * C with size bs=4 284209027a4SShri Abhyankar 285209027a4SShri Abhyankar Input Parameters: 286209027a4SShri Abhyankar + A,B,C - square bs by bs arrays stored in column major order 287209027a4SShri Abhyankar 288209027a4SShri Abhyankar Output Parameter: 289209027a4SShri Abhyankar . A = A - B*C 290209027a4SShri Abhyankar */ 291209027a4SShri Abhyankar 292209027a4SShri Abhyankar #define Kernel_A_gets_A_minus_B_times_C_4(A,B,C) 0;\ 293209027a4SShri Abhyankar {\ 294209027a4SShri Abhyankar A[0] -= B[0]*C[0] + B[4]*C[1] + B[8]*C[2] + B[12]*C[3];\ 295209027a4SShri Abhyankar A[1] -= B[1]*C[0] + B[5]*C[1] + B[9]*C[2] + B[13]*C[3];\ 296209027a4SShri Abhyankar A[2] -= B[2]*C[0] + B[6]*C[1] + B[10]*C[2] + B[14]*C[3];\ 297209027a4SShri Abhyankar A[3] -= B[3]*C[0] + B[7]*C[1] + B[11]*C[2] + B[15]*C[3];\ 298209027a4SShri Abhyankar A[4] -= B[0]*C[4] + B[4]*C[5] + B[8]*C[6] + B[12]*C[7];\ 299209027a4SShri Abhyankar A[5] -= B[1]*C[4] + B[5]*C[5] + B[9]*C[6] + B[13]*C[7];\ 300209027a4SShri Abhyankar A[6] -= B[2]*C[4] + B[6]*C[5] + B[10]*C[6] + B[14]*C[7];\ 301209027a4SShri Abhyankar A[7] -= B[3]*C[4] + B[7]*C[5] + B[11]*C[6] + B[15]*C[7];\ 302209027a4SShri Abhyankar A[8] -= B[0]*C[8] + B[4]*C[9] + B[8]*C[10] + B[12]*C[11];\ 303209027a4SShri Abhyankar A[9] -= B[1]*C[8] + B[5]*C[9] + B[9]*C[10] + B[13]*C[11];\ 304209027a4SShri Abhyankar A[10] -= B[2]*C[8] + B[6]*C[9] + B[10]*C[10] + B[14]*C[11];\ 305209027a4SShri Abhyankar A[11] -= B[3]*C[8] + B[7]*C[9] + B[11]*C[10] + B[15]*C[11];\ 306209027a4SShri Abhyankar A[12] -= B[0]*C[12] + B[4]*C[13] + B[8]*C[14] + B[12]*C[15];\ 307209027a4SShri Abhyankar A[13] -= B[1]*C[12] + B[5]*C[13] + B[9]*C[14] + B[13]*C[15];\ 308209027a4SShri Abhyankar A[14] -= B[2]*C[12] + B[6]*C[13] + B[10]*C[14] + B[14]*C[15];\ 309209027a4SShri Abhyankar A[15] -= B[3]*C[12] + B[7]*C[13] + B[11]*C[14] + B[15]*C[15];\ 310209027a4SShri Abhyankar } 311209027a4SShri Abhyankar 312*0deeaf61SShri Abhyankar 313*0deeaf61SShri Abhyankar #define Kernel_A_gets_A_times_B_5(A,B,W) 0;\ 314*0deeaf61SShri Abhyankar {\ 315*0deeaf61SShri Abhyankar PetscMemcpy(W,A,25*sizeof(MatScalar));\ 316*0deeaf61SShri Abhyankar A[0] = W[0]*B[0] + W[5]*B[1] + W[10]*B[2] + W[15]*B[3] + W[20]*B[4];\ 317*0deeaf61SShri Abhyankar A[1] = W[1]*B[0] + W[6]*B[1] + W[11]*B[2] + W[16]*B[3] + W[21]*B[4];\ 318*0deeaf61SShri Abhyankar A[2] = W[2]*B[0] + W[7]*B[1] + W[12]*B[2] + W[17]*B[3] + W[22]*B[4];\ 319*0deeaf61SShri Abhyankar A[3] = W[3]*B[0] + W[8]*B[1] + W[13]*B[2] + W[18]*B[3] + W[23]*B[4];\ 320*0deeaf61SShri Abhyankar A[4] = W[4]*B[0] + W[9]*B[1] + W[14]*B[2] + W[19]*B[3] + W[24]*B[4];\ 321*0deeaf61SShri Abhyankar A[5] = W[0]*B[5] + W[5]*B[6] + W[10]*B[7] + W[15]*B[8] + W[20]*B[9];\ 322*0deeaf61SShri Abhyankar A[6] = W[1]*B[5] + W[6]*B[6] + W[11]*B[7] + W[16]*B[8] + W[21]*B[9];\ 323*0deeaf61SShri Abhyankar A[7] = W[2]*B[5] + W[7]*B[6] + W[12]*B[7] + W[17]*B[8] + W[22]*B[9];\ 324*0deeaf61SShri Abhyankar A[8] = W[3]*B[5] + W[8]*B[6] + W[13]*B[7] + W[18]*B[8] + W[23]*B[9];\ 325*0deeaf61SShri Abhyankar A[9] = W[4]*B[5] + W[9]*B[6] + W[14]*B[7] + W[19]*B[8] + W[24]*B[9];\ 326*0deeaf61SShri Abhyankar A[10] = W[0]*B[10] + W[5]*B[11] + W[10]*B[12] + W[15]*B[13] + W[20]*B[14];\ 327*0deeaf61SShri Abhyankar A[11] = W[1]*B[10] + W[6]*B[11] + W[11]*B[12] + W[16]*B[13] + W[21]*B[14];\ 328*0deeaf61SShri Abhyankar A[12] = W[2]*B[10] + W[7]*B[11] + W[12]*B[12] + W[17]*B[13] + W[22]*B[14];\ 329*0deeaf61SShri Abhyankar A[13] = W[3]*B[10] + W[8]*B[11] + W[13]*B[12] + W[18]*B[13] + W[23]*B[14];\ 330*0deeaf61SShri Abhyankar A[14] = W[4]*B[10] + W[9]*B[11] + W[14]*B[12] + W[19]*B[13] + W[24]*B[14];\ 331*0deeaf61SShri Abhyankar A[15] = W[0]*B[15] + W[5]*B[16] + W[10]*B[17] + W[15]*B[18] + W[20]*B[19];\ 332*0deeaf61SShri Abhyankar A[16] = W[1]*B[15] + W[6]*B[16] + W[11]*B[17] + W[16]*B[18] + W[21]*B[19];\ 333*0deeaf61SShri Abhyankar A[17] = W[2]*B[15] + W[7]*B[16] + W[12]*B[17] + W[17]*B[18] + W[22]*B[19];\ 334*0deeaf61SShri Abhyankar A[18] = W[3]*B[15] + W[8]*B[16] + W[13]*B[17] + W[18]*B[18] + W[23]*B[19];\ 335*0deeaf61SShri Abhyankar A[19] = W[4]*B[15] + W[9]*B[16] + W[14]*B[17] + W[19]*B[18] + W[24]*B[19];\ 336*0deeaf61SShri Abhyankar A[20] = W[0]*B[20] + W[5]*B[21] + W[10]*B[22] + W[15]*B[23] + W[20]*B[24];\ 337*0deeaf61SShri Abhyankar A[21] = W[1]*B[20] + W[6]*B[21] + W[11]*B[22] + W[16]*B[23] + W[21]*B[24];\ 338*0deeaf61SShri Abhyankar A[22] = W[2]*B[20] + W[7]*B[21] + W[12]*B[22] + W[17]*B[23] + W[22]*B[24];\ 339*0deeaf61SShri Abhyankar A[23] = W[3]*B[20] + W[8]*B[21] + W[13]*B[22] + W[18]*B[23] + W[23]*B[24];\ 340*0deeaf61SShri Abhyankar A[24] = W[4]*B[20] + W[9]*B[21] + W[14]*B[22] + W[19]*B[23] + W[24]*B[24];\ 341*0deeaf61SShri Abhyankar } 342*0deeaf61SShri Abhyankar 343*0deeaf61SShri Abhyankar /* 344*0deeaf61SShri Abhyankar Kernel_A_gets_A_minus_B_times_C_5: A = A - B * C with size bs=5 345*0deeaf61SShri Abhyankar 346*0deeaf61SShri Abhyankar Input Parameters: 347*0deeaf61SShri Abhyankar + A,B,C - square bs by bs arrays stored in column major order 348*0deeaf61SShri Abhyankar 349*0deeaf61SShri Abhyankar Output Parameter: 350*0deeaf61SShri Abhyankar . A = A - B*C 351*0deeaf61SShri Abhyankar */ 352*0deeaf61SShri Abhyankar 353*0deeaf61SShri Abhyankar #define Kernel_A_gets_A_minus_B_times_C_5(A,B,C) 0;\ 354*0deeaf61SShri Abhyankar {\ 355*0deeaf61SShri Abhyankar A[0] -= B[0]*C[0] + B[5]*C[1] + B[10]*C[2] + B[15]*C[3] + B[20]*C[4];\ 356*0deeaf61SShri Abhyankar A[1] -= B[1]*C[0] + B[6]*C[1] + B[11]*C[2] + B[16]*C[3] + B[21]*C[4];\ 357*0deeaf61SShri Abhyankar A[2] -= B[2]*C[0] + B[7]*C[1] + B[12]*C[2] + B[17]*C[3] + B[22]*C[4];\ 358*0deeaf61SShri Abhyankar A[3] -= B[3]*C[0] + B[8]*C[1] + B[13]*C[2] + B[18]*C[3] + B[23]*C[4];\ 359*0deeaf61SShri Abhyankar A[4] -= B[4]*C[0] + B[9]*C[1] + B[14]*C[2] + B[19]*C[3] + B[24]*C[4];\ 360*0deeaf61SShri Abhyankar A[5] -= B[0]*C[5] + B[5]*C[6] + B[10]*C[7] + B[15]*C[8] + B[20]*C[9];\ 361*0deeaf61SShri Abhyankar A[6] -= B[1]*C[5] + B[6]*C[6] + B[11]*C[7] + B[16]*C[8] + B[21]*C[9];\ 362*0deeaf61SShri Abhyankar A[7] -= B[2]*C[5] + B[7]*C[6] + B[12]*C[7] + B[17]*C[8] + B[22]*C[9];\ 363*0deeaf61SShri Abhyankar A[8] -= B[3]*C[5] + B[8]*C[6] + B[13]*C[7] + B[18]*C[8] + B[23]*C[9];\ 364*0deeaf61SShri Abhyankar A[9] -= B[4]*C[5] + B[9]*C[6] + B[14]*C[7] + B[19]*C[8] + B[24]*C[9];\ 365*0deeaf61SShri Abhyankar A[10] -= B[0]*C[10] + B[5]*C[11] + B[10]*C[12] + B[15]*C[13] + B[20]*C[14];\ 366*0deeaf61SShri Abhyankar A[11] -= B[1]*C[10] + B[6]*C[11] + B[11]*C[12] + B[16]*C[13] + B[21]*C[14];\ 367*0deeaf61SShri Abhyankar A[12] -= B[2]*C[10] + B[7]*C[11] + B[12]*C[12] + B[17]*C[13] + B[22]*C[14];\ 368*0deeaf61SShri Abhyankar A[13] -= B[3]*C[10] + B[8]*C[11] + B[13]*C[12] + B[18]*C[13] + B[23]*C[14];\ 369*0deeaf61SShri Abhyankar A[14] -= B[4]*C[10] + B[9]*C[11] + B[14]*C[12] + B[19]*C[13] + B[24]*C[14];\ 370*0deeaf61SShri Abhyankar A[15] -= B[0]*C[15] + B[5]*C[16] + B[10]*C[17] + B[15]*C[18] + B[20]*C[19];\ 371*0deeaf61SShri Abhyankar A[16] -= B[1]*C[15] + B[6]*C[16] + B[11]*C[17] + B[16]*C[18] + B[21]*C[19];\ 372*0deeaf61SShri Abhyankar A[17] -= B[2]*C[15] + B[7]*C[16] + B[12]*C[17] + B[17]*C[18] + B[22]*C[19];\ 373*0deeaf61SShri Abhyankar A[18] -= B[3]*C[15] + B[8]*C[16] + B[13]*C[17] + B[18]*C[18] + B[23]*C[19];\ 374*0deeaf61SShri Abhyankar A[19] -= B[4]*C[15] + B[9]*C[16] + B[14]*C[17] + B[19]*C[18] + B[24]*C[19];\ 375*0deeaf61SShri Abhyankar A[20] -= B[0]*C[20] + B[5]*C[21] + B[10]*C[22] + B[15]*C[23] + B[20]*C[24];\ 376*0deeaf61SShri Abhyankar A[21] -= B[1]*C[20] + B[6]*C[21] + B[11]*C[22] + B[16]*C[23] + B[21]*C[24];\ 377*0deeaf61SShri Abhyankar A[22] -= B[2]*C[20] + B[7]*C[21] + B[12]*C[22] + B[17]*C[23] + B[22]*C[24];\ 378*0deeaf61SShri Abhyankar A[23] -= B[3]*C[20] + B[8]*C[21] + B[13]*C[22] + B[18]*C[23] + B[23]*C[24];\ 379*0deeaf61SShri Abhyankar A[24] -= B[4]*C[20] + B[9]*C[21] + B[14]*C[22] + B[19]*C[23] + B[24]*C[24];\ 380*0deeaf61SShri Abhyankar } 381*0deeaf61SShri Abhyankar 3822593348eSBarry Smith #endif 383