1c96caaccSSatish Balay! 2c96caaccSSatish Balay! 3c96caaccSSatish Balay! Fortran kernel for sparse triangular solve in the BAIJ matrix format 4c96caaccSSatish Balay! This ONLY works for factorizations in the NATURAL ORDERING, i.e. 5c96caaccSSatish Balay! with MatSolve_SeqBAIJ_4_NaturalOrdering() 6c96caaccSSatish Balay! 7c96caaccSSatish Balay#include <petsc/finclude/petscsys.h> 8c96caaccSSatish Balay! 9c96caaccSSatish Balay 100ccf82acSMartin Diehlpure subroutine FortranSolveBAIJ4Unroll(n,x,ai,aj,adiag,a,b) 110ccf82acSMartin Diehl implicit none (type, external) 120ccf82acSMartin Diehl MatScalar, intent(in) :: a(0:*) 130ccf82acSMartin Diehl PetscScalar, intent(inout) :: x(0:*) 140ccf82acSMartin Diehl PetscScalar, intent(in) :: b(0:*) 150ccf82acSMartin Diehl PetscInt, intent(in) :: n 160ccf82acSMartin Diehl PetscInt, intent(in) :: ai(0:*), aj(0:*), adiag(0:*) 17c96caaccSSatish Balay 180ccf82acSMartin Diehl PetscInt :: i,j,jstart,jend 190ccf82acSMartin Diehl PetscInt :: idx,ax,jdx 20*ff45ff59SMartin Diehl PetscScalar :: s(0:3) 210ccf82acSMartin Diehl 22c96caaccSSatish Balay PETSC_AssertAlignx(16,a(1)) 23c96caaccSSatish Balay PETSC_AssertAlignx(16,x(1)) 24c96caaccSSatish Balay PETSC_AssertAlignx(16,b(1)) 25c96caaccSSatish Balay PETSC_AssertAlignx(16,ai(1)) 26c96caaccSSatish Balay PETSC_AssertAlignx(16,aj(1)) 27c96caaccSSatish Balay PETSC_AssertAlignx(16,adiag(1)) 28c96caaccSSatish Balay 290ccf82acSMartin Diehl ! 300ccf82acSMartin Diehl ! Forward Solve 310ccf82acSMartin Diehl ! 32d66e387eSMartin Diehl x(0:3) = b(0:3) 33c96caaccSSatish Balay idx = 0 340113e719SMartin Diehl do i=1,n-1 35c96caaccSSatish Balay jstart = ai(i) 36c96caaccSSatish Balay jend = adiag(i) - 1 37c96caaccSSatish Balay ax = 16*jstart 38c96caaccSSatish Balay idx = idx + 4 39*ff45ff59SMartin Diehl s(0:3) = b(idx+0:idx+3) 400113e719SMartin Diehl do j=jstart,jend 41c96caaccSSatish Balay jdx = 4*aj(j) 42c96caaccSSatish Balay 43*ff45ff59SMartin Diehl s(0) = s(0)-(a(ax+0)*x(jdx+0)+a(ax+4)*x(jdx+1)+a(ax+ 8)*x(jdx+2)+a(ax+12)*x(jdx+3)) 44*ff45ff59SMartin Diehl s(1) = s(1)-(a(ax+1)*x(jdx+0)+a(ax+5)*x(jdx+1)+a(ax+ 9)*x(jdx+2)+a(ax+13)*x(jdx+3)) 45*ff45ff59SMartin Diehl s(2) = s(2)-(a(ax+2)*x(jdx+0)+a(ax+6)*x(jdx+1)+a(ax+10)*x(jdx+2)+a(ax+14)*x(jdx+3)) 46*ff45ff59SMartin Diehl s(3) = s(3)-(a(ax+3)*x(jdx+0)+a(ax+7)*x(jdx+1)+a(ax+11)*x(jdx+2)+a(ax+15)*x(jdx+3)) 47c96caaccSSatish Balay ax = ax + 16 480113e719SMartin Diehl end do 49*ff45ff59SMartin Diehl x(idx+0:idx+3) = s(0:3) 500113e719SMartin Diehl end do 51c96caaccSSatish Balay 52c96caaccSSatish Balay ! 53c96caaccSSatish Balay ! Backward solve the upper triangular 54c96caaccSSatish Balay ! 550113e719SMartin Diehl do i=n-1,0,-1 56c96caaccSSatish Balay jstart = adiag(i) + 1 57c96caaccSSatish Balay jend = ai(i+1) - 1 58c96caaccSSatish Balay ax = 16*jstart 59*ff45ff59SMartin Diehl s(0:3) = x(idx+0:idx+3) 600113e719SMartin Diehl do j=jstart,jend 61c96caaccSSatish Balay jdx = 4*aj(j) 62*ff45ff59SMartin Diehl s(0) = s(0)-(a(ax+0)*x(jdx+0)+a(ax+4)*x(jdx+1)+a(ax+ 8)*x(jdx+2)+a(ax+12)*x(jdx+3)) 63*ff45ff59SMartin Diehl s(1) = s(1)-(a(ax+1)*x(jdx+0)+a(ax+5)*x(jdx+1)+a(ax+ 9)*x(jdx+2)+a(ax+13)*x(jdx+3)) 64*ff45ff59SMartin Diehl s(2) = s(2)-(a(ax+2)*x(jdx+0)+a(ax+6)*x(jdx+1)+a(ax+10)*x(jdx+2)+a(ax+14)*x(jdx+3)) 65*ff45ff59SMartin Diehl s(3) = s(3)-(a(ax+3)*x(jdx+0)+a(ax+7)*x(jdx+1)+a(ax+11)*x(jdx+2)+a(ax+15)*x(jdx+3)) 66c96caaccSSatish Balay ax = ax + 16 670113e719SMartin Diehl end do 68c96caaccSSatish Balay ax = 16*adiag(i) 69*ff45ff59SMartin Diehl x(idx+0) = a(ax+0)*s(0)+a(ax+4)*s(1)+a(ax+ 8)*s(2)+a(ax+12)*s(3) 70*ff45ff59SMartin Diehl x(idx+1) = a(ax+1)*s(0)+a(ax+5)*s(1)+a(ax+ 9)*s(2)+a(ax+13)*s(3) 71*ff45ff59SMartin Diehl x(idx+2) = a(ax+2)*s(0)+a(ax+6)*s(1)+a(ax+10)*s(2)+a(ax+14)*s(3) 72*ff45ff59SMartin Diehl x(idx+3) = a(ax+3)*s(0)+a(ax+7)*s(1)+a(ax+11)*s(2)+a(ax+15)*s(3) 73c96caaccSSatish Balay idx = idx - 4 740113e719SMartin Diehl end do 750113e719SMartin Diehlend subroutine FortranSolveBAIJ4Unroll 76c96caaccSSatish Balay 77c96caaccSSatish Balay! version that does not call BLAS 2 operation for each row block 78c96caaccSSatish Balay! 79*ff45ff59SMartin Diehlpure subroutine FortranSolveBAIJ4(n,x,ai,aj,adiag,a,b,w) 80c96caaccSSatish Balay implicit none 810ccf82acSMartin Diehl MatScalar, intent(in) :: a(0:*) 820ccf82acSMartin Diehl PetscScalar, intent(inout) :: x(0:*),w(0:*) 830ccf82acSMartin Diehl PetscScalar, intent(in) :: b(0:*) 840ccf82acSMartin Diehl PetscInt, intent(in) :: n 850ccf82acSMartin Diehl PetscInt, intent(in) :: ai(0:*), aj(0:*), adiag(0:*) 86c96caaccSSatish Balay 870ccf82acSMartin Diehl PetscInt :: ii,jj,i,j 880ccf82acSMartin Diehl PetscInt :: jstart,jend,idx,ax,jdx,kdx,nn 890ccf82acSMartin Diehl PetscScalar :: s(0:3) 90c96caaccSSatish Balay 91c96caaccSSatish Balay PETSC_AssertAlignx(16,a(1)) 92c96caaccSSatish Balay PETSC_AssertAlignx(16,w(1)) 93c96caaccSSatish Balay PETSC_AssertAlignx(16,x(1)) 94c96caaccSSatish Balay PETSC_AssertAlignx(16,b(1)) 95c96caaccSSatish Balay PETSC_AssertAlignx(16,ai(1)) 96c96caaccSSatish Balay PETSC_AssertAlignx(16,aj(1)) 97c96caaccSSatish Balay PETSC_AssertAlignx(16,adiag(1)) 980113e719SMartin Diehl ! 990113e719SMartin Diehl ! Forward Solve 1000113e719SMartin Diehl ! 101d66e387eSMartin Diehl x(0:3) = b(0:3) 102c96caaccSSatish Balay idx = 0 1030113e719SMartin Diehl do i=1,n-1 104c96caaccSSatish Balay ! 105c96caaccSSatish Balay ! Pack required part of vector into work array 106c96caaccSSatish Balay ! 107c96caaccSSatish Balay kdx = 0 108c96caaccSSatish Balay jstart = ai(i) 109c96caaccSSatish Balay jend = adiag(i) - 1 110d66e387eSMartin Diehl 111*ff45ff59SMartin Diehl if (jend - jstart >= 500) error stop 'Overflowing vector FortranSolveBAIJ4()' 112d66e387eSMartin Diehl 1130113e719SMartin Diehl do j=jstart,jend 114c96caaccSSatish Balay jdx = 4*aj(j) 115d66e387eSMartin Diehl w(kdx:kdx+3) = x(jdx:jdx+3) 116c96caaccSSatish Balay kdx = kdx + 4 1170113e719SMartin Diehl end do 118c96caaccSSatish Balay 119c96caaccSSatish Balay ax = 16*jstart 120c96caaccSSatish Balay idx = idx + 4 121d66e387eSMartin Diehl s(0:3) = b(idx:idx+3) 122c96caaccSSatish Balay ! 123c96caaccSSatish Balay ! s = s - a(ax:)*w 124c96caaccSSatish Balay ! 125c96caaccSSatish Balay nn = 4*(jend - jstart + 1) - 1 1260113e719SMartin Diehl do ii=0,3 1270113e719SMartin Diehl do jj=0,nn 128c96caaccSSatish Balay s(ii) = s(ii) - a(ax+4*jj+ii)*w(jj) 1290113e719SMartin Diehl end do 1300113e719SMartin Diehl end do 131c96caaccSSatish Balay 132d66e387eSMartin Diehl x(idx:idx+3) = s(0:3) 1330113e719SMartin Diehl end do 134c96caaccSSatish Balay ! 135c96caaccSSatish Balay ! Backward solve the upper triangular 136c96caaccSSatish Balay ! 1370113e719SMartin Diehl do i=n-1,0,-1 138c96caaccSSatish Balay jstart = adiag(i) + 1 139c96caaccSSatish Balay jend = ai(i+1) - 1 140c96caaccSSatish Balay ax = 16*jstart 141d66e387eSMartin Diehl s(0:3) = x(idx:idx+3) 142c96caaccSSatish Balay ! 143c96caaccSSatish Balay ! Pack each chunk of vector needed 144c96caaccSSatish Balay ! 145c96caaccSSatish Balay kdx = 0 146*ff45ff59SMartin Diehl if (jend - jstart >= 500) error stop 'Overflowing vector FortranSolveBAIJ4()' 147d66e387eSMartin Diehl 1480113e719SMartin Diehl do j=jstart,jend 149c96caaccSSatish Balay jdx = 4*aj(j) 150d66e387eSMartin Diehl w(kdx:kdx+3) = x(jdx:jdx+3) 151c96caaccSSatish Balay kdx = kdx + 4 1520113e719SMartin Diehl end do 153c96caaccSSatish Balay nn = 4*(jend - jstart + 1) - 1 1540113e719SMartin Diehl do ii=0,3 1550113e719SMartin Diehl do jj=0,nn 156c96caaccSSatish Balay s(ii) = s(ii) - a(ax+4*jj+ii)*w(jj) 1570113e719SMartin Diehl end do 1580113e719SMartin Diehl end do 159c96caaccSSatish Balay 160c96caaccSSatish Balay ax = 16*adiag(i) 161d66e387eSMartin Diehl x(idx) = a(ax+0)*s(0)+a(ax+4)*s(1)+a(ax+ 8)*s(2)+a(ax+12)*s(3) 162c96caaccSSatish Balay x(idx+1)= a(ax+1)*s(0)+a(ax+5)*s(1)+a(ax+ 9)*s(2)+a(ax+13)*s(3) 163c96caaccSSatish Balay x(idx+2)= a(ax+2)*s(0)+a(ax+6)*s(1)+a(ax+10)*s(2)+a(ax+14)*s(3) 164c96caaccSSatish Balay x(idx+3)= a(ax+3)*s(0)+a(ax+7)*s(1)+a(ax+11)*s(2)+a(ax+15)*s(3) 165c96caaccSSatish Balay idx = idx - 4 1660113e719SMartin Diehl end do 1670113e719SMartin Diehlend subroutine FortranSolveBAIJ4 168