1c96caaccSSatish Balay! 2c96caaccSSatish Balay! 3c96caaccSSatish Balay! Fortran kernel for sparse triangular solve in the BAIJ matrix format 4c96caaccSSatish Balay! This ONLY works for factorizations in the NATURAL ORDERING, i.e. 5c96caaccSSatish Balay! with MatSolve_SeqBAIJ_4_NaturalOrdering() 6c96caaccSSatish Balay! 7c96caaccSSatish Balay#include <petsc/finclude/petscsys.h> 8c96caaccSSatish Balay! 9c96caaccSSatish Balay 100ccf82acSMartin Diehlpure subroutine FortranSolveBAIJ4Unroll(n,x,ai,aj,adiag,a,b) 110ccf82acSMartin Diehl implicit none (type, external) 120ccf82acSMartin Diehl MatScalar, intent(in) :: a(0:*) 130ccf82acSMartin Diehl PetscScalar, intent(inout) :: x(0:*) 140ccf82acSMartin Diehl PetscScalar, intent(in) :: b(0:*) 150ccf82acSMartin Diehl PetscInt, intent(in) :: n 160ccf82acSMartin Diehl PetscInt, intent(in) :: ai(0:*), aj(0:*), adiag(0:*) 17c96caaccSSatish Balay 180ccf82acSMartin Diehl PetscInt :: i,j,jstart,jend 190ccf82acSMartin Diehl PetscInt :: idx,ax,jdx 200ccf82acSMartin Diehl PetscScalar :: s1,s2,s3,s4 210ccf82acSMartin Diehl PetscScalar :: x1,x2,x3,x4 220ccf82acSMartin Diehl 23c96caaccSSatish Balay PETSC_AssertAlignx(16,a(1)) 24c96caaccSSatish Balay PETSC_AssertAlignx(16,x(1)) 25c96caaccSSatish Balay PETSC_AssertAlignx(16,b(1)) 26c96caaccSSatish Balay PETSC_AssertAlignx(16,ai(1)) 27c96caaccSSatish Balay PETSC_AssertAlignx(16,aj(1)) 28c96caaccSSatish Balay PETSC_AssertAlignx(16,adiag(1)) 29c96caaccSSatish Balay 300ccf82acSMartin Diehl ! 310ccf82acSMartin Diehl ! Forward Solve 320ccf82acSMartin Diehl ! 33*d66e387eSMartin Diehl x(0:3) = b(0:3) 34c96caaccSSatish Balay idx = 0 350113e719SMartin Diehl do i=1,n-1 36c96caaccSSatish Balay jstart = ai(i) 37c96caaccSSatish Balay jend = adiag(i) - 1 38c96caaccSSatish Balay ax = 16*jstart 39c96caaccSSatish Balay idx = idx + 4 40*d66e387eSMartin Diehl s1 = b(idx+0) 41c96caaccSSatish Balay s2 = b(idx+1) 42c96caaccSSatish Balay s3 = b(idx+2) 43c96caaccSSatish Balay s4 = b(idx+3) 440113e719SMartin Diehl do j=jstart,jend 45c96caaccSSatish Balay jdx = 4*aj(j) 46c96caaccSSatish Balay 47*d66e387eSMartin Diehl x1 = x(jdx+0) 48c96caaccSSatish Balay x2 = x(jdx+1) 49c96caaccSSatish Balay x3 = x(jdx+2) 50c96caaccSSatish Balay x4 = x(jdx+3) 51*d66e387eSMartin Diehl s1 = s1-(a(ax+0)*x1+a(ax+4)*x2+a(ax+ 8)*x3+a(ax+12)*x4) 52c96caaccSSatish Balay s2 = s2-(a(ax+1)*x1+a(ax+5)*x2+a(ax+9)*x3 +a(ax+13)*x4) 53c96caaccSSatish Balay s3 = s3-(a(ax+2)*x1+a(ax+6)*x2+a(ax+10)*x3+a(ax+14)*x4) 54c96caaccSSatish Balay s4 = s4-(a(ax+3)*x1+a(ax+7)*x2+a(ax+11)*x3+a(ax+15)*x4) 55c96caaccSSatish Balay ax = ax + 16 560113e719SMartin Diehl end do 57*d66e387eSMartin Diehl x(idx+0) = s1 58c96caaccSSatish Balay x(idx+1) = s2 59c96caaccSSatish Balay x(idx+2) = s3 60c96caaccSSatish Balay x(idx+3) = s4 610113e719SMartin Diehl end do 62c96caaccSSatish Balay 63c96caaccSSatish Balay ! 64c96caaccSSatish Balay ! Backward solve the upper triangular 65c96caaccSSatish Balay ! 660113e719SMartin Diehl do i=n-1,0,-1 67c96caaccSSatish Balay jstart = adiag(i) + 1 68c96caaccSSatish Balay jend = ai(i+1) - 1 69c96caaccSSatish Balay ax = 16*jstart 70*d66e387eSMartin Diehl s1 = x(idx+0) 71c96caaccSSatish Balay s2 = x(idx+1) 72c96caaccSSatish Balay s3 = x(idx+2) 73c96caaccSSatish Balay s4 = x(idx+3) 740113e719SMartin Diehl do j=jstart,jend 75c96caaccSSatish Balay jdx = 4*aj(j) 76*d66e387eSMartin Diehl x1 = x(jdx+0) 77c96caaccSSatish Balay x2 = x(jdx+1) 78c96caaccSSatish Balay x3 = x(jdx+2) 79c96caaccSSatish Balay x4 = x(jdx+3) 80*d66e387eSMartin Diehl s1 = s1-(a(ax+0)*x1+a(ax+4)*x2+a(ax+ 8)*x3+a(ax+12)*x4) 81c96caaccSSatish Balay s2 = s2-(a(ax+1)*x1+a(ax+5)*x2+a(ax+9)*x3 +a(ax+13)*x4) 82c96caaccSSatish Balay s3 = s3-(a(ax+2)*x1+a(ax+6)*x2+a(ax+10)*x3+a(ax+14)*x4) 83c96caaccSSatish Balay s4 = s4-(a(ax+3)*x1+a(ax+7)*x2+a(ax+11)*x3+a(ax+15)*x4) 84c96caaccSSatish Balay ax = ax + 16 850113e719SMartin Diehl end do 86c96caaccSSatish Balay ax = 16*adiag(i) 87*d66e387eSMartin Diehl x(idx+0) = a(ax+0)*s1+a(ax+4)*s2+a(ax+ 8)*s3+a(ax+12)*s4 88c96caaccSSatish Balay x(idx+1) = a(ax+1)*s1+a(ax+5)*s2+a(ax+9)*s3 +a(ax+13)*s4 89c96caaccSSatish Balay x(idx+2) = a(ax+2)*s1+a(ax+6)*s2+a(ax+10)*s3+a(ax+14)*s4 90c96caaccSSatish Balay x(idx+3) = a(ax+3)*s1+a(ax+7)*s2+a(ax+11)*s3+a(ax+15)*s4 91c96caaccSSatish Balay idx = idx - 4 920113e719SMartin Diehl end do 930113e719SMartin Diehlend subroutine FortranSolveBAIJ4Unroll 94c96caaccSSatish Balay 95c96caaccSSatish Balay! version that does not call BLAS 2 operation for each row block 96c96caaccSSatish Balay! 97c96caaccSSatish Balaysubroutine FortranSolveBAIJ4(n,x,ai,aj,adiag,a,b,w) 98c96caaccSSatish Balay implicit none 990ccf82acSMartin Diehl MatScalar, intent(in) :: a(0:*) 1000ccf82acSMartin Diehl PetscScalar, intent(inout) :: x(0:*),w(0:*) 1010ccf82acSMartin Diehl PetscScalar, intent(in) :: b(0:*) 1020ccf82acSMartin Diehl PetscInt, intent(in) :: n 1030ccf82acSMartin Diehl PetscInt, intent(in) :: ai(0:*), aj(0:*), adiag(0:*) 104c96caaccSSatish Balay 1050ccf82acSMartin Diehl PetscInt :: ii,jj,i,j 1060ccf82acSMartin Diehl PetscInt :: jstart,jend,idx,ax,jdx,kdx,nn 1070ccf82acSMartin Diehl PetscScalar :: s(0:3) 108c96caaccSSatish Balay 109c96caaccSSatish Balay PETSC_AssertAlignx(16,a(1)) 110c96caaccSSatish Balay PETSC_AssertAlignx(16,w(1)) 111c96caaccSSatish Balay PETSC_AssertAlignx(16,x(1)) 112c96caaccSSatish Balay PETSC_AssertAlignx(16,b(1)) 113c96caaccSSatish Balay PETSC_AssertAlignx(16,ai(1)) 114c96caaccSSatish Balay PETSC_AssertAlignx(16,aj(1)) 115c96caaccSSatish Balay PETSC_AssertAlignx(16,adiag(1)) 1160113e719SMartin Diehl ! 1170113e719SMartin Diehl ! Forward Solve 1180113e719SMartin Diehl ! 119*d66e387eSMartin Diehl x(0:3) = b(0:3) 120c96caaccSSatish Balay idx = 0 1210113e719SMartin Diehl do i=1,n-1 122c96caaccSSatish Balay ! 123c96caaccSSatish Balay ! Pack required part of vector into work array 124c96caaccSSatish Balay ! 125c96caaccSSatish Balay kdx = 0 126c96caaccSSatish Balay jstart = ai(i) 127c96caaccSSatish Balay jend = adiag(i) - 1 128*d66e387eSMartin Diehl 129*d66e387eSMartin Diehl if (jend - jstart >= 500) write(6,*) 'Overflowing vector FortranSolveBAIJ4()' 130*d66e387eSMartin Diehl 1310113e719SMartin Diehl do j=jstart,jend 132c96caaccSSatish Balay 133c96caaccSSatish Balay jdx = 4*aj(j) 134*d66e387eSMartin Diehl w(kdx:kdx+3) = x(jdx:jdx+3) 135c96caaccSSatish Balay kdx = kdx + 4 1360113e719SMartin Diehl end do 137c96caaccSSatish Balay 138c96caaccSSatish Balay ax = 16*jstart 139c96caaccSSatish Balay idx = idx + 4 140*d66e387eSMartin Diehl s(0:3) = b(idx:idx+3) 141c96caaccSSatish Balay ! 142c96caaccSSatish Balay ! s = s - a(ax:)*w 143c96caaccSSatish Balay ! 144c96caaccSSatish Balay nn = 4*(jend - jstart + 1) - 1 1450113e719SMartin Diehl do ii=0,3 1460113e719SMartin Diehl do jj=0,nn 147c96caaccSSatish Balay s(ii) = s(ii) - a(ax+4*jj+ii)*w(jj) 1480113e719SMartin Diehl end do 1490113e719SMartin Diehl end do 150c96caaccSSatish Balay 151*d66e387eSMartin Diehl x(idx:idx+3) = s(0:3) 1520113e719SMartin Diehl end do 153c96caaccSSatish Balay ! 154c96caaccSSatish Balay ! Backward solve the upper triangular 155c96caaccSSatish Balay ! 1560113e719SMartin Diehl do i=n-1,0,-1 157c96caaccSSatish Balay jstart = adiag(i) + 1 158c96caaccSSatish Balay jend = ai(i+1) - 1 159c96caaccSSatish Balay ax = 16*jstart 160*d66e387eSMartin Diehl s(0:3) = x(idx:idx+3) 161c96caaccSSatish Balay ! 162c96caaccSSatish Balay ! Pack each chunk of vector needed 163c96caaccSSatish Balay ! 164c96caaccSSatish Balay kdx = 0 165*d66e387eSMartin Diehl if (jend - jstart >= 500) write(6,*) 'Overflowing vector FortranSolveBAIJ4()' 166*d66e387eSMartin Diehl 1670113e719SMartin Diehl do j=jstart,jend 168c96caaccSSatish Balay jdx = 4*aj(j) 169*d66e387eSMartin Diehl w(kdx:kdx+3) = x(jdx:jdx+3) 170c96caaccSSatish Balay kdx = kdx + 4 1710113e719SMartin Diehl end do 172c96caaccSSatish Balay nn = 4*(jend - jstart + 1) - 1 1730113e719SMartin Diehl do ii=0,3 1740113e719SMartin Diehl do jj=0,nn 175c96caaccSSatish Balay s(ii) = s(ii) - a(ax+4*jj+ii)*w(jj) 1760113e719SMartin Diehl end do 1770113e719SMartin Diehl end do 178c96caaccSSatish Balay 179c96caaccSSatish Balay ax = 16*adiag(i) 180*d66e387eSMartin Diehl x(idx) = a(ax+0)*s(0)+a(ax+4)*s(1)+a(ax+ 8)*s(2)+a(ax+12)*s(3) 181c96caaccSSatish Balay x(idx+1)= a(ax+1)*s(0)+a(ax+5)*s(1)+a(ax+9)*s(2) +a(ax+13)*s(3) 182c96caaccSSatish Balay x(idx+2)= a(ax+2)*s(0)+a(ax+6)*s(1)+a(ax+10)*s(2)+a(ax+14)*s(3) 183c96caaccSSatish Balay x(idx+3)= a(ax+3)*s(0)+a(ax+7)*s(1)+a(ax+11)*s(2)+a(ax+15)*s(3) 184c96caaccSSatish Balay idx = idx - 4 1850113e719SMartin Diehl end do 1860113e719SMartin Diehlend subroutine FortranSolveBAIJ4 187