1c96caaccSSatish Balay! 2c96caaccSSatish Balay! 3c96caaccSSatish Balay! Fortran kernel for sparse triangular solve in the BAIJ matrix format 4c96caaccSSatish Balay! This ONLY works for factorizations in the NATURAL ORDERING, i.e. 5c96caaccSSatish Balay! with MatSolve_SeqBAIJ_4_NaturalOrdering() 6c96caaccSSatish Balay! 7c96caaccSSatish Balay#include <petsc/finclude/petscsys.h> 8c96caaccSSatish Balay! 9c96caaccSSatish Balay 10*0ccf82acSMartin Diehlpure subroutine FortranSolveBAIJ4Unroll(n,x,ai,aj,adiag,a,b) 11*0ccf82acSMartin Diehl implicit none (type, external) 12*0ccf82acSMartin Diehl MatScalar, intent(in) :: a(0:*) 13*0ccf82acSMartin Diehl PetscScalar, intent(inout) :: x(0:*) 14*0ccf82acSMartin Diehl PetscScalar, intent(in) :: b(0:*) 15*0ccf82acSMartin Diehl PetscInt, intent(in) :: n 16*0ccf82acSMartin Diehl PetscInt, intent(in) :: ai(0:*), aj(0:*), adiag(0:*) 17c96caaccSSatish Balay 18*0ccf82acSMartin Diehl PetscInt :: i,j,jstart,jend 19*0ccf82acSMartin Diehl PetscInt :: idx,ax,jdx 20*0ccf82acSMartin Diehl PetscScalar :: s1,s2,s3,s4 21*0ccf82acSMartin Diehl PetscScalar :: x1,x2,x3,x4 22*0ccf82acSMartin Diehl 23c96caaccSSatish Balay PETSC_AssertAlignx(16,a(1)) 24c96caaccSSatish Balay PETSC_AssertAlignx(16,x(1)) 25c96caaccSSatish Balay PETSC_AssertAlignx(16,b(1)) 26c96caaccSSatish Balay PETSC_AssertAlignx(16,ai(1)) 27c96caaccSSatish Balay PETSC_AssertAlignx(16,aj(1)) 28c96caaccSSatish Balay PETSC_AssertAlignx(16,adiag(1)) 29c96caaccSSatish Balay 30*0ccf82acSMartin Diehl ! 31*0ccf82acSMartin Diehl ! Forward Solve 32*0ccf82acSMartin Diehl ! 33c96caaccSSatish Balay x(0) = b(0) 34c96caaccSSatish Balay x(1) = b(1) 35c96caaccSSatish Balay x(2) = b(2) 36c96caaccSSatish Balay x(3) = b(3) 37c96caaccSSatish Balay idx = 0 380113e719SMartin Diehl do i=1,n-1 39c96caaccSSatish Balay jstart = ai(i) 40c96caaccSSatish Balay jend = adiag(i) - 1 41c96caaccSSatish Balay ax = 16*jstart 42c96caaccSSatish Balay idx = idx + 4 43c96caaccSSatish Balay s1 = b(idx) 44c96caaccSSatish Balay s2 = b(idx+1) 45c96caaccSSatish Balay s3 = b(idx+2) 46c96caaccSSatish Balay s4 = b(idx+3) 470113e719SMartin Diehl do j=jstart,jend 48c96caaccSSatish Balay jdx = 4*aj(j) 49c96caaccSSatish Balay 50c96caaccSSatish Balay x1 = x(jdx) 51c96caaccSSatish Balay x2 = x(jdx+1) 52c96caaccSSatish Balay x3 = x(jdx+2) 53c96caaccSSatish Balay x4 = x(jdx+3) 54c96caaccSSatish Balay s1 = s1-(a(ax)*x1 +a(ax+4)*x2+a(ax+8)*x3 +a(ax+12)*x4) 55c96caaccSSatish Balay s2 = s2-(a(ax+1)*x1+a(ax+5)*x2+a(ax+9)*x3 +a(ax+13)*x4) 56c96caaccSSatish Balay s3 = s3-(a(ax+2)*x1+a(ax+6)*x2+a(ax+10)*x3+a(ax+14)*x4) 57c96caaccSSatish Balay s4 = s4-(a(ax+3)*x1+a(ax+7)*x2+a(ax+11)*x3+a(ax+15)*x4) 58c96caaccSSatish Balay ax = ax + 16 590113e719SMartin Diehl end do 60c96caaccSSatish Balay x(idx) = s1 61c96caaccSSatish Balay x(idx+1) = s2 62c96caaccSSatish Balay x(idx+2) = s3 63c96caaccSSatish Balay x(idx+3) = s4 640113e719SMartin Diehl end do 65c96caaccSSatish Balay 66c96caaccSSatish Balay ! 67c96caaccSSatish Balay ! Backward solve the upper triangular 68c96caaccSSatish Balay ! 690113e719SMartin Diehl do i=n-1,0,-1 70c96caaccSSatish Balay jstart = adiag(i) + 1 71c96caaccSSatish Balay jend = ai(i+1) - 1 72c96caaccSSatish Balay ax = 16*jstart 73c96caaccSSatish Balay s1 = x(idx) 74c96caaccSSatish Balay s2 = x(idx+1) 75c96caaccSSatish Balay s3 = x(idx+2) 76c96caaccSSatish Balay s4 = x(idx+3) 770113e719SMartin Diehl do j=jstart,jend 78c96caaccSSatish Balay jdx = 4*aj(j) 79c96caaccSSatish Balay x1 = x(jdx) 80c96caaccSSatish Balay x2 = x(jdx+1) 81c96caaccSSatish Balay x3 = x(jdx+2) 82c96caaccSSatish Balay x4 = x(jdx+3) 83c96caaccSSatish Balay s1 = s1-(a(ax)*x1 +a(ax+4)*x2+a(ax+8)*x3 +a(ax+12)*x4) 84c96caaccSSatish Balay s2 = s2-(a(ax+1)*x1+a(ax+5)*x2+a(ax+9)*x3 +a(ax+13)*x4) 85c96caaccSSatish Balay s3 = s3-(a(ax+2)*x1+a(ax+6)*x2+a(ax+10)*x3+a(ax+14)*x4) 86c96caaccSSatish Balay s4 = s4-(a(ax+3)*x1+a(ax+7)*x2+a(ax+11)*x3+a(ax+15)*x4) 87c96caaccSSatish Balay ax = ax + 16 880113e719SMartin Diehl end do 89c96caaccSSatish Balay ax = 16*adiag(i) 90c96caaccSSatish Balay x(idx) = a(ax)*s1 +a(ax+4)*s2+a(ax+8)*s3 +a(ax+12)*s4 91c96caaccSSatish Balay x(idx+1) = a(ax+1)*s1+a(ax+5)*s2+a(ax+9)*s3 +a(ax+13)*s4 92c96caaccSSatish Balay x(idx+2) = a(ax+2)*s1+a(ax+6)*s2+a(ax+10)*s3+a(ax+14)*s4 93c96caaccSSatish Balay x(idx+3) = a(ax+3)*s1+a(ax+7)*s2+a(ax+11)*s3+a(ax+15)*s4 94c96caaccSSatish Balay idx = idx - 4 950113e719SMartin Diehl end do 960113e719SMartin Diehlend subroutine FortranSolveBAIJ4Unroll 97c96caaccSSatish Balay 98c96caaccSSatish Balay! version that does not call BLAS 2 operation for each row block 99c96caaccSSatish Balay! 100c96caaccSSatish Balaysubroutine FortranSolveBAIJ4(n,x,ai,aj,adiag,a,b,w) 101c96caaccSSatish Balay implicit none 102*0ccf82acSMartin Diehl MatScalar, intent(in) :: a(0:*) 103*0ccf82acSMartin Diehl PetscScalar, intent(inout) :: x(0:*),w(0:*) 104*0ccf82acSMartin Diehl PetscScalar, intent(in) :: b(0:*) 105*0ccf82acSMartin Diehl PetscInt, intent(in) :: n 106*0ccf82acSMartin Diehl PetscInt, intent(in) :: ai(0:*), aj(0:*), adiag(0:*) 107c96caaccSSatish Balay 108*0ccf82acSMartin Diehl PetscInt :: ii,jj,i,j 109*0ccf82acSMartin Diehl PetscInt :: jstart,jend,idx,ax,jdx,kdx,nn 110*0ccf82acSMartin Diehl PetscScalar :: s(0:3) 111c96caaccSSatish Balay 112c96caaccSSatish Balay PETSC_AssertAlignx(16,a(1)) 113c96caaccSSatish Balay PETSC_AssertAlignx(16,w(1)) 114c96caaccSSatish Balay PETSC_AssertAlignx(16,x(1)) 115c96caaccSSatish Balay PETSC_AssertAlignx(16,b(1)) 116c96caaccSSatish Balay PETSC_AssertAlignx(16,ai(1)) 117c96caaccSSatish Balay PETSC_AssertAlignx(16,aj(1)) 118c96caaccSSatish Balay PETSC_AssertAlignx(16,adiag(1)) 1190113e719SMartin Diehl ! 1200113e719SMartin Diehl ! Forward Solve 1210113e719SMartin Diehl ! 122c96caaccSSatish Balay x(0) = b(0) 123c96caaccSSatish Balay x(1) = b(1) 124c96caaccSSatish Balay x(2) = b(2) 125c96caaccSSatish Balay x(3) = b(3) 126c96caaccSSatish Balay idx = 0 1270113e719SMartin Diehl do i=1,n-1 128c96caaccSSatish Balay ! 129c96caaccSSatish Balay ! Pack required part of vector into work array 130c96caaccSSatish Balay ! 131c96caaccSSatish Balay kdx = 0 132c96caaccSSatish Balay jstart = ai(i) 133c96caaccSSatish Balay jend = adiag(i) - 1 134c96caaccSSatish Balay if (jend - jstart .ge. 500) then 135c96caaccSSatish Balay write(6,*) 'Overflowing vector FortranSolveBAIJ4()' 136c96caaccSSatish Balay endif 1370113e719SMartin Diehl do j=jstart,jend 138c96caaccSSatish Balay 139c96caaccSSatish Balay jdx = 4*aj(j) 140c96caaccSSatish Balay 141c96caaccSSatish Balay w(kdx) = x(jdx) 142c96caaccSSatish Balay w(kdx+1) = x(jdx+1) 143c96caaccSSatish Balay w(kdx+2) = x(jdx+2) 144c96caaccSSatish Balay w(kdx+3) = x(jdx+3) 145c96caaccSSatish Balay kdx = kdx + 4 1460113e719SMartin Diehl end do 147c96caaccSSatish Balay 148c96caaccSSatish Balay ax = 16*jstart 149c96caaccSSatish Balay idx = idx + 4 150c96caaccSSatish Balay s(0) = b(idx) 151c96caaccSSatish Balay s(1) = b(idx+1) 152c96caaccSSatish Balay s(2) = b(idx+2) 153c96caaccSSatish Balay s(3) = b(idx+3) 154c96caaccSSatish Balay ! 155c96caaccSSatish Balay ! s = s - a(ax:)*w 156c96caaccSSatish Balay ! 157c96caaccSSatish Balay nn = 4*(jend - jstart + 1) - 1 1580113e719SMartin Diehl do ii=0,3 1590113e719SMartin Diehl do jj=0,nn 160c96caaccSSatish Balay s(ii) = s(ii) - a(ax+4*jj+ii)*w(jj) 1610113e719SMartin Diehl end do 1620113e719SMartin Diehl end do 163c96caaccSSatish Balay 164c96caaccSSatish Balay x(idx) = s(0) 165c96caaccSSatish Balay x(idx+1) = s(1) 166c96caaccSSatish Balay x(idx+2) = s(2) 167c96caaccSSatish Balay x(idx+3) = s(3) 1680113e719SMartin Diehl end do 169c96caaccSSatish Balay ! 170c96caaccSSatish Balay ! Backward solve the upper triangular 171c96caaccSSatish Balay ! 1720113e719SMartin Diehl do i=n-1,0,-1 173c96caaccSSatish Balay jstart = adiag(i) + 1 174c96caaccSSatish Balay jend = ai(i+1) - 1 175c96caaccSSatish Balay ax = 16*jstart 176c96caaccSSatish Balay s(0) = x(idx) 177c96caaccSSatish Balay s(1) = x(idx+1) 178c96caaccSSatish Balay s(2) = x(idx+2) 179c96caaccSSatish Balay s(3) = x(idx+3) 180c96caaccSSatish Balay ! 181c96caaccSSatish Balay ! Pack each chunk of vector needed 182c96caaccSSatish Balay ! 183c96caaccSSatish Balay kdx = 0 184c96caaccSSatish Balay if (jend - jstart .ge. 500) then 185c96caaccSSatish Balay write(6,*) 'Overflowing vector FortranSolveBAIJ4()' 186c96caaccSSatish Balay endif 1870113e719SMartin Diehl do j=jstart,jend 188c96caaccSSatish Balay jdx = 4*aj(j) 189c96caaccSSatish Balay w(kdx) = x(jdx) 190c96caaccSSatish Balay w(kdx+1) = x(jdx+1) 191c96caaccSSatish Balay w(kdx+2) = x(jdx+2) 192c96caaccSSatish Balay w(kdx+3) = x(jdx+3) 193c96caaccSSatish Balay kdx = kdx + 4 1940113e719SMartin Diehl end do 195c96caaccSSatish Balay nn = 4*(jend - jstart + 1) - 1 1960113e719SMartin Diehl do ii=0,3 1970113e719SMartin Diehl do jj=0,nn 198c96caaccSSatish Balay s(ii) = s(ii) - a(ax+4*jj+ii)*w(jj) 1990113e719SMartin Diehl end do 2000113e719SMartin Diehl end do 201c96caaccSSatish Balay 202c96caaccSSatish Balay ax = 16*adiag(i) 203c96caaccSSatish Balay x(idx) = a(ax)*s(0) +a(ax+4)*s(1)+a(ax+8)*s(2) +a(ax+12)*s(3) 204c96caaccSSatish Balay x(idx+1)= a(ax+1)*s(0)+a(ax+5)*s(1)+a(ax+9)*s(2) +a(ax+13)*s(3) 205c96caaccSSatish Balay x(idx+2)= a(ax+2)*s(0)+a(ax+6)*s(1)+a(ax+10)*s(2)+a(ax+14)*s(3) 206c96caaccSSatish Balay x(idx+3)= a(ax+3)*s(0)+a(ax+7)*s(1)+a(ax+11)*s(2)+a(ax+15)*s(3) 207c96caaccSSatish Balay idx = idx - 4 2080113e719SMartin Diehl end do 2090113e719SMartin Diehlend subroutine FortranSolveBAIJ4 210