1c96caaccSSatish Balay! 2c96caaccSSatish Balay! 3c96caaccSSatish Balay! Fortran kernel for sparse triangular solve in the BAIJ matrix format 4c96caaccSSatish Balay! This ONLY works for factorizations in the NATURAL ORDERING, i.e. 5c96caaccSSatish Balay! with MatSolve_SeqBAIJ_4_NaturalOrdering() 6c96caaccSSatish Balay! 7c96caaccSSatish Balay#include <petsc/finclude/petscsys.h> 8c96caaccSSatish Balay! 9c96caaccSSatish Balay 10c96caaccSSatish Balaysubroutine FortranSolveBAIJ4Unroll(n,x,ai,aj,adiag,a,b) 11c96caaccSSatish Balay implicit none 12c96caaccSSatish Balay MatScalar a(0:*) 13c96caaccSSatish Balay PetscScalar x(0:*) 14c96caaccSSatish Balay PetscScalar b(0:*) 15c96caaccSSatish Balay PetscInt n 16c96caaccSSatish Balay PetscInt ai(0:*) 17c96caaccSSatish Balay PetscInt aj(0:*) 18c96caaccSSatish Balay PetscInt adiag(0:*) 19c96caaccSSatish Balay 20c96caaccSSatish Balay PetscInt i,j,jstart,jend 21c96caaccSSatish Balay PetscInt idx,ax,jdx 22c96caaccSSatish Balay PetscScalar s1,s2,s3,s4 23c96caaccSSatish Balay PetscScalar x1,x2,x3,x4 24c96caaccSSatish Balay ! 25c96caaccSSatish Balay ! Forward Solve 26c96caaccSSatish Balay ! 27c96caaccSSatish Balay PETSC_AssertAlignx(16,a(1)) 28c96caaccSSatish Balay PETSC_AssertAlignx(16,x(1)) 29c96caaccSSatish Balay PETSC_AssertAlignx(16,b(1)) 30c96caaccSSatish Balay PETSC_AssertAlignx(16,ai(1)) 31c96caaccSSatish Balay PETSC_AssertAlignx(16,aj(1)) 32c96caaccSSatish Balay PETSC_AssertAlignx(16,adiag(1)) 33c96caaccSSatish Balay 34c96caaccSSatish Balay x(0) = b(0) 35c96caaccSSatish Balay x(1) = b(1) 36c96caaccSSatish Balay x(2) = b(2) 37c96caaccSSatish Balay x(3) = b(3) 38c96caaccSSatish Balay idx = 0 39*0113e719SMartin Diehl do i=1,n-1 40c96caaccSSatish Balay jstart = ai(i) 41c96caaccSSatish Balay jend = adiag(i) - 1 42c96caaccSSatish Balay ax = 16*jstart 43c96caaccSSatish Balay idx = idx + 4 44c96caaccSSatish Balay s1 = b(idx) 45c96caaccSSatish Balay s2 = b(idx+1) 46c96caaccSSatish Balay s3 = b(idx+2) 47c96caaccSSatish Balay s4 = b(idx+3) 48*0113e719SMartin Diehl do j=jstart,jend 49c96caaccSSatish Balay jdx = 4*aj(j) 50c96caaccSSatish Balay 51c96caaccSSatish Balay x1 = x(jdx) 52c96caaccSSatish Balay x2 = x(jdx+1) 53c96caaccSSatish Balay x3 = x(jdx+2) 54c96caaccSSatish Balay x4 = x(jdx+3) 55c96caaccSSatish Balay s1 = s1-(a(ax)*x1 +a(ax+4)*x2+a(ax+8)*x3 +a(ax+12)*x4) 56c96caaccSSatish Balay s2 = s2-(a(ax+1)*x1+a(ax+5)*x2+a(ax+9)*x3 +a(ax+13)*x4) 57c96caaccSSatish Balay s3 = s3-(a(ax+2)*x1+a(ax+6)*x2+a(ax+10)*x3+a(ax+14)*x4) 58c96caaccSSatish Balay s4 = s4-(a(ax+3)*x1+a(ax+7)*x2+a(ax+11)*x3+a(ax+15)*x4) 59c96caaccSSatish Balay ax = ax + 16 60*0113e719SMartin Diehl end do 61c96caaccSSatish Balay x(idx) = s1 62c96caaccSSatish Balay x(idx+1) = s2 63c96caaccSSatish Balay x(idx+2) = s3 64c96caaccSSatish Balay x(idx+3) = s4 65*0113e719SMartin Diehl end do 66c96caaccSSatish Balay 67c96caaccSSatish Balay ! 68c96caaccSSatish Balay ! Backward solve the upper triangular 69c96caaccSSatish Balay ! 70*0113e719SMartin Diehl do i=n-1,0,-1 71c96caaccSSatish Balay jstart = adiag(i) + 1 72c96caaccSSatish Balay jend = ai(i+1) - 1 73c96caaccSSatish Balay ax = 16*jstart 74c96caaccSSatish Balay s1 = x(idx) 75c96caaccSSatish Balay s2 = x(idx+1) 76c96caaccSSatish Balay s3 = x(idx+2) 77c96caaccSSatish Balay s4 = x(idx+3) 78*0113e719SMartin Diehl do j=jstart,jend 79c96caaccSSatish Balay jdx = 4*aj(j) 80c96caaccSSatish Balay x1 = x(jdx) 81c96caaccSSatish Balay x2 = x(jdx+1) 82c96caaccSSatish Balay x3 = x(jdx+2) 83c96caaccSSatish Balay x4 = x(jdx+3) 84c96caaccSSatish Balay s1 = s1-(a(ax)*x1 +a(ax+4)*x2+a(ax+8)*x3 +a(ax+12)*x4) 85c96caaccSSatish Balay s2 = s2-(a(ax+1)*x1+a(ax+5)*x2+a(ax+9)*x3 +a(ax+13)*x4) 86c96caaccSSatish Balay s3 = s3-(a(ax+2)*x1+a(ax+6)*x2+a(ax+10)*x3+a(ax+14)*x4) 87c96caaccSSatish Balay s4 = s4-(a(ax+3)*x1+a(ax+7)*x2+a(ax+11)*x3+a(ax+15)*x4) 88c96caaccSSatish Balay ax = ax + 16 89*0113e719SMartin Diehl end do 90c96caaccSSatish Balay ax = 16*adiag(i) 91c96caaccSSatish Balay x(idx) = a(ax)*s1 +a(ax+4)*s2+a(ax+8)*s3 +a(ax+12)*s4 92c96caaccSSatish Balay x(idx+1) = a(ax+1)*s1+a(ax+5)*s2+a(ax+9)*s3 +a(ax+13)*s4 93c96caaccSSatish Balay x(idx+2) = a(ax+2)*s1+a(ax+6)*s2+a(ax+10)*s3+a(ax+14)*s4 94c96caaccSSatish Balay x(idx+3) = a(ax+3)*s1+a(ax+7)*s2+a(ax+11)*s3+a(ax+15)*s4 95c96caaccSSatish Balay idx = idx - 4 96*0113e719SMartin Diehl end do 97*0113e719SMartin Diehlend subroutine FortranSolveBAIJ4Unroll 98c96caaccSSatish Balay 99c96caaccSSatish Balay! version that does not call BLAS 2 operation for each row block 100c96caaccSSatish Balay! 101c96caaccSSatish Balaysubroutine FortranSolveBAIJ4(n,x,ai,aj,adiag,a,b,w) 102c96caaccSSatish Balay implicit none 103c96caaccSSatish Balay MatScalar a(0:*) 104c96caaccSSatish Balay PetscScalar x(0:*),b(0:*),w(0:*) 105c96caaccSSatish Balay PetscInt n,ai(0:*),aj(0:*),adiag(0:*) 106c96caaccSSatish Balay PetscInt ii,jj,i,j 107c96caaccSSatish Balay 108c96caaccSSatish Balay PetscInt jstart,jend,idx,ax,jdx,kdx,nn 109c96caaccSSatish Balay PetscScalar s(0:3) 110c96caaccSSatish Balay 111c96caaccSSatish Balay PETSC_AssertAlignx(16,a(1)) 112c96caaccSSatish Balay PETSC_AssertAlignx(16,w(1)) 113c96caaccSSatish Balay PETSC_AssertAlignx(16,x(1)) 114c96caaccSSatish Balay PETSC_AssertAlignx(16,b(1)) 115c96caaccSSatish Balay PETSC_AssertAlignx(16,ai(1)) 116c96caaccSSatish Balay PETSC_AssertAlignx(16,aj(1)) 117c96caaccSSatish Balay PETSC_AssertAlignx(16,adiag(1)) 118*0113e719SMartin Diehl ! 119*0113e719SMartin Diehl ! Forward Solve 120*0113e719SMartin Diehl ! 121c96caaccSSatish Balay x(0) = b(0) 122c96caaccSSatish Balay x(1) = b(1) 123c96caaccSSatish Balay x(2) = b(2) 124c96caaccSSatish Balay x(3) = b(3) 125c96caaccSSatish Balay idx = 0 126*0113e719SMartin Diehl do i=1,n-1 127c96caaccSSatish Balay ! 128c96caaccSSatish Balay ! Pack required part of vector into work array 129c96caaccSSatish Balay ! 130c96caaccSSatish Balay kdx = 0 131c96caaccSSatish Balay jstart = ai(i) 132c96caaccSSatish Balay jend = adiag(i) - 1 133c96caaccSSatish Balay if (jend - jstart .ge. 500) then 134c96caaccSSatish Balay write(6,*) 'Overflowing vector FortranSolveBAIJ4()' 135c96caaccSSatish Balay endif 136*0113e719SMartin Diehl do j=jstart,jend 137c96caaccSSatish Balay 138c96caaccSSatish Balay jdx = 4*aj(j) 139c96caaccSSatish Balay 140c96caaccSSatish Balay w(kdx) = x(jdx) 141c96caaccSSatish Balay w(kdx+1) = x(jdx+1) 142c96caaccSSatish Balay w(kdx+2) = x(jdx+2) 143c96caaccSSatish Balay w(kdx+3) = x(jdx+3) 144c96caaccSSatish Balay kdx = kdx + 4 145*0113e719SMartin Diehl end do 146c96caaccSSatish Balay 147c96caaccSSatish Balay ax = 16*jstart 148c96caaccSSatish Balay idx = idx + 4 149c96caaccSSatish Balay s(0) = b(idx) 150c96caaccSSatish Balay s(1) = b(idx+1) 151c96caaccSSatish Balay s(2) = b(idx+2) 152c96caaccSSatish Balay s(3) = b(idx+3) 153c96caaccSSatish Balay ! 154c96caaccSSatish Balay ! s = s - a(ax:)*w 155c96caaccSSatish Balay ! 156c96caaccSSatish Balay nn = 4*(jend - jstart + 1) - 1 157*0113e719SMartin Diehl do ii=0,3 158*0113e719SMartin Diehl do jj=0,nn 159c96caaccSSatish Balay s(ii) = s(ii) - a(ax+4*jj+ii)*w(jj) 160*0113e719SMartin Diehl end do 161*0113e719SMartin Diehl end do 162c96caaccSSatish Balay 163c96caaccSSatish Balay x(idx) = s(0) 164c96caaccSSatish Balay x(idx+1) = s(1) 165c96caaccSSatish Balay x(idx+2) = s(2) 166c96caaccSSatish Balay x(idx+3) = s(3) 167*0113e719SMartin Diehl end do 168c96caaccSSatish Balay ! 169c96caaccSSatish Balay ! Backward solve the upper triangular 170c96caaccSSatish Balay ! 171*0113e719SMartin Diehl do i=n-1,0,-1 172c96caaccSSatish Balay jstart = adiag(i) + 1 173c96caaccSSatish Balay jend = ai(i+1) - 1 174c96caaccSSatish Balay ax = 16*jstart 175c96caaccSSatish Balay s(0) = x(idx) 176c96caaccSSatish Balay s(1) = x(idx+1) 177c96caaccSSatish Balay s(2) = x(idx+2) 178c96caaccSSatish Balay s(3) = x(idx+3) 179c96caaccSSatish Balay ! 180c96caaccSSatish Balay ! Pack each chunk of vector needed 181c96caaccSSatish Balay ! 182c96caaccSSatish Balay kdx = 0 183c96caaccSSatish Balay if (jend - jstart .ge. 500) then 184c96caaccSSatish Balay write(6,*) 'Overflowing vector FortranSolveBAIJ4()' 185c96caaccSSatish Balay endif 186*0113e719SMartin Diehl do j=jstart,jend 187c96caaccSSatish Balay jdx = 4*aj(j) 188c96caaccSSatish Balay w(kdx) = x(jdx) 189c96caaccSSatish Balay w(kdx+1) = x(jdx+1) 190c96caaccSSatish Balay w(kdx+2) = x(jdx+2) 191c96caaccSSatish Balay w(kdx+3) = x(jdx+3) 192c96caaccSSatish Balay kdx = kdx + 4 193*0113e719SMartin Diehl end do 194c96caaccSSatish Balay nn = 4*(jend - jstart + 1) - 1 195*0113e719SMartin Diehl do ii=0,3 196*0113e719SMartin Diehl do jj=0,nn 197c96caaccSSatish Balay s(ii) = s(ii) - a(ax+4*jj+ii)*w(jj) 198*0113e719SMartin Diehl end do 199*0113e719SMartin Diehl end do 200c96caaccSSatish Balay 201c96caaccSSatish Balay ax = 16*adiag(i) 202c96caaccSSatish Balay x(idx) = a(ax)*s(0) +a(ax+4)*s(1)+a(ax+8)*s(2) +a(ax+12)*s(3) 203c96caaccSSatish Balay x(idx+1)= a(ax+1)*s(0)+a(ax+5)*s(1)+a(ax+9)*s(2) +a(ax+13)*s(3) 204c96caaccSSatish Balay x(idx+2)= a(ax+2)*s(0)+a(ax+6)*s(1)+a(ax+10)*s(2)+a(ax+14)*s(3) 205c96caaccSSatish Balay x(idx+3)= a(ax+3)*s(0)+a(ax+7)*s(1)+a(ax+11)*s(2)+a(ax+15)*s(3) 206c96caaccSSatish Balay idx = idx - 4 207*0113e719SMartin Diehl end do 208c96caaccSSatish Balay 209*0113e719SMartin Diehlend subroutine FortranSolveBAIJ4 210