1*c96caaccSSatish Balay! 2*c96caaccSSatish Balay! 3*c96caaccSSatish Balay! Fortran kernel for sparse triangular solve in the BAIJ matrix format 4*c96caaccSSatish Balay! This ONLY works for factorizations in the NATURAL ORDERING, i.e. 5*c96caaccSSatish Balay! with MatSolve_SeqBAIJ_4_NaturalOrdering() 6*c96caaccSSatish Balay! 7*c96caaccSSatish Balay#include <petsc/finclude/petscsys.h> 8*c96caaccSSatish Balay! 9*c96caaccSSatish Balay 10*c96caaccSSatish Balay subroutine FortranSolveBAIJ4Unroll(n,x,ai,aj,adiag,a,b) 11*c96caaccSSatish Balay implicit none 12*c96caaccSSatish Balay MatScalar a(0:*) 13*c96caaccSSatish Balay PetscScalar x(0:*) 14*c96caaccSSatish Balay PetscScalar b(0:*) 15*c96caaccSSatish Balay PetscInt n 16*c96caaccSSatish Balay PetscInt ai(0:*) 17*c96caaccSSatish Balay PetscInt aj(0:*) 18*c96caaccSSatish Balay PetscInt adiag(0:*) 19*c96caaccSSatish Balay 20*c96caaccSSatish Balay PetscInt i,j,jstart,jend 21*c96caaccSSatish Balay PetscInt idx,ax,jdx 22*c96caaccSSatish Balay PetscScalar s1,s2,s3,s4 23*c96caaccSSatish Balay PetscScalar x1,x2,x3,x4 24*c96caaccSSatish Balay! 25*c96caaccSSatish Balay! Forward Solve 26*c96caaccSSatish Balay! 27*c96caaccSSatish Balay PETSC_AssertAlignx(16,a(1)) 28*c96caaccSSatish Balay PETSC_AssertAlignx(16,x(1)) 29*c96caaccSSatish Balay PETSC_AssertAlignx(16,b(1)) 30*c96caaccSSatish Balay PETSC_AssertAlignx(16,ai(1)) 31*c96caaccSSatish Balay PETSC_AssertAlignx(16,aj(1)) 32*c96caaccSSatish Balay PETSC_AssertAlignx(16,adiag(1)) 33*c96caaccSSatish Balay 34*c96caaccSSatish Balay x(0) = b(0) 35*c96caaccSSatish Balay x(1) = b(1) 36*c96caaccSSatish Balay x(2) = b(2) 37*c96caaccSSatish Balay x(3) = b(3) 38*c96caaccSSatish Balay idx = 0 39*c96caaccSSatish Balay do 20 i=1,n-1 40*c96caaccSSatish Balay jstart = ai(i) 41*c96caaccSSatish Balay jend = adiag(i) - 1 42*c96caaccSSatish Balay ax = 16*jstart 43*c96caaccSSatish Balay idx = idx + 4 44*c96caaccSSatish Balay s1 = b(idx) 45*c96caaccSSatish Balay s2 = b(idx+1) 46*c96caaccSSatish Balay s3 = b(idx+2) 47*c96caaccSSatish Balay s4 = b(idx+3) 48*c96caaccSSatish Balay do 30 j=jstart,jend 49*c96caaccSSatish Balay jdx = 4*aj(j) 50*c96caaccSSatish Balay 51*c96caaccSSatish Balay x1 = x(jdx) 52*c96caaccSSatish Balay x2 = x(jdx+1) 53*c96caaccSSatish Balay x3 = x(jdx+2) 54*c96caaccSSatish Balay x4 = x(jdx+3) 55*c96caaccSSatish Balay s1 = s1-(a(ax)*x1 +a(ax+4)*x2+a(ax+8)*x3 +a(ax+12)*x4) 56*c96caaccSSatish Balay s2 = s2-(a(ax+1)*x1+a(ax+5)*x2+a(ax+9)*x3 +a(ax+13)*x4) 57*c96caaccSSatish Balay s3 = s3-(a(ax+2)*x1+a(ax+6)*x2+a(ax+10)*x3+a(ax+14)*x4) 58*c96caaccSSatish Balay s4 = s4-(a(ax+3)*x1+a(ax+7)*x2+a(ax+11)*x3+a(ax+15)*x4) 59*c96caaccSSatish Balay ax = ax + 16 60*c96caaccSSatish Balay 30 continue 61*c96caaccSSatish Balay x(idx) = s1 62*c96caaccSSatish Balay x(idx+1) = s2 63*c96caaccSSatish Balay x(idx+2) = s3 64*c96caaccSSatish Balay x(idx+3) = s4 65*c96caaccSSatish Balay 20 continue 66*c96caaccSSatish Balay 67*c96caaccSSatish Balay 68*c96caaccSSatish Balay! 69*c96caaccSSatish Balay! Backward solve the upper triangular 70*c96caaccSSatish Balay! 71*c96caaccSSatish Balay do 40 i=n-1,0,-1 72*c96caaccSSatish Balay jstart = adiag(i) + 1 73*c96caaccSSatish Balay jend = ai(i+1) - 1 74*c96caaccSSatish Balay ax = 16*jstart 75*c96caaccSSatish Balay s1 = x(idx) 76*c96caaccSSatish Balay s2 = x(idx+1) 77*c96caaccSSatish Balay s3 = x(idx+2) 78*c96caaccSSatish Balay s4 = x(idx+3) 79*c96caaccSSatish Balay do 50 j=jstart,jend 80*c96caaccSSatish Balay jdx = 4*aj(j) 81*c96caaccSSatish Balay x1 = x(jdx) 82*c96caaccSSatish Balay x2 = x(jdx+1) 83*c96caaccSSatish Balay x3 = x(jdx+2) 84*c96caaccSSatish Balay x4 = x(jdx+3) 85*c96caaccSSatish Balay s1 = s1-(a(ax)*x1 +a(ax+4)*x2+a(ax+8)*x3 +a(ax+12)*x4) 86*c96caaccSSatish Balay s2 = s2-(a(ax+1)*x1+a(ax+5)*x2+a(ax+9)*x3 +a(ax+13)*x4) 87*c96caaccSSatish Balay s3 = s3-(a(ax+2)*x1+a(ax+6)*x2+a(ax+10)*x3+a(ax+14)*x4) 88*c96caaccSSatish Balay s4 = s4-(a(ax+3)*x1+a(ax+7)*x2+a(ax+11)*x3+a(ax+15)*x4) 89*c96caaccSSatish Balay ax = ax + 16 90*c96caaccSSatish Balay 50 continue 91*c96caaccSSatish Balay ax = 16*adiag(i) 92*c96caaccSSatish Balay x(idx) = a(ax)*s1 +a(ax+4)*s2+a(ax+8)*s3 +a(ax+12)*s4 93*c96caaccSSatish Balay x(idx+1) = a(ax+1)*s1+a(ax+5)*s2+a(ax+9)*s3 +a(ax+13)*s4 94*c96caaccSSatish Balay x(idx+2) = a(ax+2)*s1+a(ax+6)*s2+a(ax+10)*s3+a(ax+14)*s4 95*c96caaccSSatish Balay x(idx+3) = a(ax+3)*s1+a(ax+7)*s2+a(ax+11)*s3+a(ax+15)*s4 96*c96caaccSSatish Balay idx = idx - 4 97*c96caaccSSatish Balay 40 continue 98*c96caaccSSatish Balay return 99*c96caaccSSatish Balay end 100*c96caaccSSatish Balay 101*c96caaccSSatish Balay! version that does not call BLAS 2 operation for each row block 102*c96caaccSSatish Balay! 103*c96caaccSSatish Balay subroutine FortranSolveBAIJ4(n,x,ai,aj,adiag,a,b,w) 104*c96caaccSSatish Balay implicit none 105*c96caaccSSatish Balay MatScalar a(0:*) 106*c96caaccSSatish Balay PetscScalar x(0:*),b(0:*),w(0:*) 107*c96caaccSSatish Balay PetscInt n,ai(0:*),aj(0:*),adiag(0:*) 108*c96caaccSSatish Balay PetscInt ii,jj,i,j 109*c96caaccSSatish Balay 110*c96caaccSSatish Balay PetscInt jstart,jend,idx,ax,jdx,kdx,nn 111*c96caaccSSatish Balay PetscScalar s(0:3) 112*c96caaccSSatish Balay 113*c96caaccSSatish Balay! 114*c96caaccSSatish Balay! Forward Solve 115*c96caaccSSatish Balay! 116*c96caaccSSatish Balay 117*c96caaccSSatish Balay PETSC_AssertAlignx(16,a(1)) 118*c96caaccSSatish Balay PETSC_AssertAlignx(16,w(1)) 119*c96caaccSSatish Balay PETSC_AssertAlignx(16,x(1)) 120*c96caaccSSatish Balay PETSC_AssertAlignx(16,b(1)) 121*c96caaccSSatish Balay PETSC_AssertAlignx(16,ai(1)) 122*c96caaccSSatish Balay PETSC_AssertAlignx(16,aj(1)) 123*c96caaccSSatish Balay PETSC_AssertAlignx(16,adiag(1)) 124*c96caaccSSatish Balay 125*c96caaccSSatish Balay x(0) = b(0) 126*c96caaccSSatish Balay x(1) = b(1) 127*c96caaccSSatish Balay x(2) = b(2) 128*c96caaccSSatish Balay x(3) = b(3) 129*c96caaccSSatish Balay idx = 0 130*c96caaccSSatish Balay do 20 i=1,n-1 131*c96caaccSSatish Balay! 132*c96caaccSSatish Balay! Pack required part of vector into work array 133*c96caaccSSatish Balay! 134*c96caaccSSatish Balay kdx = 0 135*c96caaccSSatish Balay jstart = ai(i) 136*c96caaccSSatish Balay jend = adiag(i) - 1 137*c96caaccSSatish Balay if (jend - jstart .ge. 500) then 138*c96caaccSSatish Balay write(6,*) 'Overflowing vector FortranSolveBAIJ4()' 139*c96caaccSSatish Balay endif 140*c96caaccSSatish Balay do 30 j=jstart,jend 141*c96caaccSSatish Balay 142*c96caaccSSatish Balay jdx = 4*aj(j) 143*c96caaccSSatish Balay 144*c96caaccSSatish Balay w(kdx) = x(jdx) 145*c96caaccSSatish Balay w(kdx+1) = x(jdx+1) 146*c96caaccSSatish Balay w(kdx+2) = x(jdx+2) 147*c96caaccSSatish Balay w(kdx+3) = x(jdx+3) 148*c96caaccSSatish Balay kdx = kdx + 4 149*c96caaccSSatish Balay 30 continue 150*c96caaccSSatish Balay 151*c96caaccSSatish Balay ax = 16*jstart 152*c96caaccSSatish Balay idx = idx + 4 153*c96caaccSSatish Balay s(0) = b(idx) 154*c96caaccSSatish Balay s(1) = b(idx+1) 155*c96caaccSSatish Balay s(2) = b(idx+2) 156*c96caaccSSatish Balay s(3) = b(idx+3) 157*c96caaccSSatish Balay! 158*c96caaccSSatish Balay! s = s - a(ax:)*w 159*c96caaccSSatish Balay! 160*c96caaccSSatish Balay nn = 4*(jend - jstart + 1) - 1 161*c96caaccSSatish Balay do 100, ii=0,3 162*c96caaccSSatish Balay do 110, jj=0,nn 163*c96caaccSSatish Balay s(ii) = s(ii) - a(ax+4*jj+ii)*w(jj) 164*c96caaccSSatish Balay 110 continue 165*c96caaccSSatish Balay 100 continue 166*c96caaccSSatish Balay 167*c96caaccSSatish Balay x(idx) = s(0) 168*c96caaccSSatish Balay x(idx+1) = s(1) 169*c96caaccSSatish Balay x(idx+2) = s(2) 170*c96caaccSSatish Balay x(idx+3) = s(3) 171*c96caaccSSatish Balay 20 continue 172*c96caaccSSatish Balay 173*c96caaccSSatish Balay! 174*c96caaccSSatish Balay! Backward solve the upper triangular 175*c96caaccSSatish Balay! 176*c96caaccSSatish Balay do 40 i=n-1,0,-1 177*c96caaccSSatish Balay jstart = adiag(i) + 1 178*c96caaccSSatish Balay jend = ai(i+1) - 1 179*c96caaccSSatish Balay ax = 16*jstart 180*c96caaccSSatish Balay s(0) = x(idx) 181*c96caaccSSatish Balay s(1) = x(idx+1) 182*c96caaccSSatish Balay s(2) = x(idx+2) 183*c96caaccSSatish Balay s(3) = x(idx+3) 184*c96caaccSSatish Balay! 185*c96caaccSSatish Balay! Pack each chunk of vector needed 186*c96caaccSSatish Balay! 187*c96caaccSSatish Balay kdx = 0 188*c96caaccSSatish Balay if (jend - jstart .ge. 500) then 189*c96caaccSSatish Balay write(6,*) 'Overflowing vector FortranSolveBAIJ4()' 190*c96caaccSSatish Balay endif 191*c96caaccSSatish Balay do 50 j=jstart,jend 192*c96caaccSSatish Balay jdx = 4*aj(j) 193*c96caaccSSatish Balay w(kdx) = x(jdx) 194*c96caaccSSatish Balay w(kdx+1) = x(jdx+1) 195*c96caaccSSatish Balay w(kdx+2) = x(jdx+2) 196*c96caaccSSatish Balay w(kdx+3) = x(jdx+3) 197*c96caaccSSatish Balay kdx = kdx + 4 198*c96caaccSSatish Balay 50 continue 199*c96caaccSSatish Balay nn = 4*(jend - jstart + 1) - 1 200*c96caaccSSatish Balay do 200, ii=0,3 201*c96caaccSSatish Balay do 210, jj=0,nn 202*c96caaccSSatish Balay s(ii) = s(ii) - a(ax+4*jj+ii)*w(jj) 203*c96caaccSSatish Balay 210 continue 204*c96caaccSSatish Balay 200 continue 205*c96caaccSSatish Balay 206*c96caaccSSatish Balay ax = 16*adiag(i) 207*c96caaccSSatish Balay x(idx) = a(ax)*s(0) +a(ax+4)*s(1)+a(ax+8)*s(2) +a(ax+12)*s(3) 208*c96caaccSSatish Balay x(idx+1)= a(ax+1)*s(0)+a(ax+5)*s(1)+a(ax+9)*s(2) +a(ax+13)*s(3) 209*c96caaccSSatish Balay x(idx+2)= a(ax+2)*s(0)+a(ax+6)*s(1)+a(ax+10)*s(2)+a(ax+14)*s(3) 210*c96caaccSSatish Balay x(idx+3)= a(ax+3)*s(0)+a(ax+7)*s(1)+a(ax+11)*s(2)+a(ax+15)*s(3) 211*c96caaccSSatish Balay idx = idx - 4 212*c96caaccSSatish Balay 40 continue 213*c96caaccSSatish Balay 214*c96caaccSSatish Balay return 215*c96caaccSSatish Balay end 216*c96caaccSSatish Balay 217