xref: /petsc/src/mat/impls/baij/seq/ftn-kernels/fsolvebaij.F90 (revision 0ccf82ac36648ce52b79cfc8b55f689a1594b19a)
1c96caaccSSatish Balay!
2c96caaccSSatish Balay!
3c96caaccSSatish Balay!    Fortran kernel for sparse triangular solve in the BAIJ matrix format
4c96caaccSSatish Balay! This ONLY works for factorizations in the NATURAL ORDERING, i.e.
5c96caaccSSatish Balay! with MatSolve_SeqBAIJ_4_NaturalOrdering()
6c96caaccSSatish Balay!
7c96caaccSSatish Balay#include <petsc/finclude/petscsys.h>
8c96caaccSSatish Balay!
9c96caaccSSatish Balay
10*0ccf82acSMartin Diehlpure subroutine FortranSolveBAIJ4Unroll(n,x,ai,aj,adiag,a,b)
11*0ccf82acSMartin Diehl  implicit none (type, external)
12*0ccf82acSMartin Diehl  MatScalar, intent(in) :: a(0:*)
13*0ccf82acSMartin Diehl  PetscScalar, intent(inout) :: x(0:*)
14*0ccf82acSMartin Diehl  PetscScalar, intent(in) :: b(0:*)
15*0ccf82acSMartin Diehl  PetscInt, intent(in) :: n
16*0ccf82acSMartin Diehl  PetscInt, intent(in) :: ai(0:*), aj(0:*), adiag(0:*)
17c96caaccSSatish Balay
18*0ccf82acSMartin Diehl  PetscInt :: i,j,jstart,jend
19*0ccf82acSMartin Diehl  PetscInt :: idx,ax,jdx
20*0ccf82acSMartin Diehl  PetscScalar :: s1,s2,s3,s4
21*0ccf82acSMartin Diehl  PetscScalar :: x1,x2,x3,x4
22*0ccf82acSMartin Diehl
23c96caaccSSatish Balay  PETSC_AssertAlignx(16,a(1))
24c96caaccSSatish Balay  PETSC_AssertAlignx(16,x(1))
25c96caaccSSatish Balay  PETSC_AssertAlignx(16,b(1))
26c96caaccSSatish Balay  PETSC_AssertAlignx(16,ai(1))
27c96caaccSSatish Balay  PETSC_AssertAlignx(16,aj(1))
28c96caaccSSatish Balay  PETSC_AssertAlignx(16,adiag(1))
29c96caaccSSatish Balay
30*0ccf82acSMartin Diehl  !
31*0ccf82acSMartin Diehl  ! Forward Solve
32*0ccf82acSMartin Diehl  !
33c96caaccSSatish Balay  x(0) = b(0)
34c96caaccSSatish Balay  x(1) = b(1)
35c96caaccSSatish Balay  x(2) = b(2)
36c96caaccSSatish Balay  x(3) = b(3)
37c96caaccSSatish Balay  idx  = 0
380113e719SMartin Diehl  do i=1,n-1
39c96caaccSSatish Balay    jstart = ai(i)
40c96caaccSSatish Balay    jend   = adiag(i) - 1
41c96caaccSSatish Balay    ax     = 16*jstart
42c96caaccSSatish Balay    idx    = idx + 4
43c96caaccSSatish Balay    s1     = b(idx)
44c96caaccSSatish Balay    s2     = b(idx+1)
45c96caaccSSatish Balay    s3     = b(idx+2)
46c96caaccSSatish Balay    s4     = b(idx+3)
470113e719SMartin Diehl    do j=jstart,jend
48c96caaccSSatish Balay      jdx = 4*aj(j)
49c96caaccSSatish Balay
50c96caaccSSatish Balay      x1  = x(jdx)
51c96caaccSSatish Balay      x2  = x(jdx+1)
52c96caaccSSatish Balay      x3  = x(jdx+2)
53c96caaccSSatish Balay      x4  = x(jdx+3)
54c96caaccSSatish Balay      s1  = s1-(a(ax)*x1  +a(ax+4)*x2+a(ax+8)*x3 +a(ax+12)*x4)
55c96caaccSSatish Balay      s2  = s2-(a(ax+1)*x1+a(ax+5)*x2+a(ax+9)*x3 +a(ax+13)*x4)
56c96caaccSSatish Balay      s3  = s3-(a(ax+2)*x1+a(ax+6)*x2+a(ax+10)*x3+a(ax+14)*x4)
57c96caaccSSatish Balay      s4  = s4-(a(ax+3)*x1+a(ax+7)*x2+a(ax+11)*x3+a(ax+15)*x4)
58c96caaccSSatish Balay      ax  = ax + 16
590113e719SMartin Diehl    end do
60c96caaccSSatish Balay    x(idx)   = s1
61c96caaccSSatish Balay    x(idx+1) = s2
62c96caaccSSatish Balay    x(idx+2) = s3
63c96caaccSSatish Balay    x(idx+3) = s4
640113e719SMartin Diehl  end do
65c96caaccSSatish Balay
66c96caaccSSatish Balay  !
67c96caaccSSatish Balay  ! Backward solve the upper triangular
68c96caaccSSatish Balay  !
690113e719SMartin Diehl  do i=n-1,0,-1
70c96caaccSSatish Balay    jstart  = adiag(i) + 1
71c96caaccSSatish Balay    jend    = ai(i+1) - 1
72c96caaccSSatish Balay    ax     = 16*jstart
73c96caaccSSatish Balay    s1      = x(idx)
74c96caaccSSatish Balay    s2      = x(idx+1)
75c96caaccSSatish Balay    s3      = x(idx+2)
76c96caaccSSatish Balay    s4      = x(idx+3)
770113e719SMartin Diehl    do j=jstart,jend
78c96caaccSSatish Balay      jdx   = 4*aj(j)
79c96caaccSSatish Balay      x1    = x(jdx)
80c96caaccSSatish Balay      x2    = x(jdx+1)
81c96caaccSSatish Balay      x3    = x(jdx+2)
82c96caaccSSatish Balay      x4    = x(jdx+3)
83c96caaccSSatish Balay      s1 = s1-(a(ax)*x1  +a(ax+4)*x2+a(ax+8)*x3 +a(ax+12)*x4)
84c96caaccSSatish Balay      s2 = s2-(a(ax+1)*x1+a(ax+5)*x2+a(ax+9)*x3 +a(ax+13)*x4)
85c96caaccSSatish Balay      s3 = s3-(a(ax+2)*x1+a(ax+6)*x2+a(ax+10)*x3+a(ax+14)*x4)
86c96caaccSSatish Balay      s4 = s4-(a(ax+3)*x1+a(ax+7)*x2+a(ax+11)*x3+a(ax+15)*x4)
87c96caaccSSatish Balay      ax = ax + 16
880113e719SMartin Diehl    end do
89c96caaccSSatish Balay    ax      = 16*adiag(i)
90c96caaccSSatish Balay    x(idx)   = a(ax)*s1  +a(ax+4)*s2+a(ax+8)*s3 +a(ax+12)*s4
91c96caaccSSatish Balay    x(idx+1) = a(ax+1)*s1+a(ax+5)*s2+a(ax+9)*s3 +a(ax+13)*s4
92c96caaccSSatish Balay    x(idx+2) = a(ax+2)*s1+a(ax+6)*s2+a(ax+10)*s3+a(ax+14)*s4
93c96caaccSSatish Balay    x(idx+3) = a(ax+3)*s1+a(ax+7)*s2+a(ax+11)*s3+a(ax+15)*s4
94c96caaccSSatish Balay    idx      = idx - 4
950113e719SMartin Diehl  end do
960113e719SMartin Diehlend subroutine FortranSolveBAIJ4Unroll
97c96caaccSSatish Balay
98c96caaccSSatish Balay!   version that does not call BLAS 2 operation for each row block
99c96caaccSSatish Balay!
100c96caaccSSatish Balaysubroutine FortranSolveBAIJ4(n,x,ai,aj,adiag,a,b,w)
101c96caaccSSatish Balay  implicit none
102*0ccf82acSMartin Diehl  MatScalar, intent(in) :: a(0:*)
103*0ccf82acSMartin Diehl  PetscScalar, intent(inout) :: x(0:*),w(0:*)
104*0ccf82acSMartin Diehl  PetscScalar, intent(in) :: b(0:*)
105*0ccf82acSMartin Diehl  PetscInt, intent(in) :: n
106*0ccf82acSMartin Diehl  PetscInt, intent(in) :: ai(0:*), aj(0:*), adiag(0:*)
107c96caaccSSatish Balay
108*0ccf82acSMartin Diehl  PetscInt :: ii,jj,i,j
109*0ccf82acSMartin Diehl  PetscInt :: jstart,jend,idx,ax,jdx,kdx,nn
110*0ccf82acSMartin Diehl  PetscScalar :: s(0:3)
111c96caaccSSatish Balay
112c96caaccSSatish Balay  PETSC_AssertAlignx(16,a(1))
113c96caaccSSatish Balay  PETSC_AssertAlignx(16,w(1))
114c96caaccSSatish Balay  PETSC_AssertAlignx(16,x(1))
115c96caaccSSatish Balay  PETSC_AssertAlignx(16,b(1))
116c96caaccSSatish Balay  PETSC_AssertAlignx(16,ai(1))
117c96caaccSSatish Balay  PETSC_AssertAlignx(16,aj(1))
118c96caaccSSatish Balay  PETSC_AssertAlignx(16,adiag(1))
1190113e719SMartin Diehl  !
1200113e719SMartin Diehl  !     Forward Solve
1210113e719SMartin Diehl  !
122c96caaccSSatish Balay  x(0) = b(0)
123c96caaccSSatish Balay  x(1) = b(1)
124c96caaccSSatish Balay  x(2) = b(2)
125c96caaccSSatish Balay  x(3) = b(3)
126c96caaccSSatish Balay  idx  = 0
1270113e719SMartin Diehl  do i=1,n-1
128c96caaccSSatish Balay    !
129c96caaccSSatish Balay    ! Pack required part of vector into work array
130c96caaccSSatish Balay    !
131c96caaccSSatish Balay    kdx    = 0
132c96caaccSSatish Balay    jstart = ai(i)
133c96caaccSSatish Balay    jend   = adiag(i) - 1
134c96caaccSSatish Balay    if (jend - jstart .ge. 500) then
135c96caaccSSatish Balay      write(6,*) 'Overflowing vector FortranSolveBAIJ4()'
136c96caaccSSatish Balay    endif
1370113e719SMartin Diehl    do j=jstart,jend
138c96caaccSSatish Balay
139c96caaccSSatish Balay      jdx       = 4*aj(j)
140c96caaccSSatish Balay
141c96caaccSSatish Balay      w(kdx)    = x(jdx)
142c96caaccSSatish Balay      w(kdx+1)  = x(jdx+1)
143c96caaccSSatish Balay      w(kdx+2)  = x(jdx+2)
144c96caaccSSatish Balay      w(kdx+3)  = x(jdx+3)
145c96caaccSSatish Balay      kdx       = kdx + 4
1460113e719SMartin Diehl    end do
147c96caaccSSatish Balay
148c96caaccSSatish Balay    ax       = 16*jstart
149c96caaccSSatish Balay    idx      = idx + 4
150c96caaccSSatish Balay    s(0)     = b(idx)
151c96caaccSSatish Balay    s(1)     = b(idx+1)
152c96caaccSSatish Balay    s(2)     = b(idx+2)
153c96caaccSSatish Balay    s(3)     = b(idx+3)
154c96caaccSSatish Balay    !
155c96caaccSSatish Balay    !    s = s - a(ax:)*w
156c96caaccSSatish Balay    !
157c96caaccSSatish Balay    nn = 4*(jend - jstart + 1) - 1
1580113e719SMartin Diehl    do ii=0,3
1590113e719SMartin Diehl      do jj=0,nn
160c96caaccSSatish Balay        s(ii) = s(ii) - a(ax+4*jj+ii)*w(jj)
1610113e719SMartin Diehl      end do
1620113e719SMartin Diehl    end do
163c96caaccSSatish Balay
164c96caaccSSatish Balay    x(idx)   = s(0)
165c96caaccSSatish Balay    x(idx+1) = s(1)
166c96caaccSSatish Balay    x(idx+2) = s(2)
167c96caaccSSatish Balay    x(idx+3) = s(3)
1680113e719SMartin Diehl  end do
169c96caaccSSatish Balay  !
170c96caaccSSatish Balay  ! Backward solve the upper triangular
171c96caaccSSatish Balay  !
1720113e719SMartin Diehl  do i=n-1,0,-1
173c96caaccSSatish Balay     jstart    = adiag(i) + 1
174c96caaccSSatish Balay     jend      = ai(i+1) - 1
175c96caaccSSatish Balay     ax        = 16*jstart
176c96caaccSSatish Balay     s(0)      = x(idx)
177c96caaccSSatish Balay     s(1)      = x(idx+1)
178c96caaccSSatish Balay     s(2)      = x(idx+2)
179c96caaccSSatish Balay     s(3)      = x(idx+3)
180c96caaccSSatish Balay     !
181c96caaccSSatish Balay     !   Pack each chunk of vector needed
182c96caaccSSatish Balay     !
183c96caaccSSatish Balay     kdx = 0
184c96caaccSSatish Balay     if (jend - jstart .ge. 500) then
185c96caaccSSatish Balay       write(6,*) 'Overflowing vector FortranSolveBAIJ4()'
186c96caaccSSatish Balay     endif
1870113e719SMartin Diehl     do j=jstart,jend
188c96caaccSSatish Balay       jdx      = 4*aj(j)
189c96caaccSSatish Balay       w(kdx)   = x(jdx)
190c96caaccSSatish Balay       w(kdx+1) = x(jdx+1)
191c96caaccSSatish Balay       w(kdx+2) = x(jdx+2)
192c96caaccSSatish Balay       w(kdx+3) = x(jdx+3)
193c96caaccSSatish Balay       kdx      = kdx + 4
1940113e719SMartin Diehl     end do
195c96caaccSSatish Balay     nn = 4*(jend - jstart + 1) - 1
1960113e719SMartin Diehl     do ii=0,3
1970113e719SMartin Diehl       do jj=0,nn
198c96caaccSSatish Balay         s(ii) = s(ii) - a(ax+4*jj+ii)*w(jj)
1990113e719SMartin Diehl       end do
2000113e719SMartin Diehl     end do
201c96caaccSSatish Balay
202c96caaccSSatish Balay     ax      = 16*adiag(i)
203c96caaccSSatish Balay     x(idx)  = a(ax)*s(0)  +a(ax+4)*s(1)+a(ax+8)*s(2) +a(ax+12)*s(3)
204c96caaccSSatish Balay     x(idx+1)= a(ax+1)*s(0)+a(ax+5)*s(1)+a(ax+9)*s(2) +a(ax+13)*s(3)
205c96caaccSSatish Balay     x(idx+2)= a(ax+2)*s(0)+a(ax+6)*s(1)+a(ax+10)*s(2)+a(ax+14)*s(3)
206c96caaccSSatish Balay     x(idx+3)= a(ax+3)*s(0)+a(ax+7)*s(1)+a(ax+11)*s(2)+a(ax+15)*s(3)
207c96caaccSSatish Balay     idx     = idx - 4
2080113e719SMartin Diehl  end do
2090113e719SMartin Diehlend subroutine FortranSolveBAIJ4
210