xref: /petsc/src/mat/impls/baij/seq/ftn-kernels/fsolvebaij.F90 (revision 0113e7197f2fe148981cb35224c0458dc22f510d)
1c96caaccSSatish Balay!
2c96caaccSSatish Balay!
3c96caaccSSatish Balay!    Fortran kernel for sparse triangular solve in the BAIJ matrix format
4c96caaccSSatish Balay! This ONLY works for factorizations in the NATURAL ORDERING, i.e.
5c96caaccSSatish Balay! with MatSolve_SeqBAIJ_4_NaturalOrdering()
6c96caaccSSatish Balay!
7c96caaccSSatish Balay#include <petsc/finclude/petscsys.h>
8c96caaccSSatish Balay!
9c96caaccSSatish Balay
10c96caaccSSatish Balaysubroutine FortranSolveBAIJ4Unroll(n,x,ai,aj,adiag,a,b)
11c96caaccSSatish Balay  implicit none
12c96caaccSSatish Balay  MatScalar   a(0:*)
13c96caaccSSatish Balay  PetscScalar x(0:*)
14c96caaccSSatish Balay  PetscScalar b(0:*)
15c96caaccSSatish Balay  PetscInt    n
16c96caaccSSatish Balay  PetscInt    ai(0:*)
17c96caaccSSatish Balay  PetscInt    aj(0:*)
18c96caaccSSatish Balay  PetscInt    adiag(0:*)
19c96caaccSSatish Balay
20c96caaccSSatish Balay  PetscInt    i,j,jstart,jend
21c96caaccSSatish Balay  PetscInt    idx,ax,jdx
22c96caaccSSatish Balay  PetscScalar s1,s2,s3,s4
23c96caaccSSatish Balay  PetscScalar x1,x2,x3,x4
24c96caaccSSatish Balay  !
25c96caaccSSatish Balay  ! Forward Solve
26c96caaccSSatish Balay  !
27c96caaccSSatish Balay  PETSC_AssertAlignx(16,a(1))
28c96caaccSSatish Balay  PETSC_AssertAlignx(16,x(1))
29c96caaccSSatish Balay  PETSC_AssertAlignx(16,b(1))
30c96caaccSSatish Balay  PETSC_AssertAlignx(16,ai(1))
31c96caaccSSatish Balay  PETSC_AssertAlignx(16,aj(1))
32c96caaccSSatish Balay  PETSC_AssertAlignx(16,adiag(1))
33c96caaccSSatish Balay
34c96caaccSSatish Balay  x(0) = b(0)
35c96caaccSSatish Balay  x(1) = b(1)
36c96caaccSSatish Balay  x(2) = b(2)
37c96caaccSSatish Balay  x(3) = b(3)
38c96caaccSSatish Balay  idx  = 0
39*0113e719SMartin Diehl  do i=1,n-1
40c96caaccSSatish Balay    jstart = ai(i)
41c96caaccSSatish Balay    jend   = adiag(i) - 1
42c96caaccSSatish Balay    ax     = 16*jstart
43c96caaccSSatish Balay    idx    = idx + 4
44c96caaccSSatish Balay    s1     = b(idx)
45c96caaccSSatish Balay    s2     = b(idx+1)
46c96caaccSSatish Balay    s3     = b(idx+2)
47c96caaccSSatish Balay    s4     = b(idx+3)
48*0113e719SMartin Diehl    do j=jstart,jend
49c96caaccSSatish Balay      jdx = 4*aj(j)
50c96caaccSSatish Balay
51c96caaccSSatish Balay      x1  = x(jdx)
52c96caaccSSatish Balay      x2  = x(jdx+1)
53c96caaccSSatish Balay      x3  = x(jdx+2)
54c96caaccSSatish Balay      x4  = x(jdx+3)
55c96caaccSSatish Balay      s1  = s1-(a(ax)*x1  +a(ax+4)*x2+a(ax+8)*x3 +a(ax+12)*x4)
56c96caaccSSatish Balay      s2  = s2-(a(ax+1)*x1+a(ax+5)*x2+a(ax+9)*x3 +a(ax+13)*x4)
57c96caaccSSatish Balay      s3  = s3-(a(ax+2)*x1+a(ax+6)*x2+a(ax+10)*x3+a(ax+14)*x4)
58c96caaccSSatish Balay      s4  = s4-(a(ax+3)*x1+a(ax+7)*x2+a(ax+11)*x3+a(ax+15)*x4)
59c96caaccSSatish Balay      ax  = ax + 16
60*0113e719SMartin Diehl    end do
61c96caaccSSatish Balay    x(idx)   = s1
62c96caaccSSatish Balay    x(idx+1) = s2
63c96caaccSSatish Balay    x(idx+2) = s3
64c96caaccSSatish Balay    x(idx+3) = s4
65*0113e719SMartin Diehl  end do
66c96caaccSSatish Balay
67c96caaccSSatish Balay  !
68c96caaccSSatish Balay  ! Backward solve the upper triangular
69c96caaccSSatish Balay  !
70*0113e719SMartin Diehl  do i=n-1,0,-1
71c96caaccSSatish Balay    jstart  = adiag(i) + 1
72c96caaccSSatish Balay    jend    = ai(i+1) - 1
73c96caaccSSatish Balay    ax     = 16*jstart
74c96caaccSSatish Balay    s1      = x(idx)
75c96caaccSSatish Balay    s2      = x(idx+1)
76c96caaccSSatish Balay    s3      = x(idx+2)
77c96caaccSSatish Balay    s4      = x(idx+3)
78*0113e719SMartin Diehl    do j=jstart,jend
79c96caaccSSatish Balay      jdx   = 4*aj(j)
80c96caaccSSatish Balay      x1    = x(jdx)
81c96caaccSSatish Balay      x2    = x(jdx+1)
82c96caaccSSatish Balay      x3    = x(jdx+2)
83c96caaccSSatish Balay      x4    = x(jdx+3)
84c96caaccSSatish Balay      s1 = s1-(a(ax)*x1  +a(ax+4)*x2+a(ax+8)*x3 +a(ax+12)*x4)
85c96caaccSSatish Balay      s2 = s2-(a(ax+1)*x1+a(ax+5)*x2+a(ax+9)*x3 +a(ax+13)*x4)
86c96caaccSSatish Balay      s3 = s3-(a(ax+2)*x1+a(ax+6)*x2+a(ax+10)*x3+a(ax+14)*x4)
87c96caaccSSatish Balay      s4 = s4-(a(ax+3)*x1+a(ax+7)*x2+a(ax+11)*x3+a(ax+15)*x4)
88c96caaccSSatish Balay      ax = ax + 16
89*0113e719SMartin Diehl    end do
90c96caaccSSatish Balay    ax      = 16*adiag(i)
91c96caaccSSatish Balay    x(idx)   = a(ax)*s1  +a(ax+4)*s2+a(ax+8)*s3 +a(ax+12)*s4
92c96caaccSSatish Balay    x(idx+1) = a(ax+1)*s1+a(ax+5)*s2+a(ax+9)*s3 +a(ax+13)*s4
93c96caaccSSatish Balay    x(idx+2) = a(ax+2)*s1+a(ax+6)*s2+a(ax+10)*s3+a(ax+14)*s4
94c96caaccSSatish Balay    x(idx+3) = a(ax+3)*s1+a(ax+7)*s2+a(ax+11)*s3+a(ax+15)*s4
95c96caaccSSatish Balay    idx      = idx - 4
96*0113e719SMartin Diehl  end do
97*0113e719SMartin Diehlend subroutine FortranSolveBAIJ4Unroll
98c96caaccSSatish Balay
99c96caaccSSatish Balay!   version that does not call BLAS 2 operation for each row block
100c96caaccSSatish Balay!
101c96caaccSSatish Balaysubroutine FortranSolveBAIJ4(n,x,ai,aj,adiag,a,b,w)
102c96caaccSSatish Balay  implicit none
103c96caaccSSatish Balay  MatScalar   a(0:*)
104c96caaccSSatish Balay  PetscScalar x(0:*),b(0:*),w(0:*)
105c96caaccSSatish Balay  PetscInt  n,ai(0:*),aj(0:*),adiag(0:*)
106c96caaccSSatish Balay  PetscInt  ii,jj,i,j
107c96caaccSSatish Balay
108c96caaccSSatish Balay  PetscInt  jstart,jend,idx,ax,jdx,kdx,nn
109c96caaccSSatish Balay  PetscScalar s(0:3)
110c96caaccSSatish Balay
111c96caaccSSatish Balay  PETSC_AssertAlignx(16,a(1))
112c96caaccSSatish Balay  PETSC_AssertAlignx(16,w(1))
113c96caaccSSatish Balay  PETSC_AssertAlignx(16,x(1))
114c96caaccSSatish Balay  PETSC_AssertAlignx(16,b(1))
115c96caaccSSatish Balay  PETSC_AssertAlignx(16,ai(1))
116c96caaccSSatish Balay  PETSC_AssertAlignx(16,aj(1))
117c96caaccSSatish Balay  PETSC_AssertAlignx(16,adiag(1))
118*0113e719SMartin Diehl  !
119*0113e719SMartin Diehl  !     Forward Solve
120*0113e719SMartin Diehl  !
121c96caaccSSatish Balay  x(0) = b(0)
122c96caaccSSatish Balay  x(1) = b(1)
123c96caaccSSatish Balay  x(2) = b(2)
124c96caaccSSatish Balay  x(3) = b(3)
125c96caaccSSatish Balay  idx  = 0
126*0113e719SMartin Diehl  do i=1,n-1
127c96caaccSSatish Balay    !
128c96caaccSSatish Balay    ! Pack required part of vector into work array
129c96caaccSSatish Balay    !
130c96caaccSSatish Balay    kdx    = 0
131c96caaccSSatish Balay    jstart = ai(i)
132c96caaccSSatish Balay    jend   = adiag(i) - 1
133c96caaccSSatish Balay    if (jend - jstart .ge. 500) then
134c96caaccSSatish Balay      write(6,*) 'Overflowing vector FortranSolveBAIJ4()'
135c96caaccSSatish Balay    endif
136*0113e719SMartin Diehl    do j=jstart,jend
137c96caaccSSatish Balay
138c96caaccSSatish Balay      jdx       = 4*aj(j)
139c96caaccSSatish Balay
140c96caaccSSatish Balay      w(kdx)    = x(jdx)
141c96caaccSSatish Balay      w(kdx+1)  = x(jdx+1)
142c96caaccSSatish Balay      w(kdx+2)  = x(jdx+2)
143c96caaccSSatish Balay      w(kdx+3)  = x(jdx+3)
144c96caaccSSatish Balay      kdx       = kdx + 4
145*0113e719SMartin Diehl    end do
146c96caaccSSatish Balay
147c96caaccSSatish Balay    ax       = 16*jstart
148c96caaccSSatish Balay    idx      = idx + 4
149c96caaccSSatish Balay    s(0)     = b(idx)
150c96caaccSSatish Balay    s(1)     = b(idx+1)
151c96caaccSSatish Balay    s(2)     = b(idx+2)
152c96caaccSSatish Balay    s(3)     = b(idx+3)
153c96caaccSSatish Balay    !
154c96caaccSSatish Balay    !    s = s - a(ax:)*w
155c96caaccSSatish Balay    !
156c96caaccSSatish Balay    nn = 4*(jend - jstart + 1) - 1
157*0113e719SMartin Diehl    do ii=0,3
158*0113e719SMartin Diehl      do jj=0,nn
159c96caaccSSatish Balay        s(ii) = s(ii) - a(ax+4*jj+ii)*w(jj)
160*0113e719SMartin Diehl      end do
161*0113e719SMartin Diehl    end do
162c96caaccSSatish Balay
163c96caaccSSatish Balay    x(idx)   = s(0)
164c96caaccSSatish Balay    x(idx+1) = s(1)
165c96caaccSSatish Balay    x(idx+2) = s(2)
166c96caaccSSatish Balay    x(idx+3) = s(3)
167*0113e719SMartin Diehl  end do
168c96caaccSSatish Balay  !
169c96caaccSSatish Balay  ! Backward solve the upper triangular
170c96caaccSSatish Balay  !
171*0113e719SMartin Diehl  do i=n-1,0,-1
172c96caaccSSatish Balay     jstart    = adiag(i) + 1
173c96caaccSSatish Balay     jend      = ai(i+1) - 1
174c96caaccSSatish Balay     ax        = 16*jstart
175c96caaccSSatish Balay     s(0)      = x(idx)
176c96caaccSSatish Balay     s(1)      = x(idx+1)
177c96caaccSSatish Balay     s(2)      = x(idx+2)
178c96caaccSSatish Balay     s(3)      = x(idx+3)
179c96caaccSSatish Balay     !
180c96caaccSSatish Balay     !   Pack each chunk of vector needed
181c96caaccSSatish Balay     !
182c96caaccSSatish Balay     kdx = 0
183c96caaccSSatish Balay     if (jend - jstart .ge. 500) then
184c96caaccSSatish Balay       write(6,*) 'Overflowing vector FortranSolveBAIJ4()'
185c96caaccSSatish Balay     endif
186*0113e719SMartin Diehl     do j=jstart,jend
187c96caaccSSatish Balay       jdx      = 4*aj(j)
188c96caaccSSatish Balay       w(kdx)   = x(jdx)
189c96caaccSSatish Balay       w(kdx+1) = x(jdx+1)
190c96caaccSSatish Balay       w(kdx+2) = x(jdx+2)
191c96caaccSSatish Balay       w(kdx+3) = x(jdx+3)
192c96caaccSSatish Balay       kdx      = kdx + 4
193*0113e719SMartin Diehl     end do
194c96caaccSSatish Balay     nn = 4*(jend - jstart + 1) - 1
195*0113e719SMartin Diehl     do ii=0,3
196*0113e719SMartin Diehl       do jj=0,nn
197c96caaccSSatish Balay         s(ii) = s(ii) - a(ax+4*jj+ii)*w(jj)
198*0113e719SMartin Diehl       end do
199*0113e719SMartin Diehl     end do
200c96caaccSSatish Balay
201c96caaccSSatish Balay     ax      = 16*adiag(i)
202c96caaccSSatish Balay     x(idx)  = a(ax)*s(0)  +a(ax+4)*s(1)+a(ax+8)*s(2) +a(ax+12)*s(3)
203c96caaccSSatish Balay     x(idx+1)= a(ax+1)*s(0)+a(ax+5)*s(1)+a(ax+9)*s(2) +a(ax+13)*s(3)
204c96caaccSSatish Balay     x(idx+2)= a(ax+2)*s(0)+a(ax+6)*s(1)+a(ax+10)*s(2)+a(ax+14)*s(3)
205c96caaccSSatish Balay     x(idx+3)= a(ax+3)*s(0)+a(ax+7)*s(1)+a(ax+11)*s(2)+a(ax+15)*s(3)
206c96caaccSSatish Balay     idx     = idx - 4
207*0113e719SMartin Diehl  end do
208c96caaccSSatish Balay
209*0113e719SMartin Diehlend subroutine FortranSolveBAIJ4
210