xref: /petsc/src/mat/impls/baij/seq/ftn-kernels/fsolvebaij.F90 (revision c96caacc6f00ba95b366aeae86152d44f6880d6b)
1*c96caaccSSatish Balay!
2*c96caaccSSatish Balay!
3*c96caaccSSatish Balay!    Fortran kernel for sparse triangular solve in the BAIJ matrix format
4*c96caaccSSatish Balay! This ONLY works for factorizations in the NATURAL ORDERING, i.e.
5*c96caaccSSatish Balay! with MatSolve_SeqBAIJ_4_NaturalOrdering()
6*c96caaccSSatish Balay!
7*c96caaccSSatish Balay#include <petsc/finclude/petscsys.h>
8*c96caaccSSatish Balay!
9*c96caaccSSatish Balay
10*c96caaccSSatish Balay      subroutine FortranSolveBAIJ4Unroll(n,x,ai,aj,adiag,a,b)
11*c96caaccSSatish Balay      implicit none
12*c96caaccSSatish Balay      MatScalar   a(0:*)
13*c96caaccSSatish Balay      PetscScalar x(0:*)
14*c96caaccSSatish Balay      PetscScalar b(0:*)
15*c96caaccSSatish Balay      PetscInt    n
16*c96caaccSSatish Balay      PetscInt    ai(0:*)
17*c96caaccSSatish Balay      PetscInt    aj(0:*)
18*c96caaccSSatish Balay      PetscInt    adiag(0:*)
19*c96caaccSSatish Balay
20*c96caaccSSatish Balay      PetscInt    i,j,jstart,jend
21*c96caaccSSatish Balay      PetscInt    idx,ax,jdx
22*c96caaccSSatish Balay      PetscScalar s1,s2,s3,s4
23*c96caaccSSatish Balay      PetscScalar x1,x2,x3,x4
24*c96caaccSSatish Balay!
25*c96caaccSSatish Balay!     Forward Solve
26*c96caaccSSatish Balay!
27*c96caaccSSatish Balay      PETSC_AssertAlignx(16,a(1))
28*c96caaccSSatish Balay      PETSC_AssertAlignx(16,x(1))
29*c96caaccSSatish Balay      PETSC_AssertAlignx(16,b(1))
30*c96caaccSSatish Balay      PETSC_AssertAlignx(16,ai(1))
31*c96caaccSSatish Balay      PETSC_AssertAlignx(16,aj(1))
32*c96caaccSSatish Balay      PETSC_AssertAlignx(16,adiag(1))
33*c96caaccSSatish Balay
34*c96caaccSSatish Balay         x(0) = b(0)
35*c96caaccSSatish Balay         x(1) = b(1)
36*c96caaccSSatish Balay         x(2) = b(2)
37*c96caaccSSatish Balay         x(3) = b(3)
38*c96caaccSSatish Balay         idx  = 0
39*c96caaccSSatish Balay         do 20 i=1,n-1
40*c96caaccSSatish Balay            jstart = ai(i)
41*c96caaccSSatish Balay            jend   = adiag(i) - 1
42*c96caaccSSatish Balay            ax    = 16*jstart
43*c96caaccSSatish Balay            idx    = idx + 4
44*c96caaccSSatish Balay            s1     = b(idx)
45*c96caaccSSatish Balay            s2     = b(idx+1)
46*c96caaccSSatish Balay            s3     = b(idx+2)
47*c96caaccSSatish Balay            s4     = b(idx+3)
48*c96caaccSSatish Balay            do 30 j=jstart,jend
49*c96caaccSSatish Balay              jdx   = 4*aj(j)
50*c96caaccSSatish Balay
51*c96caaccSSatish Balay              x1    = x(jdx)
52*c96caaccSSatish Balay              x2    = x(jdx+1)
53*c96caaccSSatish Balay              x3    = x(jdx+2)
54*c96caaccSSatish Balay              x4    = x(jdx+3)
55*c96caaccSSatish Balay              s1 = s1-(a(ax)*x1  +a(ax+4)*x2+a(ax+8)*x3 +a(ax+12)*x4)
56*c96caaccSSatish Balay              s2 = s2-(a(ax+1)*x1+a(ax+5)*x2+a(ax+9)*x3 +a(ax+13)*x4)
57*c96caaccSSatish Balay              s3 = s3-(a(ax+2)*x1+a(ax+6)*x2+a(ax+10)*x3+a(ax+14)*x4)
58*c96caaccSSatish Balay              s4 = s4-(a(ax+3)*x1+a(ax+7)*x2+a(ax+11)*x3+a(ax+15)*x4)
59*c96caaccSSatish Balay              ax = ax + 16
60*c96caaccSSatish Balay 30         continue
61*c96caaccSSatish Balay            x(idx)   = s1
62*c96caaccSSatish Balay            x(idx+1) = s2
63*c96caaccSSatish Balay            x(idx+2) = s3
64*c96caaccSSatish Balay            x(idx+3) = s4
65*c96caaccSSatish Balay 20      continue
66*c96caaccSSatish Balay
67*c96caaccSSatish Balay
68*c96caaccSSatish Balay!
69*c96caaccSSatish Balay!     Backward solve the upper triangular
70*c96caaccSSatish Balay!
71*c96caaccSSatish Balay         do 40 i=n-1,0,-1
72*c96caaccSSatish Balay            jstart  = adiag(i) + 1
73*c96caaccSSatish Balay            jend    = ai(i+1) - 1
74*c96caaccSSatish Balay            ax     = 16*jstart
75*c96caaccSSatish Balay            s1      = x(idx)
76*c96caaccSSatish Balay            s2      = x(idx+1)
77*c96caaccSSatish Balay            s3      = x(idx+2)
78*c96caaccSSatish Balay            s4      = x(idx+3)
79*c96caaccSSatish Balay            do 50 j=jstart,jend
80*c96caaccSSatish Balay              jdx   = 4*aj(j)
81*c96caaccSSatish Balay              x1    = x(jdx)
82*c96caaccSSatish Balay              x2    = x(jdx+1)
83*c96caaccSSatish Balay              x3    = x(jdx+2)
84*c96caaccSSatish Balay              x4    = x(jdx+3)
85*c96caaccSSatish Balay              s1 = s1-(a(ax)*x1  +a(ax+4)*x2+a(ax+8)*x3 +a(ax+12)*x4)
86*c96caaccSSatish Balay              s2 = s2-(a(ax+1)*x1+a(ax+5)*x2+a(ax+9)*x3 +a(ax+13)*x4)
87*c96caaccSSatish Balay              s3 = s3-(a(ax+2)*x1+a(ax+6)*x2+a(ax+10)*x3+a(ax+14)*x4)
88*c96caaccSSatish Balay              s4 = s4-(a(ax+3)*x1+a(ax+7)*x2+a(ax+11)*x3+a(ax+15)*x4)
89*c96caaccSSatish Balay              ax = ax + 16
90*c96caaccSSatish Balay 50         continue
91*c96caaccSSatish Balay            ax      = 16*adiag(i)
92*c96caaccSSatish Balay            x(idx)   = a(ax)*s1  +a(ax+4)*s2+a(ax+8)*s3 +a(ax+12)*s4
93*c96caaccSSatish Balay            x(idx+1) = a(ax+1)*s1+a(ax+5)*s2+a(ax+9)*s3 +a(ax+13)*s4
94*c96caaccSSatish Balay            x(idx+2) = a(ax+2)*s1+a(ax+6)*s2+a(ax+10)*s3+a(ax+14)*s4
95*c96caaccSSatish Balay            x(idx+3) = a(ax+3)*s1+a(ax+7)*s2+a(ax+11)*s3+a(ax+15)*s4
96*c96caaccSSatish Balay            idx      = idx - 4
97*c96caaccSSatish Balay 40      continue
98*c96caaccSSatish Balay      return
99*c96caaccSSatish Balay      end
100*c96caaccSSatish Balay
101*c96caaccSSatish Balay!   version that does not call BLAS 2 operation for each row block
102*c96caaccSSatish Balay!
103*c96caaccSSatish Balay      subroutine FortranSolveBAIJ4(n,x,ai,aj,adiag,a,b,w)
104*c96caaccSSatish Balay      implicit none
105*c96caaccSSatish Balay      MatScalar   a(0:*)
106*c96caaccSSatish Balay      PetscScalar x(0:*),b(0:*),w(0:*)
107*c96caaccSSatish Balay      PetscInt  n,ai(0:*),aj(0:*),adiag(0:*)
108*c96caaccSSatish Balay      PetscInt  ii,jj,i,j
109*c96caaccSSatish Balay
110*c96caaccSSatish Balay      PetscInt  jstart,jend,idx,ax,jdx,kdx,nn
111*c96caaccSSatish Balay      PetscScalar s(0:3)
112*c96caaccSSatish Balay
113*c96caaccSSatish Balay!
114*c96caaccSSatish Balay!     Forward Solve
115*c96caaccSSatish Balay!
116*c96caaccSSatish Balay
117*c96caaccSSatish Balay      PETSC_AssertAlignx(16,a(1))
118*c96caaccSSatish Balay      PETSC_AssertAlignx(16,w(1))
119*c96caaccSSatish Balay      PETSC_AssertAlignx(16,x(1))
120*c96caaccSSatish Balay      PETSC_AssertAlignx(16,b(1))
121*c96caaccSSatish Balay      PETSC_AssertAlignx(16,ai(1))
122*c96caaccSSatish Balay      PETSC_AssertAlignx(16,aj(1))
123*c96caaccSSatish Balay      PETSC_AssertAlignx(16,adiag(1))
124*c96caaccSSatish Balay
125*c96caaccSSatish Balay      x(0) = b(0)
126*c96caaccSSatish Balay      x(1) = b(1)
127*c96caaccSSatish Balay      x(2) = b(2)
128*c96caaccSSatish Balay      x(3) = b(3)
129*c96caaccSSatish Balay      idx  = 0
130*c96caaccSSatish Balay      do 20 i=1,n-1
131*c96caaccSSatish Balay!
132*c96caaccSSatish Balay!        Pack required part of vector into work array
133*c96caaccSSatish Balay!
134*c96caaccSSatish Balay         kdx    = 0
135*c96caaccSSatish Balay         jstart = ai(i)
136*c96caaccSSatish Balay         jend   = adiag(i) - 1
137*c96caaccSSatish Balay         if (jend - jstart .ge. 500) then
138*c96caaccSSatish Balay           write(6,*) 'Overflowing vector FortranSolveBAIJ4()'
139*c96caaccSSatish Balay         endif
140*c96caaccSSatish Balay         do 30 j=jstart,jend
141*c96caaccSSatish Balay
142*c96caaccSSatish Balay           jdx       = 4*aj(j)
143*c96caaccSSatish Balay
144*c96caaccSSatish Balay           w(kdx)    = x(jdx)
145*c96caaccSSatish Balay           w(kdx+1)  = x(jdx+1)
146*c96caaccSSatish Balay           w(kdx+2)  = x(jdx+2)
147*c96caaccSSatish Balay           w(kdx+3)  = x(jdx+3)
148*c96caaccSSatish Balay           kdx       = kdx + 4
149*c96caaccSSatish Balay 30      continue
150*c96caaccSSatish Balay
151*c96caaccSSatish Balay         ax       = 16*jstart
152*c96caaccSSatish Balay         idx      = idx + 4
153*c96caaccSSatish Balay         s(0)     = b(idx)
154*c96caaccSSatish Balay         s(1)     = b(idx+1)
155*c96caaccSSatish Balay         s(2)     = b(idx+2)
156*c96caaccSSatish Balay         s(3)     = b(idx+3)
157*c96caaccSSatish Balay!
158*c96caaccSSatish Balay!    s = s - a(ax:)*w
159*c96caaccSSatish Balay!
160*c96caaccSSatish Balay         nn = 4*(jend - jstart + 1) - 1
161*c96caaccSSatish Balay         do 100, ii=0,3
162*c96caaccSSatish Balay           do 110, jj=0,nn
163*c96caaccSSatish Balay             s(ii) = s(ii) - a(ax+4*jj+ii)*w(jj)
164*c96caaccSSatish Balay 110       continue
165*c96caaccSSatish Balay 100     continue
166*c96caaccSSatish Balay
167*c96caaccSSatish Balay         x(idx)   = s(0)
168*c96caaccSSatish Balay         x(idx+1) = s(1)
169*c96caaccSSatish Balay         x(idx+2) = s(2)
170*c96caaccSSatish Balay         x(idx+3) = s(3)
171*c96caaccSSatish Balay 20   continue
172*c96caaccSSatish Balay
173*c96caaccSSatish Balay!
174*c96caaccSSatish Balay!     Backward solve the upper triangular
175*c96caaccSSatish Balay!
176*c96caaccSSatish Balay      do 40 i=n-1,0,-1
177*c96caaccSSatish Balay         jstart    = adiag(i) + 1
178*c96caaccSSatish Balay         jend      = ai(i+1) - 1
179*c96caaccSSatish Balay         ax        = 16*jstart
180*c96caaccSSatish Balay         s(0)      = x(idx)
181*c96caaccSSatish Balay         s(1)      = x(idx+1)
182*c96caaccSSatish Balay         s(2)      = x(idx+2)
183*c96caaccSSatish Balay         s(3)      = x(idx+3)
184*c96caaccSSatish Balay!
185*c96caaccSSatish Balay!   Pack each chunk of vector needed
186*c96caaccSSatish Balay!
187*c96caaccSSatish Balay         kdx = 0
188*c96caaccSSatish Balay         if (jend - jstart .ge. 500) then
189*c96caaccSSatish Balay           write(6,*) 'Overflowing vector FortranSolveBAIJ4()'
190*c96caaccSSatish Balay         endif
191*c96caaccSSatish Balay         do 50 j=jstart,jend
192*c96caaccSSatish Balay           jdx      = 4*aj(j)
193*c96caaccSSatish Balay           w(kdx)   = x(jdx)
194*c96caaccSSatish Balay           w(kdx+1) = x(jdx+1)
195*c96caaccSSatish Balay           w(kdx+2) = x(jdx+2)
196*c96caaccSSatish Balay           w(kdx+3) = x(jdx+3)
197*c96caaccSSatish Balay           kdx      = kdx + 4
198*c96caaccSSatish Balay 50      continue
199*c96caaccSSatish Balay         nn = 4*(jend - jstart + 1) - 1
200*c96caaccSSatish Balay         do 200, ii=0,3
201*c96caaccSSatish Balay           do 210, jj=0,nn
202*c96caaccSSatish Balay             s(ii) = s(ii) - a(ax+4*jj+ii)*w(jj)
203*c96caaccSSatish Balay 210       continue
204*c96caaccSSatish Balay 200     continue
205*c96caaccSSatish Balay
206*c96caaccSSatish Balay         ax      = 16*adiag(i)
207*c96caaccSSatish Balay         x(idx)  = a(ax)*s(0)  +a(ax+4)*s(1)+a(ax+8)*s(2) +a(ax+12)*s(3)
208*c96caaccSSatish Balay         x(idx+1)= a(ax+1)*s(0)+a(ax+5)*s(1)+a(ax+9)*s(2) +a(ax+13)*s(3)
209*c96caaccSSatish Balay         x(idx+2)= a(ax+2)*s(0)+a(ax+6)*s(1)+a(ax+10)*s(2)+a(ax+14)*s(3)
210*c96caaccSSatish Balay         x(idx+3)= a(ax+3)*s(0)+a(ax+7)*s(1)+a(ax+11)*s(2)+a(ax+15)*s(3)
211*c96caaccSSatish Balay         idx     = idx - 4
212*c96caaccSSatish Balay 40   continue
213*c96caaccSSatish Balay
214*c96caaccSSatish Balay      return
215*c96caaccSSatish Balay      end
216*c96caaccSSatish Balay
217