1 #include <../src/mat/impls/baij/seq/baij.h> 2 #include <petsc/private/kernels/blockinvert.h> 3 4 PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 5 { 6 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 7 IS iscol=a->col,isrow=a->row; 8 const PetscInt *r,*c,*rout,*cout; 9 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi; 10 PetscInt i,nz; 11 const PetscInt bs =A->rmap->bs,bs2=a->bs2; 12 const MatScalar *aa=a->a,*v; 13 PetscScalar *x,*s,*t,*ls; 14 const PetscScalar *b; 15 16 PetscFunctionBegin; 17 CHKERRQ(VecGetArrayRead(bb,&b)); 18 CHKERRQ(VecGetArray(xx,&x)); 19 t = a->solve_work; 20 21 CHKERRQ(ISGetIndices(isrow,&rout)); r = rout; 22 CHKERRQ(ISGetIndices(iscol,&cout)); c = cout + (n-1); 23 24 /* forward solve the lower triangular */ 25 CHKERRQ(PetscArraycpy(t,b+bs*(*r++),bs)); 26 for (i=1; i<n; i++) { 27 v = aa + bs2*ai[i]; 28 vi = aj + ai[i]; 29 nz = a->diag[i] - ai[i]; 30 s = t + bs*i; 31 CHKERRQ(PetscArraycpy(s,b+bs*(*r++),bs)); 32 while (nz--) { 33 PetscKernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 34 v += bs2; 35 } 36 } 37 /* backward solve the upper triangular */ 38 ls = a->solve_work + A->cmap->n; 39 for (i=n-1; i>=0; i--) { 40 v = aa + bs2*(a->diag[i] + 1); 41 vi = aj + a->diag[i] + 1; 42 nz = ai[i+1] - a->diag[i] - 1; 43 CHKERRQ(PetscArraycpy(ls,t+i*bs,bs)); 44 while (nz--) { 45 PetscKernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 46 v += bs2; 47 } 48 PetscKernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 49 CHKERRQ(PetscArraycpy(x + bs*(*c--),t+i*bs,bs)); 50 } 51 52 CHKERRQ(ISRestoreIndices(isrow,&rout)); 53 CHKERRQ(ISRestoreIndices(iscol,&cout)); 54 CHKERRQ(VecRestoreArrayRead(bb,&b)); 55 CHKERRQ(VecRestoreArray(xx,&x)); 56 CHKERRQ(PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n)); 57 PetscFunctionReturn(0); 58 } 59 60 PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 61 { 62 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 63 IS iscol=a->col,isrow=a->row; 64 const PetscInt *r,*c,*ai=a->i,*aj=a->j; 65 const PetscInt *rout,*cout,*diag = a->diag,*vi,n=a->mbs; 66 PetscInt i,nz,idx,idt,idc; 67 const MatScalar *aa=a->a,*v; 68 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 69 const PetscScalar *b; 70 71 PetscFunctionBegin; 72 CHKERRQ(VecGetArrayRead(bb,&b)); 73 CHKERRQ(VecGetArray(xx,&x)); 74 t = a->solve_work; 75 76 CHKERRQ(ISGetIndices(isrow,&rout)); r = rout; 77 CHKERRQ(ISGetIndices(iscol,&cout)); c = cout + (n-1); 78 79 /* forward solve the lower triangular */ 80 idx = 7*(*r++); 81 t[0] = b[idx]; t[1] = b[1+idx]; 82 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 83 t[5] = b[5+idx]; t[6] = b[6+idx]; 84 85 for (i=1; i<n; i++) { 86 v = aa + 49*ai[i]; 87 vi = aj + ai[i]; 88 nz = diag[i] - ai[i]; 89 idx = 7*(*r++); 90 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 91 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 92 while (nz--) { 93 idx = 7*(*vi++); 94 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 95 x4 = t[3+idx];x5 = t[4+idx]; 96 x6 = t[5+idx];x7 = t[6+idx]; 97 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 98 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 99 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 100 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 101 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 102 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 103 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 104 v += 49; 105 } 106 idx = 7*i; 107 t[idx] = s1;t[1+idx] = s2; 108 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 109 t[5+idx] = s6;t[6+idx] = s7; 110 } 111 /* backward solve the upper triangular */ 112 for (i=n-1; i>=0; i--) { 113 v = aa + 49*diag[i] + 49; 114 vi = aj + diag[i] + 1; 115 nz = ai[i+1] - diag[i] - 1; 116 idt = 7*i; 117 s1 = t[idt]; s2 = t[1+idt]; 118 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 119 s6 = t[5+idt];s7 = t[6+idt]; 120 while (nz--) { 121 idx = 7*(*vi++); 122 x1 = t[idx]; x2 = t[1+idx]; 123 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 124 x6 = t[5+idx]; x7 = t[6+idx]; 125 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 126 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 127 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 128 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 129 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 130 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 131 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 132 v += 49; 133 } 134 idc = 7*(*c--); 135 v = aa + 49*diag[i]; 136 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 137 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 138 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 139 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 140 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 141 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 142 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 143 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 144 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 145 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 146 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 147 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 148 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 149 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 150 } 151 152 CHKERRQ(ISRestoreIndices(isrow,&rout)); 153 CHKERRQ(ISRestoreIndices(iscol,&cout)); 154 CHKERRQ(VecRestoreArrayRead(bb,&b)); 155 CHKERRQ(VecRestoreArray(xx,&x)); 156 CHKERRQ(PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n)); 157 PetscFunctionReturn(0); 158 } 159 160 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 161 { 162 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 163 IS iscol=a->col,isrow=a->row; 164 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag; 165 const PetscInt n=a->mbs,*rout,*cout,*vi; 166 PetscInt i,nz,idx,idt,idc,m; 167 const MatScalar *aa=a->a,*v; 168 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 169 const PetscScalar *b; 170 171 PetscFunctionBegin; 172 CHKERRQ(VecGetArrayRead(bb,&b)); 173 CHKERRQ(VecGetArray(xx,&x)); 174 t = a->solve_work; 175 176 CHKERRQ(ISGetIndices(isrow,&rout)); r = rout; 177 CHKERRQ(ISGetIndices(iscol,&cout)); c = cout; 178 179 /* forward solve the lower triangular */ 180 idx = 7*r[0]; 181 t[0] = b[idx]; t[1] = b[1+idx]; 182 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 183 t[5] = b[5+idx]; t[6] = b[6+idx]; 184 185 for (i=1; i<n; i++) { 186 v = aa + 49*ai[i]; 187 vi = aj + ai[i]; 188 nz = ai[i+1] - ai[i]; 189 idx = 7*r[i]; 190 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 191 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 192 for (m=0; m<nz; m++) { 193 idx = 7*vi[m]; 194 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 195 x4 = t[3+idx];x5 = t[4+idx]; 196 x6 = t[5+idx];x7 = t[6+idx]; 197 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 198 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 199 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 200 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 201 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 202 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 203 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 204 v += 49; 205 } 206 idx = 7*i; 207 t[idx] = s1;t[1+idx] = s2; 208 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 209 t[5+idx] = s6;t[6+idx] = s7; 210 } 211 /* backward solve the upper triangular */ 212 for (i=n-1; i>=0; i--) { 213 v = aa + 49*(adiag[i+1]+1); 214 vi = aj + adiag[i+1]+1; 215 nz = adiag[i] - adiag[i+1] - 1; 216 idt = 7*i; 217 s1 = t[idt]; s2 = t[1+idt]; 218 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 219 s6 = t[5+idt];s7 = t[6+idt]; 220 for (m=0; m<nz; m++) { 221 idx = 7*vi[m]; 222 x1 = t[idx]; x2 = t[1+idx]; 223 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 224 x6 = t[5+idx]; x7 = t[6+idx]; 225 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 226 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 227 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 228 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 229 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 230 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 231 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 232 v += 49; 233 } 234 idc = 7*c[i]; 235 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 236 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 237 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 238 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 239 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 240 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 241 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 242 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 243 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 244 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 245 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 246 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 247 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 248 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 249 } 250 251 CHKERRQ(ISRestoreIndices(isrow,&rout)); 252 CHKERRQ(ISRestoreIndices(iscol,&cout)); 253 CHKERRQ(VecRestoreArrayRead(bb,&b)); 254 CHKERRQ(VecRestoreArray(xx,&x)); 255 CHKERRQ(PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n)); 256 PetscFunctionReturn(0); 257 } 258 259 PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 260 { 261 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 262 IS iscol=a->col,isrow=a->row; 263 const PetscInt *r,*c,*rout,*cout; 264 const PetscInt *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 265 PetscInt i,nz,idx,idt,idc; 266 const MatScalar *aa=a->a,*v; 267 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 268 const PetscScalar *b; 269 270 PetscFunctionBegin; 271 CHKERRQ(VecGetArrayRead(bb,&b)); 272 CHKERRQ(VecGetArray(xx,&x)); 273 t = a->solve_work; 274 275 CHKERRQ(ISGetIndices(isrow,&rout)); r = rout; 276 CHKERRQ(ISGetIndices(iscol,&cout)); c = cout + (n-1); 277 278 /* forward solve the lower triangular */ 279 idx = 6*(*r++); 280 t[0] = b[idx]; t[1] = b[1+idx]; 281 t[2] = b[2+idx]; t[3] = b[3+idx]; 282 t[4] = b[4+idx]; t[5] = b[5+idx]; 283 for (i=1; i<n; i++) { 284 v = aa + 36*ai[i]; 285 vi = aj + ai[i]; 286 nz = diag[i] - ai[i]; 287 idx = 6*(*r++); 288 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 289 s5 = b[4+idx]; s6 = b[5+idx]; 290 while (nz--) { 291 idx = 6*(*vi++); 292 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 293 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 294 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 295 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 296 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 297 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 298 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 299 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 300 v += 36; 301 } 302 idx = 6*i; 303 t[idx] = s1;t[1+idx] = s2; 304 t[2+idx] = s3;t[3+idx] = s4; 305 t[4+idx] = s5;t[5+idx] = s6; 306 } 307 /* backward solve the upper triangular */ 308 for (i=n-1; i>=0; i--) { 309 v = aa + 36*diag[i] + 36; 310 vi = aj + diag[i] + 1; 311 nz = ai[i+1] - diag[i] - 1; 312 idt = 6*i; 313 s1 = t[idt]; s2 = t[1+idt]; 314 s3 = t[2+idt];s4 = t[3+idt]; 315 s5 = t[4+idt];s6 = t[5+idt]; 316 while (nz--) { 317 idx = 6*(*vi++); 318 x1 = t[idx]; x2 = t[1+idx]; 319 x3 = t[2+idx]; x4 = t[3+idx]; 320 x5 = t[4+idx]; x6 = t[5+idx]; 321 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 322 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 323 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 324 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 325 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 326 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 327 v += 36; 328 } 329 idc = 6*(*c--); 330 v = aa + 36*diag[i]; 331 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 332 v[18]*s4+v[24]*s5+v[30]*s6; 333 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 334 v[19]*s4+v[25]*s5+v[31]*s6; 335 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 336 v[20]*s4+v[26]*s5+v[32]*s6; 337 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 338 v[21]*s4+v[27]*s5+v[33]*s6; 339 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 340 v[22]*s4+v[28]*s5+v[34]*s6; 341 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 342 v[23]*s4+v[29]*s5+v[35]*s6; 343 } 344 345 CHKERRQ(ISRestoreIndices(isrow,&rout)); 346 CHKERRQ(ISRestoreIndices(iscol,&cout)); 347 CHKERRQ(VecRestoreArrayRead(bb,&b)); 348 CHKERRQ(VecRestoreArray(xx,&x)); 349 CHKERRQ(PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n)); 350 PetscFunctionReturn(0); 351 } 352 353 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 354 { 355 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 356 IS iscol=a->col,isrow=a->row; 357 const PetscInt *r,*c,*rout,*cout; 358 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 359 PetscInt i,nz,idx,idt,idc,m; 360 const MatScalar *aa=a->a,*v; 361 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 362 const PetscScalar *b; 363 364 PetscFunctionBegin; 365 CHKERRQ(VecGetArrayRead(bb,&b)); 366 CHKERRQ(VecGetArray(xx,&x)); 367 t = a->solve_work; 368 369 CHKERRQ(ISGetIndices(isrow,&rout)); r = rout; 370 CHKERRQ(ISGetIndices(iscol,&cout)); c = cout; 371 372 /* forward solve the lower triangular */ 373 idx = 6*r[0]; 374 t[0] = b[idx]; t[1] = b[1+idx]; 375 t[2] = b[2+idx]; t[3] = b[3+idx]; 376 t[4] = b[4+idx]; t[5] = b[5+idx]; 377 for (i=1; i<n; i++) { 378 v = aa + 36*ai[i]; 379 vi = aj + ai[i]; 380 nz = ai[i+1] - ai[i]; 381 idx = 6*r[i]; 382 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 383 s5 = b[4+idx]; s6 = b[5+idx]; 384 for (m=0; m<nz; m++) { 385 idx = 6*vi[m]; 386 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 387 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 388 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 389 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 390 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 391 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 392 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 393 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 394 v += 36; 395 } 396 idx = 6*i; 397 t[idx] = s1;t[1+idx] = s2; 398 t[2+idx] = s3;t[3+idx] = s4; 399 t[4+idx] = s5;t[5+idx] = s6; 400 } 401 /* backward solve the upper triangular */ 402 for (i=n-1; i>=0; i--) { 403 v = aa + 36*(adiag[i+1]+1); 404 vi = aj + adiag[i+1]+1; 405 nz = adiag[i] - adiag[i+1] - 1; 406 idt = 6*i; 407 s1 = t[idt]; s2 = t[1+idt]; 408 s3 = t[2+idt];s4 = t[3+idt]; 409 s5 = t[4+idt];s6 = t[5+idt]; 410 for (m=0; m<nz; m++) { 411 idx = 6*vi[m]; 412 x1 = t[idx]; x2 = t[1+idx]; 413 x3 = t[2+idx]; x4 = t[3+idx]; 414 x5 = t[4+idx]; x6 = t[5+idx]; 415 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 416 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 417 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 418 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 419 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 420 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 421 v += 36; 422 } 423 idc = 6*c[i]; 424 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 425 v[18]*s4+v[24]*s5+v[30]*s6; 426 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 427 v[19]*s4+v[25]*s5+v[31]*s6; 428 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 429 v[20]*s4+v[26]*s5+v[32]*s6; 430 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 431 v[21]*s4+v[27]*s5+v[33]*s6; 432 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 433 v[22]*s4+v[28]*s5+v[34]*s6; 434 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 435 v[23]*s4+v[29]*s5+v[35]*s6; 436 } 437 438 CHKERRQ(ISRestoreIndices(isrow,&rout)); 439 CHKERRQ(ISRestoreIndices(iscol,&cout)); 440 CHKERRQ(VecRestoreArrayRead(bb,&b)); 441 CHKERRQ(VecRestoreArray(xx,&x)); 442 CHKERRQ(PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n)); 443 PetscFunctionReturn(0); 444 } 445 446 PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx) 447 { 448 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 449 IS iscol=a->col,isrow=a->row; 450 const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 451 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 452 PetscInt i,nz,idx,idt,idc; 453 const MatScalar *aa=a->a,*v; 454 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 455 const PetscScalar *b; 456 457 PetscFunctionBegin; 458 CHKERRQ(VecGetArrayRead(bb,&b)); 459 CHKERRQ(VecGetArray(xx,&x)); 460 t = a->solve_work; 461 462 CHKERRQ(ISGetIndices(isrow,&rout)); r = rout; 463 CHKERRQ(ISGetIndices(iscol,&cout)); c = cout + (n-1); 464 465 /* forward solve the lower triangular */ 466 idx = 5*(*r++); 467 t[0] = b[idx]; t[1] = b[1+idx]; 468 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 469 for (i=1; i<n; i++) { 470 v = aa + 25*ai[i]; 471 vi = aj + ai[i]; 472 nz = diag[i] - ai[i]; 473 idx = 5*(*r++); 474 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 475 s5 = b[4+idx]; 476 while (nz--) { 477 idx = 5*(*vi++); 478 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 479 x4 = t[3+idx];x5 = t[4+idx]; 480 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 481 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 482 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 483 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 484 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 485 v += 25; 486 } 487 idx = 5*i; 488 t[idx] = s1;t[1+idx] = s2; 489 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 490 } 491 /* backward solve the upper triangular */ 492 for (i=n-1; i>=0; i--) { 493 v = aa + 25*diag[i] + 25; 494 vi = aj + diag[i] + 1; 495 nz = ai[i+1] - diag[i] - 1; 496 idt = 5*i; 497 s1 = t[idt]; s2 = t[1+idt]; 498 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 499 while (nz--) { 500 idx = 5*(*vi++); 501 x1 = t[idx]; x2 = t[1+idx]; 502 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 503 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 504 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 505 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 506 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 507 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 508 v += 25; 509 } 510 idc = 5*(*c--); 511 v = aa + 25*diag[i]; 512 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 513 v[15]*s4+v[20]*s5; 514 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 515 v[16]*s4+v[21]*s5; 516 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 517 v[17]*s4+v[22]*s5; 518 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 519 v[18]*s4+v[23]*s5; 520 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 521 v[19]*s4+v[24]*s5; 522 } 523 524 CHKERRQ(ISRestoreIndices(isrow,&rout)); 525 CHKERRQ(ISRestoreIndices(iscol,&cout)); 526 CHKERRQ(VecRestoreArrayRead(bb,&b)); 527 CHKERRQ(VecRestoreArray(xx,&x)); 528 CHKERRQ(PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n)); 529 PetscFunctionReturn(0); 530 } 531 532 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 533 { 534 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 535 IS iscol=a->col,isrow=a->row; 536 const PetscInt *r,*c,*rout,*cout; 537 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 538 PetscInt i,nz,idx,idt,idc,m; 539 const MatScalar *aa=a->a,*v; 540 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 541 const PetscScalar *b; 542 543 PetscFunctionBegin; 544 CHKERRQ(VecGetArrayRead(bb,&b)); 545 CHKERRQ(VecGetArray(xx,&x)); 546 t = a->solve_work; 547 548 CHKERRQ(ISGetIndices(isrow,&rout)); r = rout; 549 CHKERRQ(ISGetIndices(iscol,&cout)); c = cout; 550 551 /* forward solve the lower triangular */ 552 idx = 5*r[0]; 553 t[0] = b[idx]; t[1] = b[1+idx]; 554 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 555 for (i=1; i<n; i++) { 556 v = aa + 25*ai[i]; 557 vi = aj + ai[i]; 558 nz = ai[i+1] - ai[i]; 559 idx = 5*r[i]; 560 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 561 s5 = b[4+idx]; 562 for (m=0; m<nz; m++) { 563 idx = 5*vi[m]; 564 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 565 x4 = t[3+idx];x5 = t[4+idx]; 566 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 567 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 568 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 569 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 570 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 571 v += 25; 572 } 573 idx = 5*i; 574 t[idx] = s1;t[1+idx] = s2; 575 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 576 } 577 /* backward solve the upper triangular */ 578 for (i=n-1; i>=0; i--) { 579 v = aa + 25*(adiag[i+1]+1); 580 vi = aj + adiag[i+1]+1; 581 nz = adiag[i] - adiag[i+1] - 1; 582 idt = 5*i; 583 s1 = t[idt]; s2 = t[1+idt]; 584 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 585 for (m=0; m<nz; m++) { 586 idx = 5*vi[m]; 587 x1 = t[idx]; x2 = t[1+idx]; 588 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 589 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 590 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 591 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 592 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 593 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 594 v += 25; 595 } 596 idc = 5*c[i]; 597 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 598 v[15]*s4+v[20]*s5; 599 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 600 v[16]*s4+v[21]*s5; 601 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 602 v[17]*s4+v[22]*s5; 603 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 604 v[18]*s4+v[23]*s5; 605 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 606 v[19]*s4+v[24]*s5; 607 } 608 609 CHKERRQ(ISRestoreIndices(isrow,&rout)); 610 CHKERRQ(ISRestoreIndices(iscol,&cout)); 611 CHKERRQ(VecRestoreArrayRead(bb,&b)); 612 CHKERRQ(VecRestoreArray(xx,&x)); 613 CHKERRQ(PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n)); 614 PetscFunctionReturn(0); 615 } 616 617 PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 618 { 619 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 620 IS iscol=a->col,isrow=a->row; 621 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 622 PetscInt i,nz,idx,idt,idc; 623 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 624 const MatScalar *aa=a->a,*v; 625 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 626 const PetscScalar *b; 627 628 PetscFunctionBegin; 629 CHKERRQ(VecGetArrayRead(bb,&b)); 630 CHKERRQ(VecGetArray(xx,&x)); 631 t = a->solve_work; 632 633 CHKERRQ(ISGetIndices(isrow,&rout)); r = rout; 634 CHKERRQ(ISGetIndices(iscol,&cout)); c = cout + (n-1); 635 636 /* forward solve the lower triangular */ 637 idx = 4*(*r++); 638 t[0] = b[idx]; t[1] = b[1+idx]; 639 t[2] = b[2+idx]; t[3] = b[3+idx]; 640 for (i=1; i<n; i++) { 641 v = aa + 16*ai[i]; 642 vi = aj + ai[i]; 643 nz = diag[i] - ai[i]; 644 idx = 4*(*r++); 645 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 646 while (nz--) { 647 idx = 4*(*vi++); 648 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 649 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 650 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 651 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 652 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 653 v += 16; 654 } 655 idx = 4*i; 656 t[idx] = s1;t[1+idx] = s2; 657 t[2+idx] = s3;t[3+idx] = s4; 658 } 659 /* backward solve the upper triangular */ 660 for (i=n-1; i>=0; i--) { 661 v = aa + 16*diag[i] + 16; 662 vi = aj + diag[i] + 1; 663 nz = ai[i+1] - diag[i] - 1; 664 idt = 4*i; 665 s1 = t[idt]; s2 = t[1+idt]; 666 s3 = t[2+idt];s4 = t[3+idt]; 667 while (nz--) { 668 idx = 4*(*vi++); 669 x1 = t[idx]; x2 = t[1+idx]; 670 x3 = t[2+idx]; x4 = t[3+idx]; 671 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 672 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 673 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 674 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 675 v += 16; 676 } 677 idc = 4*(*c--); 678 v = aa + 16*diag[i]; 679 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 680 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 681 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 682 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 683 } 684 685 CHKERRQ(ISRestoreIndices(isrow,&rout)); 686 CHKERRQ(ISRestoreIndices(iscol,&cout)); 687 CHKERRQ(VecRestoreArrayRead(bb,&b)); 688 CHKERRQ(VecRestoreArray(xx,&x)); 689 CHKERRQ(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n)); 690 PetscFunctionReturn(0); 691 } 692 693 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 694 { 695 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 696 IS iscol=a->col,isrow=a->row; 697 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 698 PetscInt i,nz,idx,idt,idc,m; 699 const PetscInt *r,*c,*rout,*cout; 700 const MatScalar *aa=a->a,*v; 701 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 702 const PetscScalar *b; 703 704 PetscFunctionBegin; 705 CHKERRQ(VecGetArrayRead(bb,&b)); 706 CHKERRQ(VecGetArray(xx,&x)); 707 t = a->solve_work; 708 709 CHKERRQ(ISGetIndices(isrow,&rout)); r = rout; 710 CHKERRQ(ISGetIndices(iscol,&cout)); c = cout; 711 712 /* forward solve the lower triangular */ 713 idx = 4*r[0]; 714 t[0] = b[idx]; t[1] = b[1+idx]; 715 t[2] = b[2+idx]; t[3] = b[3+idx]; 716 for (i=1; i<n; i++) { 717 v = aa + 16*ai[i]; 718 vi = aj + ai[i]; 719 nz = ai[i+1] - ai[i]; 720 idx = 4*r[i]; 721 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 722 for (m=0; m<nz; m++) { 723 idx = 4*vi[m]; 724 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 725 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 726 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 727 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 728 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 729 v += 16; 730 } 731 idx = 4*i; 732 t[idx] = s1;t[1+idx] = s2; 733 t[2+idx] = s3;t[3+idx] = s4; 734 } 735 /* backward solve the upper triangular */ 736 for (i=n-1; i>=0; i--) { 737 v = aa + 16*(adiag[i+1]+1); 738 vi = aj + adiag[i+1]+1; 739 nz = adiag[i] - adiag[i+1] - 1; 740 idt = 4*i; 741 s1 = t[idt]; s2 = t[1+idt]; 742 s3 = t[2+idt];s4 = t[3+idt]; 743 for (m=0; m<nz; m++) { 744 idx = 4*vi[m]; 745 x1 = t[idx]; x2 = t[1+idx]; 746 x3 = t[2+idx]; x4 = t[3+idx]; 747 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 748 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 749 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 750 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 751 v += 16; 752 } 753 idc = 4*c[i]; 754 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 755 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 756 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 757 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 758 } 759 760 CHKERRQ(ISRestoreIndices(isrow,&rout)); 761 CHKERRQ(ISRestoreIndices(iscol,&cout)); 762 CHKERRQ(VecRestoreArrayRead(bb,&b)); 763 CHKERRQ(VecRestoreArray(xx,&x)); 764 CHKERRQ(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n)); 765 PetscFunctionReturn(0); 766 } 767 768 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 769 { 770 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 771 IS iscol=a->col,isrow=a->row; 772 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 773 PetscInt i,nz,idx,idt,idc; 774 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 775 const MatScalar *aa=a->a,*v; 776 MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 777 PetscScalar *x; 778 const PetscScalar *b; 779 780 PetscFunctionBegin; 781 CHKERRQ(VecGetArrayRead(bb,&b)); 782 CHKERRQ(VecGetArray(xx,&x)); 783 t = (MatScalar*)a->solve_work; 784 785 CHKERRQ(ISGetIndices(isrow,&rout)); r = rout; 786 CHKERRQ(ISGetIndices(iscol,&cout)); c = cout + (n-1); 787 788 /* forward solve the lower triangular */ 789 idx = 4*(*r++); 790 t[0] = (MatScalar)b[idx]; 791 t[1] = (MatScalar)b[1+idx]; 792 t[2] = (MatScalar)b[2+idx]; 793 t[3] = (MatScalar)b[3+idx]; 794 for (i=1; i<n; i++) { 795 v = aa + 16*ai[i]; 796 vi = aj + ai[i]; 797 nz = diag[i] - ai[i]; 798 idx = 4*(*r++); 799 s1 = (MatScalar)b[idx]; 800 s2 = (MatScalar)b[1+idx]; 801 s3 = (MatScalar)b[2+idx]; 802 s4 = (MatScalar)b[3+idx]; 803 while (nz--) { 804 idx = 4*(*vi++); 805 x1 = t[idx]; 806 x2 = t[1+idx]; 807 x3 = t[2+idx]; 808 x4 = t[3+idx]; 809 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 810 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 811 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 812 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 813 v += 16; 814 } 815 idx = 4*i; 816 t[idx] = s1; 817 t[1+idx] = s2; 818 t[2+idx] = s3; 819 t[3+idx] = s4; 820 } 821 /* backward solve the upper triangular */ 822 for (i=n-1; i>=0; i--) { 823 v = aa + 16*diag[i] + 16; 824 vi = aj + diag[i] + 1; 825 nz = ai[i+1] - diag[i] - 1; 826 idt = 4*i; 827 s1 = t[idt]; 828 s2 = t[1+idt]; 829 s3 = t[2+idt]; 830 s4 = t[3+idt]; 831 while (nz--) { 832 idx = 4*(*vi++); 833 x1 = t[idx]; 834 x2 = t[1+idx]; 835 x3 = t[2+idx]; 836 x4 = t[3+idx]; 837 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 838 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 839 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 840 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 841 v += 16; 842 } 843 idc = 4*(*c--); 844 v = aa + 16*diag[i]; 845 t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 846 t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 847 t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 848 t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 849 x[idc] = (PetscScalar)t[idt]; 850 x[1+idc] = (PetscScalar)t[1+idt]; 851 x[2+idc] = (PetscScalar)t[2+idt]; 852 x[3+idc] = (PetscScalar)t[3+idt]; 853 } 854 855 CHKERRQ(ISRestoreIndices(isrow,&rout)); 856 CHKERRQ(ISRestoreIndices(iscol,&cout)); 857 CHKERRQ(VecRestoreArrayRead(bb,&b)); 858 CHKERRQ(VecRestoreArray(xx,&x)); 859 CHKERRQ(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n)); 860 PetscFunctionReturn(0); 861 } 862 863 #if defined(PETSC_HAVE_SSE) 864 865 #include PETSC_HAVE_SSE 866 867 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 868 { 869 /* 870 Note: This code uses demotion of double 871 to float when performing the mixed-mode computation. 872 This may not be numerically reasonable for all applications. 873 */ 874 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 875 IS iscol=a->col,isrow=a->row; 876 PetscErrorCode ierr; 877 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 878 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 879 MatScalar *aa=a->a,*v; 880 PetscScalar *x,*b,*t; 881 882 /* Make space in temp stack for 16 Byte Aligned arrays */ 883 float ssealignedspace[11],*tmps,*tmpx; 884 unsigned long offset; 885 886 PetscFunctionBegin; 887 SSE_SCOPE_BEGIN; 888 889 offset = (unsigned long)ssealignedspace % 16; 890 if (offset) offset = (16 - offset)/4; 891 tmps = &ssealignedspace[offset]; 892 tmpx = &ssealignedspace[offset+4]; 893 PREFETCH_NTA(aa+16*ai[1]); 894 895 CHKERRQ(VecGetArray(bb,&b)); 896 CHKERRQ(VecGetArray(xx,&x)); 897 t = a->solve_work; 898 899 CHKERRQ(ISGetIndices(isrow,&rout)); r = rout; 900 CHKERRQ(ISGetIndices(iscol,&cout)); c = cout + (n-1); 901 902 /* forward solve the lower triangular */ 903 idx = 4*(*r++); 904 t[0] = b[idx]; t[1] = b[1+idx]; 905 t[2] = b[2+idx]; t[3] = b[3+idx]; 906 v = aa + 16*ai[1]; 907 908 for (i=1; i<n;) { 909 PREFETCH_NTA(&v[8]); 910 vi = aj + ai[i]; 911 nz = diag[i] - ai[i]; 912 idx = 4*(*r++); 913 914 /* Demote sum from double to float */ 915 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 916 LOAD_PS(tmps,XMM7); 917 918 while (nz--) { 919 PREFETCH_NTA(&v[16]); 920 idx = 4*(*vi++); 921 922 /* Demote solution (so far) from double to float */ 923 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 924 925 /* 4x4 Matrix-Vector product with negative accumulation: */ 926 SSE_INLINE_BEGIN_2(tmpx,v) 927 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 928 929 /* First Column */ 930 SSE_COPY_PS(XMM0,XMM6) 931 SSE_SHUFFLE(XMM0,XMM0,0x00) 932 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 933 SSE_SUB_PS(XMM7,XMM0) 934 935 /* Second Column */ 936 SSE_COPY_PS(XMM1,XMM6) 937 SSE_SHUFFLE(XMM1,XMM1,0x55) 938 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 939 SSE_SUB_PS(XMM7,XMM1) 940 941 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 942 943 /* Third Column */ 944 SSE_COPY_PS(XMM2,XMM6) 945 SSE_SHUFFLE(XMM2,XMM2,0xAA) 946 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 947 SSE_SUB_PS(XMM7,XMM2) 948 949 /* Fourth Column */ 950 SSE_COPY_PS(XMM3,XMM6) 951 SSE_SHUFFLE(XMM3,XMM3,0xFF) 952 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 953 SSE_SUB_PS(XMM7,XMM3) 954 SSE_INLINE_END_2 955 956 v += 16; 957 } 958 idx = 4*i; 959 v = aa + 16*ai[++i]; 960 PREFETCH_NTA(v); 961 STORE_PS(tmps,XMM7); 962 963 /* Promote result from float to double */ 964 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 965 } 966 /* backward solve the upper triangular */ 967 idt = 4*(n-1); 968 ai16 = 16*diag[n-1]; 969 v = aa + ai16 + 16; 970 for (i=n-1; i>=0;) { 971 PREFETCH_NTA(&v[8]); 972 vi = aj + diag[i] + 1; 973 nz = ai[i+1] - diag[i] - 1; 974 975 /* Demote accumulator from double to float */ 976 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 977 LOAD_PS(tmps,XMM7); 978 979 while (nz--) { 980 PREFETCH_NTA(&v[16]); 981 idx = 4*(*vi++); 982 983 /* Demote solution (so far) from double to float */ 984 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 985 986 /* 4x4 Matrix-Vector Product with negative accumulation: */ 987 SSE_INLINE_BEGIN_2(tmpx,v) 988 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 989 990 /* First Column */ 991 SSE_COPY_PS(XMM0,XMM6) 992 SSE_SHUFFLE(XMM0,XMM0,0x00) 993 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 994 SSE_SUB_PS(XMM7,XMM0) 995 996 /* Second Column */ 997 SSE_COPY_PS(XMM1,XMM6) 998 SSE_SHUFFLE(XMM1,XMM1,0x55) 999 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 1000 SSE_SUB_PS(XMM7,XMM1) 1001 1002 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 1003 1004 /* Third Column */ 1005 SSE_COPY_PS(XMM2,XMM6) 1006 SSE_SHUFFLE(XMM2,XMM2,0xAA) 1007 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 1008 SSE_SUB_PS(XMM7,XMM2) 1009 1010 /* Fourth Column */ 1011 SSE_COPY_PS(XMM3,XMM6) 1012 SSE_SHUFFLE(XMM3,XMM3,0xFF) 1013 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 1014 SSE_SUB_PS(XMM7,XMM3) 1015 SSE_INLINE_END_2 1016 v += 16; 1017 } 1018 v = aa + ai16; 1019 ai16 = 16*diag[--i]; 1020 PREFETCH_NTA(aa+ai16+16); 1021 /* 1022 Scale the result by the diagonal 4x4 block, 1023 which was inverted as part of the factorization 1024 */ 1025 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 1026 /* First Column */ 1027 SSE_COPY_PS(XMM0,XMM7) 1028 SSE_SHUFFLE(XMM0,XMM0,0x00) 1029 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 1030 1031 /* Second Column */ 1032 SSE_COPY_PS(XMM1,XMM7) 1033 SSE_SHUFFLE(XMM1,XMM1,0x55) 1034 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 1035 SSE_ADD_PS(XMM0,XMM1) 1036 1037 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 1038 1039 /* Third Column */ 1040 SSE_COPY_PS(XMM2,XMM7) 1041 SSE_SHUFFLE(XMM2,XMM2,0xAA) 1042 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 1043 SSE_ADD_PS(XMM0,XMM2) 1044 1045 /* Fourth Column */ 1046 SSE_COPY_PS(XMM3,XMM7) 1047 SSE_SHUFFLE(XMM3,XMM3,0xFF) 1048 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 1049 SSE_ADD_PS(XMM0,XMM3) 1050 1051 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 1052 SSE_INLINE_END_3 1053 1054 /* Promote solution from float to double */ 1055 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 1056 1057 /* Apply reordering to t and stream into x. */ 1058 /* This way, x doesn't pollute the cache. */ 1059 /* Be careful with size: 2 doubles = 4 floats! */ 1060 idc = 4*(*c--); 1061 SSE_INLINE_BEGIN_2((float*)&t[idt],(float*)&x[idc]) 1062 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 1063 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 1064 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 1065 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 1066 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 1067 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 1068 SSE_INLINE_END_2 1069 v = aa + ai16 + 16; 1070 idt -= 4; 1071 } 1072 1073 CHKERRQ(ISRestoreIndices(isrow,&rout)); 1074 CHKERRQ(ISRestoreIndices(iscol,&cout)); 1075 CHKERRQ(VecRestoreArray(bb,&b)); 1076 CHKERRQ(VecRestoreArray(xx,&x)); 1077 CHKERRQ(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n)); 1078 SSE_SCOPE_END; 1079 PetscFunctionReturn(0); 1080 } 1081 1082 #endif 1083 1084 PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 1085 { 1086 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 1087 IS iscol=a->col,isrow=a->row; 1088 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1089 PetscInt i,nz,idx,idt,idc; 1090 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 1091 const MatScalar *aa=a->a,*v; 1092 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 1093 const PetscScalar *b; 1094 1095 PetscFunctionBegin; 1096 CHKERRQ(VecGetArrayRead(bb,&b)); 1097 CHKERRQ(VecGetArray(xx,&x)); 1098 t = a->solve_work; 1099 1100 CHKERRQ(ISGetIndices(isrow,&rout)); r = rout; 1101 CHKERRQ(ISGetIndices(iscol,&cout)); c = cout + (n-1); 1102 1103 /* forward solve the lower triangular */ 1104 idx = 3*(*r++); 1105 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 1106 for (i=1; i<n; i++) { 1107 v = aa + 9*ai[i]; 1108 vi = aj + ai[i]; 1109 nz = diag[i] - ai[i]; 1110 idx = 3*(*r++); 1111 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1112 while (nz--) { 1113 idx = 3*(*vi++); 1114 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1115 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 1116 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 1117 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 1118 v += 9; 1119 } 1120 idx = 3*i; 1121 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 1122 } 1123 /* backward solve the upper triangular */ 1124 for (i=n-1; i>=0; i--) { 1125 v = aa + 9*diag[i] + 9; 1126 vi = aj + diag[i] + 1; 1127 nz = ai[i+1] - diag[i] - 1; 1128 idt = 3*i; 1129 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 1130 while (nz--) { 1131 idx = 3*(*vi++); 1132 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1133 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 1134 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 1135 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 1136 v += 9; 1137 } 1138 idc = 3*(*c--); 1139 v = aa + 9*diag[i]; 1140 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 1141 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 1142 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 1143 } 1144 CHKERRQ(ISRestoreIndices(isrow,&rout)); 1145 CHKERRQ(ISRestoreIndices(iscol,&cout)); 1146 CHKERRQ(VecRestoreArrayRead(bb,&b)); 1147 CHKERRQ(VecRestoreArray(xx,&x)); 1148 CHKERRQ(PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n)); 1149 PetscFunctionReturn(0); 1150 } 1151 1152 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 1153 { 1154 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 1155 IS iscol=a->col,isrow=a->row; 1156 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 1157 PetscInt i,nz,idx,idt,idc,m; 1158 const PetscInt *r,*c,*rout,*cout; 1159 const MatScalar *aa=a->a,*v; 1160 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 1161 const PetscScalar *b; 1162 1163 PetscFunctionBegin; 1164 CHKERRQ(VecGetArrayRead(bb,&b)); 1165 CHKERRQ(VecGetArray(xx,&x)); 1166 t = a->solve_work; 1167 1168 CHKERRQ(ISGetIndices(isrow,&rout)); r = rout; 1169 CHKERRQ(ISGetIndices(iscol,&cout)); c = cout; 1170 1171 /* forward solve the lower triangular */ 1172 idx = 3*r[0]; 1173 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 1174 for (i=1; i<n; i++) { 1175 v = aa + 9*ai[i]; 1176 vi = aj + ai[i]; 1177 nz = ai[i+1] - ai[i]; 1178 idx = 3*r[i]; 1179 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1180 for (m=0; m<nz; m++) { 1181 idx = 3*vi[m]; 1182 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1183 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 1184 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 1185 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 1186 v += 9; 1187 } 1188 idx = 3*i; 1189 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 1190 } 1191 /* backward solve the upper triangular */ 1192 for (i=n-1; i>=0; i--) { 1193 v = aa + 9*(adiag[i+1]+1); 1194 vi = aj + adiag[i+1]+1; 1195 nz = adiag[i] - adiag[i+1] - 1; 1196 idt = 3*i; 1197 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 1198 for (m=0; m<nz; m++) { 1199 idx = 3*vi[m]; 1200 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1201 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 1202 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 1203 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 1204 v += 9; 1205 } 1206 idc = 3*c[i]; 1207 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 1208 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 1209 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 1210 } 1211 CHKERRQ(ISRestoreIndices(isrow,&rout)); 1212 CHKERRQ(ISRestoreIndices(iscol,&cout)); 1213 CHKERRQ(VecRestoreArrayRead(bb,&b)); 1214 CHKERRQ(VecRestoreArray(xx,&x)); 1215 CHKERRQ(PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n)); 1216 PetscFunctionReturn(0); 1217 } 1218 1219 PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 1220 { 1221 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 1222 IS iscol=a->col,isrow=a->row; 1223 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1224 PetscInt i,nz,idx,idt,idc; 1225 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 1226 const MatScalar *aa=a->a,*v; 1227 PetscScalar *x,s1,s2,x1,x2,*t; 1228 const PetscScalar *b; 1229 1230 PetscFunctionBegin; 1231 CHKERRQ(VecGetArrayRead(bb,&b)); 1232 CHKERRQ(VecGetArray(xx,&x)); 1233 t = a->solve_work; 1234 1235 CHKERRQ(ISGetIndices(isrow,&rout)); r = rout; 1236 CHKERRQ(ISGetIndices(iscol,&cout)); c = cout + (n-1); 1237 1238 /* forward solve the lower triangular */ 1239 idx = 2*(*r++); 1240 t[0] = b[idx]; t[1] = b[1+idx]; 1241 for (i=1; i<n; i++) { 1242 v = aa + 4*ai[i]; 1243 vi = aj + ai[i]; 1244 nz = diag[i] - ai[i]; 1245 idx = 2*(*r++); 1246 s1 = b[idx]; s2 = b[1+idx]; 1247 while (nz--) { 1248 idx = 2*(*vi++); 1249 x1 = t[idx]; x2 = t[1+idx]; 1250 s1 -= v[0]*x1 + v[2]*x2; 1251 s2 -= v[1]*x1 + v[3]*x2; 1252 v += 4; 1253 } 1254 idx = 2*i; 1255 t[idx] = s1; t[1+idx] = s2; 1256 } 1257 /* backward solve the upper triangular */ 1258 for (i=n-1; i>=0; i--) { 1259 v = aa + 4*diag[i] + 4; 1260 vi = aj + diag[i] + 1; 1261 nz = ai[i+1] - diag[i] - 1; 1262 idt = 2*i; 1263 s1 = t[idt]; s2 = t[1+idt]; 1264 while (nz--) { 1265 idx = 2*(*vi++); 1266 x1 = t[idx]; x2 = t[1+idx]; 1267 s1 -= v[0]*x1 + v[2]*x2; 1268 s2 -= v[1]*x1 + v[3]*x2; 1269 v += 4; 1270 } 1271 idc = 2*(*c--); 1272 v = aa + 4*diag[i]; 1273 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 1274 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 1275 } 1276 CHKERRQ(ISRestoreIndices(isrow,&rout)); 1277 CHKERRQ(ISRestoreIndices(iscol,&cout)); 1278 CHKERRQ(VecRestoreArrayRead(bb,&b)); 1279 CHKERRQ(VecRestoreArray(xx,&x)); 1280 CHKERRQ(PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n)); 1281 PetscFunctionReturn(0); 1282 } 1283 1284 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 1285 { 1286 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 1287 IS iscol=a->col,isrow=a->row; 1288 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 1289 PetscInt i,nz,idx,jdx,idt,idc,m; 1290 const PetscInt *r,*c,*rout,*cout; 1291 const MatScalar *aa=a->a,*v; 1292 PetscScalar *x,s1,s2,x1,x2,*t; 1293 const PetscScalar *b; 1294 1295 PetscFunctionBegin; 1296 CHKERRQ(VecGetArrayRead(bb,&b)); 1297 CHKERRQ(VecGetArray(xx,&x)); 1298 t = a->solve_work; 1299 1300 CHKERRQ(ISGetIndices(isrow,&rout)); r = rout; 1301 CHKERRQ(ISGetIndices(iscol,&cout)); c = cout; 1302 1303 /* forward solve the lower triangular */ 1304 idx = 2*r[0]; 1305 t[0] = b[idx]; t[1] = b[1+idx]; 1306 for (i=1; i<n; i++) { 1307 v = aa + 4*ai[i]; 1308 vi = aj + ai[i]; 1309 nz = ai[i+1] - ai[i]; 1310 idx = 2*r[i]; 1311 s1 = b[idx]; s2 = b[1+idx]; 1312 for (m=0; m<nz; m++) { 1313 jdx = 2*vi[m]; 1314 x1 = t[jdx]; x2 = t[1+jdx]; 1315 s1 -= v[0]*x1 + v[2]*x2; 1316 s2 -= v[1]*x1 + v[3]*x2; 1317 v += 4; 1318 } 1319 idx = 2*i; 1320 t[idx] = s1; t[1+idx] = s2; 1321 } 1322 /* backward solve the upper triangular */ 1323 for (i=n-1; i>=0; i--) { 1324 v = aa + 4*(adiag[i+1]+1); 1325 vi = aj + adiag[i+1]+1; 1326 nz = adiag[i] - adiag[i+1] - 1; 1327 idt = 2*i; 1328 s1 = t[idt]; s2 = t[1+idt]; 1329 for (m=0; m<nz; m++) { 1330 idx = 2*vi[m]; 1331 x1 = t[idx]; x2 = t[1+idx]; 1332 s1 -= v[0]*x1 + v[2]*x2; 1333 s2 -= v[1]*x1 + v[3]*x2; 1334 v += 4; 1335 } 1336 idc = 2*c[i]; 1337 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 1338 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 1339 } 1340 CHKERRQ(ISRestoreIndices(isrow,&rout)); 1341 CHKERRQ(ISRestoreIndices(iscol,&cout)); 1342 CHKERRQ(VecRestoreArrayRead(bb,&b)); 1343 CHKERRQ(VecRestoreArray(xx,&x)); 1344 CHKERRQ(PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n)); 1345 PetscFunctionReturn(0); 1346 } 1347 1348 PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 1349 { 1350 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 1351 IS iscol=a->col,isrow=a->row; 1352 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1353 PetscInt i,nz; 1354 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 1355 const MatScalar *aa=a->a,*v; 1356 PetscScalar *x,s1,*t; 1357 const PetscScalar *b; 1358 1359 PetscFunctionBegin; 1360 if (!n) PetscFunctionReturn(0); 1361 1362 CHKERRQ(VecGetArrayRead(bb,&b)); 1363 CHKERRQ(VecGetArray(xx,&x)); 1364 t = a->solve_work; 1365 1366 CHKERRQ(ISGetIndices(isrow,&rout)); r = rout; 1367 CHKERRQ(ISGetIndices(iscol,&cout)); c = cout + (n-1); 1368 1369 /* forward solve the lower triangular */ 1370 t[0] = b[*r++]; 1371 for (i=1; i<n; i++) { 1372 v = aa + ai[i]; 1373 vi = aj + ai[i]; 1374 nz = diag[i] - ai[i]; 1375 s1 = b[*r++]; 1376 while (nz--) { 1377 s1 -= (*v++)*t[*vi++]; 1378 } 1379 t[i] = s1; 1380 } 1381 /* backward solve the upper triangular */ 1382 for (i=n-1; i>=0; i--) { 1383 v = aa + diag[i] + 1; 1384 vi = aj + diag[i] + 1; 1385 nz = ai[i+1] - diag[i] - 1; 1386 s1 = t[i]; 1387 while (nz--) { 1388 s1 -= (*v++)*t[*vi++]; 1389 } 1390 x[*c--] = t[i] = aa[diag[i]]*s1; 1391 } 1392 1393 CHKERRQ(ISRestoreIndices(isrow,&rout)); 1394 CHKERRQ(ISRestoreIndices(iscol,&cout)); 1395 CHKERRQ(VecRestoreArrayRead(bb,&b)); 1396 CHKERRQ(VecRestoreArray(xx,&x)); 1397 CHKERRQ(PetscLogFlops(2.0*1*(a->nz) - A->cmap->n)); 1398 PetscFunctionReturn(0); 1399 } 1400 1401 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 1402 { 1403 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1404 IS iscol = a->col,isrow = a->row; 1405 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz; 1406 const PetscInt *rout,*cout,*r,*c; 1407 PetscScalar *x,*tmp,sum; 1408 const PetscScalar *b; 1409 const MatScalar *aa = a->a,*v; 1410 1411 PetscFunctionBegin; 1412 if (!n) PetscFunctionReturn(0); 1413 1414 CHKERRQ(VecGetArrayRead(bb,&b)); 1415 CHKERRQ(VecGetArray(xx,&x)); 1416 tmp = a->solve_work; 1417 1418 CHKERRQ(ISGetIndices(isrow,&rout)); r = rout; 1419 CHKERRQ(ISGetIndices(iscol,&cout)); c = cout; 1420 1421 /* forward solve the lower triangular */ 1422 tmp[0] = b[r[0]]; 1423 v = aa; 1424 vi = aj; 1425 for (i=1; i<n; i++) { 1426 nz = ai[i+1] - ai[i]; 1427 sum = b[r[i]]; 1428 PetscSparseDenseMinusDot(sum,tmp,v,vi,nz); 1429 tmp[i] = sum; 1430 v += nz; vi += nz; 1431 } 1432 1433 /* backward solve the upper triangular */ 1434 for (i=n-1; i>=0; i--) { 1435 v = aa + adiag[i+1]+1; 1436 vi = aj + adiag[i+1]+1; 1437 nz = adiag[i]-adiag[i+1]-1; 1438 sum = tmp[i]; 1439 PetscSparseDenseMinusDot(sum,tmp,v,vi,nz); 1440 x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */ 1441 } 1442 1443 CHKERRQ(ISRestoreIndices(isrow,&rout)); 1444 CHKERRQ(ISRestoreIndices(iscol,&cout)); 1445 CHKERRQ(VecRestoreArrayRead(bb,&b)); 1446 CHKERRQ(VecRestoreArray(xx,&x)); 1447 CHKERRQ(PetscLogFlops(2.0*a->nz - A->cmap->n)); 1448 PetscFunctionReturn(0); 1449 } 1450