1 2 /* 3 This file provides high performance routines for the Inode format (compressed sparse row) 4 by taking advantage of rows with identical nonzero structure (I-nodes). 5 */ 6 #include <../src/mat/impls/aij/seq/aij.h> 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "Mat_CreateColInode" 10 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt *size,PetscInt **ns) 11 { 12 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 13 PetscErrorCode ierr; 14 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 15 16 PetscFunctionBegin; 17 n = A->cmap->n; 18 m = A->rmap->n; 19 ns_row = a->inode.size; 20 21 min_mn = (m < n) ? m : n; 22 if (!ns) { 23 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ; 24 for (; count+1 < n; count++,i++) ; 25 if (count < n) { 26 i++; 27 } 28 *size = i; 29 PetscFunctionReturn(0); 30 } 31 ierr = PetscMalloc1(n+1,&ns_col);CHKERRQ(ierr); 32 33 /* Use the same row structure wherever feasible. */ 34 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 35 ns_col[i] = ns_row[i]; 36 } 37 38 /* if m < n; pad up the remainder with inode_limit */ 39 for (; count+1 < n; count++,i++) { 40 ns_col[i] = 1; 41 } 42 /* The last node is the odd ball. padd it up with the remaining rows; */ 43 if (count < n) { 44 ns_col[i] = n - count; 45 i++; 46 } else if (count > n) { 47 /* Adjust for the over estimation */ 48 ns_col[i-1] += n - count; 49 } 50 *size = i; 51 *ns = ns_col; 52 PetscFunctionReturn(0); 53 } 54 55 56 /* 57 This builds symmetric version of nonzero structure, 58 */ 59 #undef __FUNCT__ 60 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric" 61 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 62 { 63 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 64 PetscErrorCode ierr; 65 PetscInt *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n; 66 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2; 67 const PetscInt *j,*jmax,*ai= a->i,*aj = a->j; 68 69 PetscFunctionBegin; 70 nslim_row = a->inode.node_count; 71 m = A->rmap->n; 72 n = A->cmap->n; 73 if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 74 75 /* Use the row_inode as column_inode */ 76 nslim_col = nslim_row; 77 ns_col = ns_row; 78 79 /* allocate space for reformated inode structure */ 80 ierr = PetscMalloc1(nslim_col+1,&tns);CHKERRQ(ierr); 81 ierr = PetscMalloc1(n+1,&tvc);CHKERRQ(ierr); 82 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 83 84 for (i1=0,col=0; i1<nslim_col; ++i1) { 85 nsz = ns_col[i1]; 86 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 87 } 88 /* allocate space for row pointers */ 89 ierr = PetscMalloc1(nslim_row+1,&ia);CHKERRQ(ierr); 90 *iia = ia; 91 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 92 ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr); 93 94 /* determine the number of columns in each row */ 95 ia[0] = oshift; 96 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 97 98 j = aj + ai[row] + ishift; 99 jmax = aj + ai[row+1] + ishift; 100 col = *j++ + ishift; 101 i2 = tvc[col]; 102 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 103 ia[i1+1]++; 104 ia[i2+1]++; 105 i2++; /* Start col of next node */ 106 while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j; 107 i2 = tvc[col]; 108 } 109 if (i2 == i1) ia[i2+1]++; /* now the diagonal element */ 110 } 111 112 /* shift ia[i] to point to next row */ 113 for (i1=1; i1<nslim_row+1; i1++) { 114 row = ia[i1-1]; 115 ia[i1] += row; 116 work[i1-1] = row - oshift; 117 } 118 119 /* allocate space for column pointers */ 120 nz = ia[nslim_row] + (!ishift); 121 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr); 122 *jja = ja; 123 124 /* loop over lower triangular part putting into ja */ 125 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 126 j = aj + ai[row] + ishift; 127 jmax = aj + ai[row+1] + ishift; 128 col = *j++ + ishift; 129 i2 = tvc[col]; 130 while (i2<i1 && j<jmax) { 131 ja[work[i2]++] = i1 + oshift; 132 ja[work[i1]++] = i2 + oshift; 133 ++i2; 134 while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */ 135 i2 = tvc[col]; 136 } 137 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 138 139 } 140 ierr = PetscFree(work);CHKERRQ(ierr); 141 ierr = PetscFree(tns);CHKERRQ(ierr); 142 ierr = PetscFree(tvc);CHKERRQ(ierr); 143 PetscFunctionReturn(0); 144 } 145 146 /* 147 This builds nonsymmetric version of nonzero structure, 148 */ 149 #undef __FUNCT__ 150 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric" 151 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 152 { 153 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 154 PetscErrorCode ierr; 155 PetscInt *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col; 156 PetscInt *tns,*tvc,nsz,i1,i2; 157 const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size; 158 159 PetscFunctionBegin; 160 nslim_row = a->inode.node_count; 161 n = A->cmap->n; 162 163 /* Create The column_inode for this matrix */ 164 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 165 166 /* allocate space for reformated column_inode structure */ 167 ierr = PetscMalloc1(nslim_col +1,&tns);CHKERRQ(ierr); 168 ierr = PetscMalloc1(n + 1,&tvc);CHKERRQ(ierr); 169 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 170 171 for (i1=0,col=0; i1<nslim_col; ++i1) { 172 nsz = ns_col[i1]; 173 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 174 } 175 /* allocate space for row pointers */ 176 ierr = PetscMalloc1(nslim_row+1,&ia);CHKERRQ(ierr); 177 *iia = ia; 178 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 179 ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr); 180 181 /* determine the number of columns in each row */ 182 ia[0] = oshift; 183 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 184 j = aj + ai[row] + ishift; 185 col = *j++ + ishift; 186 i2 = tvc[col]; 187 nz = ai[row+1] - ai[row]; 188 while (nz-- > 0) { /* off-diagonal elemets */ 189 ia[i1+1]++; 190 i2++; /* Start col of next node */ 191 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 192 if (nz > 0) i2 = tvc[col]; 193 } 194 } 195 196 /* shift ia[i] to point to next row */ 197 for (i1=1; i1<nslim_row+1; i1++) { 198 row = ia[i1-1]; 199 ia[i1] += row; 200 work[i1-1] = row - oshift; 201 } 202 203 /* allocate space for column pointers */ 204 nz = ia[nslim_row] + (!ishift); 205 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr); 206 *jja = ja; 207 208 /* loop over matrix putting into ja */ 209 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 210 j = aj + ai[row] + ishift; 211 col = *j++ + ishift; 212 i2 = tvc[col]; 213 nz = ai[row+1] - ai[row]; 214 while (nz-- > 0) { 215 ja[work[i1]++] = i2 + oshift; 216 ++i2; 217 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 218 if (nz > 0) i2 = tvc[col]; 219 } 220 } 221 ierr = PetscFree(ns_col);CHKERRQ(ierr); 222 ierr = PetscFree(work);CHKERRQ(ierr); 223 ierr = PetscFree(tns);CHKERRQ(ierr); 224 ierr = PetscFree(tvc);CHKERRQ(ierr); 225 PetscFunctionReturn(0); 226 } 227 228 #undef __FUNCT__ 229 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode" 230 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 231 { 232 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 233 PetscErrorCode ierr; 234 235 PetscFunctionBegin; 236 *n = a->inode.node_count; 237 if (!ia) PetscFunctionReturn(0); 238 if (!blockcompressed) { 239 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 240 } else if (symmetric) { 241 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 242 } else { 243 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 244 } 245 PetscFunctionReturn(0); 246 } 247 248 #undef __FUNCT__ 249 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode" 250 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 251 { 252 PetscErrorCode ierr; 253 254 PetscFunctionBegin; 255 if (!ia) PetscFunctionReturn(0); 256 257 if (!blockcompressed) { 258 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 259 } else { 260 ierr = PetscFree(*ia);CHKERRQ(ierr); 261 ierr = PetscFree(*ja);CHKERRQ(ierr); 262 } 263 PetscFunctionReturn(0); 264 } 265 266 /* ----------------------------------------------------------- */ 267 268 #undef __FUNCT__ 269 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric" 270 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 271 { 272 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 273 PetscErrorCode ierr; 274 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 275 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 276 277 PetscFunctionBegin; 278 nslim_row = a->inode.node_count; 279 n = A->cmap->n; 280 281 /* Create The column_inode for this matrix */ 282 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 283 284 /* allocate space for reformated column_inode structure */ 285 ierr = PetscMalloc1(nslim_col + 1,&tns);CHKERRQ(ierr); 286 ierr = PetscMalloc1(n + 1,&tvc);CHKERRQ(ierr); 287 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 288 289 for (i1=0,col=0; i1<nslim_col; ++i1) { 290 nsz = ns_col[i1]; 291 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 292 } 293 /* allocate space for column pointers */ 294 ierr = PetscMalloc1(nslim_col+1,&ia);CHKERRQ(ierr); 295 *iia = ia; 296 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 297 ierr = PetscMalloc1(nslim_col+1,&work);CHKERRQ(ierr); 298 299 /* determine the number of columns in each row */ 300 ia[0] = oshift; 301 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 302 j = aj + ai[row] + ishift; 303 col = *j++ + ishift; 304 i2 = tvc[col]; 305 nz = ai[row+1] - ai[row]; 306 while (nz-- > 0) { /* off-diagonal elemets */ 307 /* ia[i1+1]++; */ 308 ia[i2+1]++; 309 i2++; 310 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 311 if (nz > 0) i2 = tvc[col]; 312 } 313 } 314 315 /* shift ia[i] to point to next col */ 316 for (i1=1; i1<nslim_col+1; i1++) { 317 col = ia[i1-1]; 318 ia[i1] += col; 319 work[i1-1] = col - oshift; 320 } 321 322 /* allocate space for column pointers */ 323 nz = ia[nslim_col] + (!ishift); 324 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr); 325 *jja = ja; 326 327 /* loop over matrix putting into ja */ 328 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 329 j = aj + ai[row] + ishift; 330 col = *j++ + ishift; 331 i2 = tvc[col]; 332 nz = ai[row+1] - ai[row]; 333 while (nz-- > 0) { 334 /* ja[work[i1]++] = i2 + oshift; */ 335 ja[work[i2]++] = i1 + oshift; 336 i2++; 337 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 338 if (nz > 0) i2 = tvc[col]; 339 } 340 } 341 ierr = PetscFree(ns_col);CHKERRQ(ierr); 342 ierr = PetscFree(work);CHKERRQ(ierr); 343 ierr = PetscFree(tns);CHKERRQ(ierr); 344 ierr = PetscFree(tvc);CHKERRQ(ierr); 345 PetscFunctionReturn(0); 346 } 347 348 #undef __FUNCT__ 349 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode" 350 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 351 { 352 PetscErrorCode ierr; 353 354 PetscFunctionBegin; 355 ierr = Mat_CreateColInode(A,n,NULL);CHKERRQ(ierr); 356 if (!ia) PetscFunctionReturn(0); 357 358 if (!blockcompressed) { 359 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 360 } else if (symmetric) { 361 /* Since the indices are symmetric it does'nt matter */ 362 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 363 } else { 364 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 365 } 366 PetscFunctionReturn(0); 367 } 368 369 #undef __FUNCT__ 370 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode" 371 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 372 { 373 PetscErrorCode ierr; 374 375 PetscFunctionBegin; 376 if (!ia) PetscFunctionReturn(0); 377 if (!blockcompressed) { 378 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 379 } else { 380 ierr = PetscFree(*ia);CHKERRQ(ierr); 381 ierr = PetscFree(*ja);CHKERRQ(ierr); 382 } 383 PetscFunctionReturn(0); 384 } 385 386 /* ----------------------------------------------------------- */ 387 388 #undef __FUNCT__ 389 #define __FUNCT__ "MatMult_SeqAIJ_Inode" 390 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 391 { 392 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 393 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 394 PetscScalar *y; 395 const PetscScalar *x; 396 const MatScalar *v1,*v2,*v3,*v4,*v5; 397 PetscErrorCode ierr; 398 PetscInt i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0; 399 const PetscInt *idx,*ns,*ii; 400 401 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 402 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 403 #endif 404 405 PetscFunctionBegin; 406 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 407 node_max = a->inode.node_count; 408 ns = a->inode.size; /* Node Size array */ 409 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 410 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 411 idx = a->j; 412 v1 = a->a; 413 ii = a->i; 414 415 for (i = 0,row = 0; i< node_max; ++i) { 416 nsz = ns[i]; 417 n = ii[1] - ii[0]; 418 nonzerorow += (n>0)*nsz; 419 ii += nsz; 420 PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ 421 PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ 422 sz = n; /* No of non zeros in this row */ 423 /* Switch on the size of Node */ 424 switch (nsz) { /* Each loop in 'case' is unrolled */ 425 case 1: 426 sum1 = 0.; 427 428 for (n = 0; n< sz-1; n+=2) { 429 i1 = idx[0]; /* The instructions are ordered to */ 430 i2 = idx[1]; /* make the compiler's job easy */ 431 idx += 2; 432 tmp0 = x[i1]; 433 tmp1 = x[i2]; 434 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 435 } 436 437 if (n == sz-1) { /* Take care of the last nonzero */ 438 tmp0 = x[*idx++]; 439 sum1 += *v1++ *tmp0; 440 } 441 y[row++]=sum1; 442 break; 443 case 2: 444 sum1 = 0.; 445 sum2 = 0.; 446 v2 = v1 + n; 447 448 for (n = 0; n< sz-1; n+=2) { 449 i1 = idx[0]; 450 i2 = idx[1]; 451 idx += 2; 452 tmp0 = x[i1]; 453 tmp1 = x[i2]; 454 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 455 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 456 } 457 if (n == sz-1) { 458 tmp0 = x[*idx++]; 459 sum1 += *v1++ * tmp0; 460 sum2 += *v2++ * tmp0; 461 } 462 y[row++]=sum1; 463 y[row++]=sum2; 464 v1 =v2; /* Since the next block to be processed starts there*/ 465 idx +=sz; 466 break; 467 case 3: 468 sum1 = 0.; 469 sum2 = 0.; 470 sum3 = 0.; 471 v2 = v1 + n; 472 v3 = v2 + n; 473 474 for (n = 0; n< sz-1; n+=2) { 475 i1 = idx[0]; 476 i2 = idx[1]; 477 idx += 2; 478 tmp0 = x[i1]; 479 tmp1 = x[i2]; 480 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 481 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 482 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 483 } 484 if (n == sz-1) { 485 tmp0 = x[*idx++]; 486 sum1 += *v1++ * tmp0; 487 sum2 += *v2++ * tmp0; 488 sum3 += *v3++ * tmp0; 489 } 490 y[row++]=sum1; 491 y[row++]=sum2; 492 y[row++]=sum3; 493 v1 =v3; /* Since the next block to be processed starts there*/ 494 idx +=2*sz; 495 break; 496 case 4: 497 sum1 = 0.; 498 sum2 = 0.; 499 sum3 = 0.; 500 sum4 = 0.; 501 v2 = v1 + n; 502 v3 = v2 + n; 503 v4 = v3 + n; 504 505 for (n = 0; n< sz-1; n+=2) { 506 i1 = idx[0]; 507 i2 = idx[1]; 508 idx += 2; 509 tmp0 = x[i1]; 510 tmp1 = x[i2]; 511 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 512 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 513 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 514 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 515 } 516 if (n == sz-1) { 517 tmp0 = x[*idx++]; 518 sum1 += *v1++ * tmp0; 519 sum2 += *v2++ * tmp0; 520 sum3 += *v3++ * tmp0; 521 sum4 += *v4++ * tmp0; 522 } 523 y[row++]=sum1; 524 y[row++]=sum2; 525 y[row++]=sum3; 526 y[row++]=sum4; 527 v1 =v4; /* Since the next block to be processed starts there*/ 528 idx +=3*sz; 529 break; 530 case 5: 531 sum1 = 0.; 532 sum2 = 0.; 533 sum3 = 0.; 534 sum4 = 0.; 535 sum5 = 0.; 536 v2 = v1 + n; 537 v3 = v2 + n; 538 v4 = v3 + n; 539 v5 = v4 + n; 540 541 for (n = 0; n<sz-1; n+=2) { 542 i1 = idx[0]; 543 i2 = idx[1]; 544 idx += 2; 545 tmp0 = x[i1]; 546 tmp1 = x[i2]; 547 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 548 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 549 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 550 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 551 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 552 } 553 if (n == sz-1) { 554 tmp0 = x[*idx++]; 555 sum1 += *v1++ * tmp0; 556 sum2 += *v2++ * tmp0; 557 sum3 += *v3++ * tmp0; 558 sum4 += *v4++ * tmp0; 559 sum5 += *v5++ * tmp0; 560 } 561 y[row++]=sum1; 562 y[row++]=sum2; 563 y[row++]=sum3; 564 y[row++]=sum4; 565 y[row++]=sum5; 566 v1 =v5; /* Since the next block to be processed starts there */ 567 idx +=4*sz; 568 break; 569 default: 570 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 571 } 572 } 573 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 574 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 575 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 576 PetscFunctionReturn(0); 577 } 578 /* ----------------------------------------------------------- */ 579 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 580 #undef __FUNCT__ 581 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode" 582 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 583 { 584 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 585 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 586 const MatScalar *v1,*v2,*v3,*v4,*v5; 587 const PetscScalar *x; 588 PetscScalar *y,*z,*zt; 589 PetscErrorCode ierr; 590 PetscInt i1,i2,n,i,row,node_max,nsz,sz; 591 const PetscInt *idx,*ns,*ii; 592 593 PetscFunctionBegin; 594 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 595 node_max = a->inode.node_count; 596 ns = a->inode.size; /* Node Size array */ 597 598 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 599 ierr = VecGetArrayPair(zz,yy,&z,&y);CHKERRQ(ierr); 600 zt = z; 601 602 idx = a->j; 603 v1 = a->a; 604 ii = a->i; 605 606 for (i = 0,row = 0; i< node_max; ++i) { 607 nsz = ns[i]; 608 n = ii[1] - ii[0]; 609 ii += nsz; 610 sz = n; /* No of non zeros in this row */ 611 /* Switch on the size of Node */ 612 switch (nsz) { /* Each loop in 'case' is unrolled */ 613 case 1: 614 sum1 = *zt++; 615 616 for (n = 0; n< sz-1; n+=2) { 617 i1 = idx[0]; /* The instructions are ordered to */ 618 i2 = idx[1]; /* make the compiler's job easy */ 619 idx += 2; 620 tmp0 = x[i1]; 621 tmp1 = x[i2]; 622 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 623 } 624 625 if (n == sz-1) { /* Take care of the last nonzero */ 626 tmp0 = x[*idx++]; 627 sum1 += *v1++ * tmp0; 628 } 629 y[row++]=sum1; 630 break; 631 case 2: 632 sum1 = *zt++; 633 sum2 = *zt++; 634 v2 = v1 + n; 635 636 for (n = 0; n< sz-1; n+=2) { 637 i1 = idx[0]; 638 i2 = idx[1]; 639 idx += 2; 640 tmp0 = x[i1]; 641 tmp1 = x[i2]; 642 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 643 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 644 } 645 if (n == sz-1) { 646 tmp0 = x[*idx++]; 647 sum1 += *v1++ * tmp0; 648 sum2 += *v2++ * tmp0; 649 } 650 y[row++]=sum1; 651 y[row++]=sum2; 652 v1 =v2; /* Since the next block to be processed starts there*/ 653 idx +=sz; 654 break; 655 case 3: 656 sum1 = *zt++; 657 sum2 = *zt++; 658 sum3 = *zt++; 659 v2 = v1 + n; 660 v3 = v2 + n; 661 662 for (n = 0; n< sz-1; n+=2) { 663 i1 = idx[0]; 664 i2 = idx[1]; 665 idx += 2; 666 tmp0 = x[i1]; 667 tmp1 = x[i2]; 668 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 669 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 670 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 671 } 672 if (n == sz-1) { 673 tmp0 = x[*idx++]; 674 sum1 += *v1++ * tmp0; 675 sum2 += *v2++ * tmp0; 676 sum3 += *v3++ * tmp0; 677 } 678 y[row++]=sum1; 679 y[row++]=sum2; 680 y[row++]=sum3; 681 v1 =v3; /* Since the next block to be processed starts there*/ 682 idx +=2*sz; 683 break; 684 case 4: 685 sum1 = *zt++; 686 sum2 = *zt++; 687 sum3 = *zt++; 688 sum4 = *zt++; 689 v2 = v1 + n; 690 v3 = v2 + n; 691 v4 = v3 + n; 692 693 for (n = 0; n< sz-1; n+=2) { 694 i1 = idx[0]; 695 i2 = idx[1]; 696 idx += 2; 697 tmp0 = x[i1]; 698 tmp1 = x[i2]; 699 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 700 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 701 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 702 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 703 } 704 if (n == sz-1) { 705 tmp0 = x[*idx++]; 706 sum1 += *v1++ * tmp0; 707 sum2 += *v2++ * tmp0; 708 sum3 += *v3++ * tmp0; 709 sum4 += *v4++ * tmp0; 710 } 711 y[row++]=sum1; 712 y[row++]=sum2; 713 y[row++]=sum3; 714 y[row++]=sum4; 715 v1 =v4; /* Since the next block to be processed starts there*/ 716 idx +=3*sz; 717 break; 718 case 5: 719 sum1 = *zt++; 720 sum2 = *zt++; 721 sum3 = *zt++; 722 sum4 = *zt++; 723 sum5 = *zt++; 724 v2 = v1 + n; 725 v3 = v2 + n; 726 v4 = v3 + n; 727 v5 = v4 + n; 728 729 for (n = 0; n<sz-1; n+=2) { 730 i1 = idx[0]; 731 i2 = idx[1]; 732 idx += 2; 733 tmp0 = x[i1]; 734 tmp1 = x[i2]; 735 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 736 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 737 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 738 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 739 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 740 } 741 if (n == sz-1) { 742 tmp0 = x[*idx++]; 743 sum1 += *v1++ * tmp0; 744 sum2 += *v2++ * tmp0; 745 sum3 += *v3++ * tmp0; 746 sum4 += *v4++ * tmp0; 747 sum5 += *v5++ * tmp0; 748 } 749 y[row++]=sum1; 750 y[row++]=sum2; 751 y[row++]=sum3; 752 y[row++]=sum4; 753 y[row++]=sum5; 754 v1 =v5; /* Since the next block to be processed starts there */ 755 idx +=4*sz; 756 break; 757 default: 758 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 759 } 760 } 761 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 762 ierr = VecRestoreArrayPair(zz,yy,&z,&y);CHKERRQ(ierr); 763 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 764 PetscFunctionReturn(0); 765 } 766 767 /* ----------------------------------------------------------- */ 768 #undef __FUNCT__ 769 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace" 770 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx) 771 { 772 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 773 IS iscol = a->col,isrow = a->row; 774 PetscErrorCode ierr; 775 const PetscInt *r,*c,*rout,*cout; 776 PetscInt i,j,n = A->rmap->n,nz; 777 PetscInt node_max,*ns,row,nsz,aii,i0,i1; 778 const PetscInt *ai = a->i,*a_j = a->j,*vi,*ad,*aj; 779 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 780 PetscScalar sum1,sum2,sum3,sum4,sum5; 781 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 782 const PetscScalar *b; 783 784 PetscFunctionBegin; 785 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 786 node_max = a->inode.node_count; 787 ns = a->inode.size; /* Node Size array */ 788 789 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 790 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 791 tmp = a->solve_work; 792 793 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 794 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 795 796 /* forward solve the lower triangular */ 797 tmps = tmp; 798 aa = a_a; 799 aj = a_j; 800 ad = a->diag; 801 802 for (i = 0,row = 0; i< node_max; ++i) { 803 nsz = ns[i]; 804 aii = ai[row]; 805 v1 = aa + aii; 806 vi = aj + aii; 807 nz = ad[row]- aii; 808 if (i < node_max-1) { 809 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 810 * but our indexing to determine it's size could. */ 811 PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 812 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 813 PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 814 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 815 } 816 817 switch (nsz) { /* Each loop in 'case' is unrolled */ 818 case 1: 819 sum1 = b[*r++]; 820 for (j=0; j<nz-1; j+=2) { 821 i0 = vi[0]; 822 i1 = vi[1]; 823 vi +=2; 824 tmp0 = tmps[i0]; 825 tmp1 = tmps[i1]; 826 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 827 } 828 if (j == nz-1) { 829 tmp0 = tmps[*vi++]; 830 sum1 -= *v1++ *tmp0; 831 } 832 tmp[row++]=sum1; 833 break; 834 case 2: 835 sum1 = b[*r++]; 836 sum2 = b[*r++]; 837 v2 = aa + ai[row+1]; 838 839 for (j=0; j<nz-1; j+=2) { 840 i0 = vi[0]; 841 i1 = vi[1]; 842 vi +=2; 843 tmp0 = tmps[i0]; 844 tmp1 = tmps[i1]; 845 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 846 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 847 } 848 if (j == nz-1) { 849 tmp0 = tmps[*vi++]; 850 sum1 -= *v1++ *tmp0; 851 sum2 -= *v2++ *tmp0; 852 } 853 sum2 -= *v2++ *sum1; 854 tmp[row++]=sum1; 855 tmp[row++]=sum2; 856 break; 857 case 3: 858 sum1 = b[*r++]; 859 sum2 = b[*r++]; 860 sum3 = b[*r++]; 861 v2 = aa + ai[row+1]; 862 v3 = aa + ai[row+2]; 863 864 for (j=0; j<nz-1; j+=2) { 865 i0 = vi[0]; 866 i1 = vi[1]; 867 vi +=2; 868 tmp0 = tmps[i0]; 869 tmp1 = tmps[i1]; 870 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 871 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 872 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 873 } 874 if (j == nz-1) { 875 tmp0 = tmps[*vi++]; 876 sum1 -= *v1++ *tmp0; 877 sum2 -= *v2++ *tmp0; 878 sum3 -= *v3++ *tmp0; 879 } 880 sum2 -= *v2++ * sum1; 881 sum3 -= *v3++ * sum1; 882 sum3 -= *v3++ * sum2; 883 884 tmp[row++]=sum1; 885 tmp[row++]=sum2; 886 tmp[row++]=sum3; 887 break; 888 889 case 4: 890 sum1 = b[*r++]; 891 sum2 = b[*r++]; 892 sum3 = b[*r++]; 893 sum4 = b[*r++]; 894 v2 = aa + ai[row+1]; 895 v3 = aa + ai[row+2]; 896 v4 = aa + ai[row+3]; 897 898 for (j=0; j<nz-1; j+=2) { 899 i0 = vi[0]; 900 i1 = vi[1]; 901 vi +=2; 902 tmp0 = tmps[i0]; 903 tmp1 = tmps[i1]; 904 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 905 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 906 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 907 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 908 } 909 if (j == nz-1) { 910 tmp0 = tmps[*vi++]; 911 sum1 -= *v1++ *tmp0; 912 sum2 -= *v2++ *tmp0; 913 sum3 -= *v3++ *tmp0; 914 sum4 -= *v4++ *tmp0; 915 } 916 sum2 -= *v2++ * sum1; 917 sum3 -= *v3++ * sum1; 918 sum4 -= *v4++ * sum1; 919 sum3 -= *v3++ * sum2; 920 sum4 -= *v4++ * sum2; 921 sum4 -= *v4++ * sum3; 922 923 tmp[row++]=sum1; 924 tmp[row++]=sum2; 925 tmp[row++]=sum3; 926 tmp[row++]=sum4; 927 break; 928 case 5: 929 sum1 = b[*r++]; 930 sum2 = b[*r++]; 931 sum3 = b[*r++]; 932 sum4 = b[*r++]; 933 sum5 = b[*r++]; 934 v2 = aa + ai[row+1]; 935 v3 = aa + ai[row+2]; 936 v4 = aa + ai[row+3]; 937 v5 = aa + ai[row+4]; 938 939 for (j=0; j<nz-1; j+=2) { 940 i0 = vi[0]; 941 i1 = vi[1]; 942 vi +=2; 943 tmp0 = tmps[i0]; 944 tmp1 = tmps[i1]; 945 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 946 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 947 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 948 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 949 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 950 } 951 if (j == nz-1) { 952 tmp0 = tmps[*vi++]; 953 sum1 -= *v1++ *tmp0; 954 sum2 -= *v2++ *tmp0; 955 sum3 -= *v3++ *tmp0; 956 sum4 -= *v4++ *tmp0; 957 sum5 -= *v5++ *tmp0; 958 } 959 960 sum2 -= *v2++ * sum1; 961 sum3 -= *v3++ * sum1; 962 sum4 -= *v4++ * sum1; 963 sum5 -= *v5++ * sum1; 964 sum3 -= *v3++ * sum2; 965 sum4 -= *v4++ * sum2; 966 sum5 -= *v5++ * sum2; 967 sum4 -= *v4++ * sum3; 968 sum5 -= *v5++ * sum3; 969 sum5 -= *v5++ * sum4; 970 971 tmp[row++]=sum1; 972 tmp[row++]=sum2; 973 tmp[row++]=sum3; 974 tmp[row++]=sum4; 975 tmp[row++]=sum5; 976 break; 977 default: 978 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 979 } 980 } 981 /* backward solve the upper triangular */ 982 for (i=node_max -1,row = n-1; i>=0; i--) { 983 nsz = ns[i]; 984 aii = ai[row+1] -1; 985 v1 = aa + aii; 986 vi = aj + aii; 987 nz = aii- ad[row]; 988 switch (nsz) { /* Each loop in 'case' is unrolled */ 989 case 1: 990 sum1 = tmp[row]; 991 992 for (j=nz; j>1; j-=2) { 993 vi -=2; 994 i0 = vi[2]; 995 i1 = vi[1]; 996 tmp0 = tmps[i0]; 997 tmp1 = tmps[i1]; 998 v1 -= 2; 999 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1000 } 1001 if (j==1) { 1002 tmp0 = tmps[*vi--]; 1003 sum1 -= *v1-- * tmp0; 1004 } 1005 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1006 break; 1007 case 2: 1008 sum1 = tmp[row]; 1009 sum2 = tmp[row -1]; 1010 v2 = aa + ai[row]-1; 1011 for (j=nz; j>1; j-=2) { 1012 vi -=2; 1013 i0 = vi[2]; 1014 i1 = vi[1]; 1015 tmp0 = tmps[i0]; 1016 tmp1 = tmps[i1]; 1017 v1 -= 2; 1018 v2 -= 2; 1019 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1020 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1021 } 1022 if (j==1) { 1023 tmp0 = tmps[*vi--]; 1024 sum1 -= *v1-- * tmp0; 1025 sum2 -= *v2-- * tmp0; 1026 } 1027 1028 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1029 sum2 -= *v2-- * tmp0; 1030 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1031 break; 1032 case 3: 1033 sum1 = tmp[row]; 1034 sum2 = tmp[row -1]; 1035 sum3 = tmp[row -2]; 1036 v2 = aa + ai[row]-1; 1037 v3 = aa + ai[row -1]-1; 1038 for (j=nz; j>1; j-=2) { 1039 vi -=2; 1040 i0 = vi[2]; 1041 i1 = vi[1]; 1042 tmp0 = tmps[i0]; 1043 tmp1 = tmps[i1]; 1044 v1 -= 2; 1045 v2 -= 2; 1046 v3 -= 2; 1047 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1048 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1049 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1050 } 1051 if (j==1) { 1052 tmp0 = tmps[*vi--]; 1053 sum1 -= *v1-- * tmp0; 1054 sum2 -= *v2-- * tmp0; 1055 sum3 -= *v3-- * tmp0; 1056 } 1057 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1058 sum2 -= *v2-- * tmp0; 1059 sum3 -= *v3-- * tmp0; 1060 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1061 sum3 -= *v3-- * tmp0; 1062 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1063 1064 break; 1065 case 4: 1066 sum1 = tmp[row]; 1067 sum2 = tmp[row -1]; 1068 sum3 = tmp[row -2]; 1069 sum4 = tmp[row -3]; 1070 v2 = aa + ai[row]-1; 1071 v3 = aa + ai[row -1]-1; 1072 v4 = aa + ai[row -2]-1; 1073 1074 for (j=nz; j>1; j-=2) { 1075 vi -=2; 1076 i0 = vi[2]; 1077 i1 = vi[1]; 1078 tmp0 = tmps[i0]; 1079 tmp1 = tmps[i1]; 1080 v1 -= 2; 1081 v2 -= 2; 1082 v3 -= 2; 1083 v4 -= 2; 1084 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1085 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1086 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1087 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1088 } 1089 if (j==1) { 1090 tmp0 = tmps[*vi--]; 1091 sum1 -= *v1-- * tmp0; 1092 sum2 -= *v2-- * tmp0; 1093 sum3 -= *v3-- * tmp0; 1094 sum4 -= *v4-- * tmp0; 1095 } 1096 1097 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1098 sum2 -= *v2-- * tmp0; 1099 sum3 -= *v3-- * tmp0; 1100 sum4 -= *v4-- * tmp0; 1101 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1102 sum3 -= *v3-- * tmp0; 1103 sum4 -= *v4-- * tmp0; 1104 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1105 sum4 -= *v4-- * tmp0; 1106 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1107 break; 1108 case 5: 1109 sum1 = tmp[row]; 1110 sum2 = tmp[row -1]; 1111 sum3 = tmp[row -2]; 1112 sum4 = tmp[row -3]; 1113 sum5 = tmp[row -4]; 1114 v2 = aa + ai[row]-1; 1115 v3 = aa + ai[row -1]-1; 1116 v4 = aa + ai[row -2]-1; 1117 v5 = aa + ai[row -3]-1; 1118 for (j=nz; j>1; j-=2) { 1119 vi -= 2; 1120 i0 = vi[2]; 1121 i1 = vi[1]; 1122 tmp0 = tmps[i0]; 1123 tmp1 = tmps[i1]; 1124 v1 -= 2; 1125 v2 -= 2; 1126 v3 -= 2; 1127 v4 -= 2; 1128 v5 -= 2; 1129 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1130 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1131 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1132 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1133 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1134 } 1135 if (j==1) { 1136 tmp0 = tmps[*vi--]; 1137 sum1 -= *v1-- * tmp0; 1138 sum2 -= *v2-- * tmp0; 1139 sum3 -= *v3-- * tmp0; 1140 sum4 -= *v4-- * tmp0; 1141 sum5 -= *v5-- * tmp0; 1142 } 1143 1144 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1145 sum2 -= *v2-- * tmp0; 1146 sum3 -= *v3-- * tmp0; 1147 sum4 -= *v4-- * tmp0; 1148 sum5 -= *v5-- * tmp0; 1149 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1150 sum3 -= *v3-- * tmp0; 1151 sum4 -= *v4-- * tmp0; 1152 sum5 -= *v5-- * tmp0; 1153 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1154 sum4 -= *v4-- * tmp0; 1155 sum5 -= *v5-- * tmp0; 1156 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1157 sum5 -= *v5-- * tmp0; 1158 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1159 break; 1160 default: 1161 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 1162 } 1163 } 1164 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1165 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1166 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1167 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1168 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1169 PetscFunctionReturn(0); 1170 } 1171 1172 #undef __FUNCT__ 1173 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode" 1174 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info) 1175 { 1176 Mat C =B; 1177 Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data; 1178 IS isrow = b->row,isicol = b->icol; 1179 PetscErrorCode ierr; 1180 const PetscInt *r,*ic,*ics; 1181 const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag; 1182 PetscInt i,j,k,nz,nzL,row,*pj; 1183 const PetscInt *ajtmp,*bjtmp; 1184 MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4; 1185 const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4; 1186 FactorShiftCtx sctx; 1187 const PetscInt *ddiag; 1188 PetscReal rs; 1189 MatScalar d; 1190 PetscInt inod,nodesz,node_max,col; 1191 const PetscInt *ns; 1192 PetscInt *tmp_vec1,*tmp_vec2,*nsmap; 1193 1194 PetscFunctionBegin; 1195 /* MatPivotSetUp(): initialize shift context sctx */ 1196 ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr); 1197 1198 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1199 ddiag = a->diag; 1200 sctx.shift_top = info->zeropivot; 1201 for (i=0; i<n; i++) { 1202 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1203 d = (aa)[ddiag[i]]; 1204 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1205 v = aa+ai[i]; 1206 nz = ai[i+1] - ai[i]; 1207 for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]); 1208 if (rs>sctx.shift_top) sctx.shift_top = rs; 1209 } 1210 sctx.shift_top *= 1.1; 1211 sctx.nshift_max = 5; 1212 sctx.shift_lo = 0.; 1213 sctx.shift_hi = 1.; 1214 } 1215 1216 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1217 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1218 1219 ierr = PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);CHKERRQ(ierr); 1220 ics = ic; 1221 1222 node_max = a->inode.node_count; 1223 ns = a->inode.size; 1224 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1225 1226 /* If max inode size > 4, split it into two inodes.*/ 1227 /* also map the inode sizes according to the ordering */ 1228 ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr); 1229 for (i=0,j=0; i<node_max; ++i,++j) { 1230 if (ns[i] > 4) { 1231 tmp_vec1[j] = 4; 1232 ++j; 1233 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1234 } else { 1235 tmp_vec1[j] = ns[i]; 1236 } 1237 } 1238 /* Use the correct node_max */ 1239 node_max = j; 1240 1241 /* Now reorder the inode info based on mat re-ordering info */ 1242 /* First create a row -> inode_size_array_index map */ 1243 ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr); 1244 ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr); 1245 for (i=0,row=0; i<node_max; i++) { 1246 nodesz = tmp_vec1[i]; 1247 for (j=0; j<nodesz; j++,row++) { 1248 nsmap[row] = i; 1249 } 1250 } 1251 /* Using nsmap, create a reordered ns structure */ 1252 for (i=0,j=0; i< node_max; i++) { 1253 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1254 tmp_vec2[i] = nodesz; 1255 j += nodesz; 1256 } 1257 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1258 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1259 1260 /* Now use the correct ns */ 1261 ns = tmp_vec2; 1262 1263 do { 1264 sctx.newshift = PETSC_FALSE; 1265 /* Now loop over each block-row, and do the factorization */ 1266 for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */ 1267 nodesz = ns[inod]; 1268 1269 switch (nodesz) { 1270 case 1: 1271 /*----------*/ 1272 /* zero rtmp1 */ 1273 /* L part */ 1274 nz = bi[i+1] - bi[i]; 1275 bjtmp = bj + bi[i]; 1276 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1277 1278 /* U part */ 1279 nz = bdiag[i]-bdiag[i+1]; 1280 bjtmp = bj + bdiag[i+1]+1; 1281 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1282 1283 /* load in initial (unfactored row) */ 1284 nz = ai[r[i]+1] - ai[r[i]]; 1285 ajtmp = aj + ai[r[i]]; 1286 v = aa + ai[r[i]]; 1287 for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j]; 1288 1289 /* ZeropivotApply() */ 1290 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1291 1292 /* elimination */ 1293 bjtmp = bj + bi[i]; 1294 row = *bjtmp++; 1295 nzL = bi[i+1] - bi[i]; 1296 for (k=0; k < nzL; k++) { 1297 pc = rtmp1 + row; 1298 if (*pc != 0.0) { 1299 pv = b->a + bdiag[row]; 1300 mul1 = *pc * (*pv); 1301 *pc = mul1; 1302 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1303 pv = b->a + bdiag[row+1]+1; 1304 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1305 for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1306 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1307 } 1308 row = *bjtmp++; 1309 } 1310 1311 /* finished row so stick it into b->a */ 1312 rs = 0.0; 1313 /* L part */ 1314 pv = b->a + bi[i]; 1315 pj = b->j + bi[i]; 1316 nz = bi[i+1] - bi[i]; 1317 for (j=0; j<nz; j++) { 1318 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1319 } 1320 1321 /* U part */ 1322 pv = b->a + bdiag[i+1]+1; 1323 pj = b->j + bdiag[i+1]+1; 1324 nz = bdiag[i] - bdiag[i+1]-1; 1325 for (j=0; j<nz; j++) { 1326 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1327 } 1328 1329 /* Check zero pivot */ 1330 sctx.rs = rs; 1331 sctx.pv = rtmp1[i]; 1332 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr); 1333 if (sctx.newshift) break; 1334 1335 /* Mark diagonal and invert diagonal for simplier triangular solves */ 1336 pv = b->a + bdiag[i]; 1337 *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1338 break; 1339 1340 case 2: 1341 /*----------*/ 1342 /* zero rtmp1 and rtmp2 */ 1343 /* L part */ 1344 nz = bi[i+1] - bi[i]; 1345 bjtmp = bj + bi[i]; 1346 for (j=0; j<nz; j++) { 1347 col = bjtmp[j]; 1348 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1349 } 1350 1351 /* U part */ 1352 nz = bdiag[i]-bdiag[i+1]; 1353 bjtmp = bj + bdiag[i+1]+1; 1354 for (j=0; j<nz; j++) { 1355 col = bjtmp[j]; 1356 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1357 } 1358 1359 /* load in initial (unfactored row) */ 1360 nz = ai[r[i]+1] - ai[r[i]]; 1361 ajtmp = aj + ai[r[i]]; 1362 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; 1363 for (j=0; j<nz; j++) { 1364 col = ics[ajtmp[j]]; 1365 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; 1366 } 1367 /* ZeropivotApply(): shift the diagonal of the matrix */ 1368 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; 1369 1370 /* elimination */ 1371 bjtmp = bj + bi[i]; 1372 row = *bjtmp++; /* pivot row */ 1373 nzL = bi[i+1] - bi[i]; 1374 for (k=0; k < nzL; k++) { 1375 pc1 = rtmp1 + row; 1376 pc2 = rtmp2 + row; 1377 if (*pc1 != 0.0 || *pc2 != 0.0) { 1378 pv = b->a + bdiag[row]; 1379 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); 1380 *pc1 = mul1; *pc2 = mul2; 1381 1382 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1383 pv = b->a + bdiag[row+1]+1; 1384 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1385 for (j=0; j<nz; j++) { 1386 col = pj[j]; 1387 rtmp1[col] -= mul1 * pv[j]; 1388 rtmp2[col] -= mul2 * pv[j]; 1389 } 1390 ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr); 1391 } 1392 row = *bjtmp++; 1393 } 1394 1395 /* finished row i; check zero pivot, then stick row i into b->a */ 1396 rs = 0.0; 1397 /* L part */ 1398 pc1 = b->a + bi[i]; 1399 pj = b->j + bi[i]; 1400 nz = bi[i+1] - bi[i]; 1401 for (j=0; j<nz; j++) { 1402 col = pj[j]; 1403 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1404 } 1405 /* U part */ 1406 pc1 = b->a + bdiag[i+1]+1; 1407 pj = b->j + bdiag[i+1]+1; 1408 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1409 for (j=0; j<nz; j++) { 1410 col = pj[j]; 1411 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1412 } 1413 1414 sctx.rs = rs; 1415 sctx.pv = rtmp1[i]; 1416 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr); 1417 if (sctx.newshift) break; 1418 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1419 *pc1 = 1.0/sctx.pv; 1420 1421 /* Now take care of diagonal 2x2 block. */ 1422 pc2 = rtmp2 + i; 1423 if (*pc2 != 0.0) { 1424 mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */ 1425 *pc2 = mul1; /* insert L entry */ 1426 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1427 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1428 for (j=0; j<nz; j++) { 1429 col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col]; 1430 } 1431 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1432 } 1433 1434 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1435 rs = 0.0; 1436 /* L part */ 1437 pc2 = b->a + bi[i+1]; 1438 pj = b->j + bi[i+1]; 1439 nz = bi[i+2] - bi[i+1]; 1440 for (j=0; j<nz; j++) { 1441 col = pj[j]; 1442 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1443 } 1444 /* U part */ 1445 pc2 = b->a + bdiag[i+2]+1; 1446 pj = b->j + bdiag[i+2]+1; 1447 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1448 for (j=0; j<nz; j++) { 1449 col = pj[j]; 1450 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1451 } 1452 1453 sctx.rs = rs; 1454 sctx.pv = rtmp2[i+1]; 1455 ierr = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr); 1456 if (sctx.newshift) break; 1457 pc2 = b->a + bdiag[i+1]; 1458 *pc2 = 1.0/sctx.pv; 1459 break; 1460 1461 case 3: 1462 /*----------*/ 1463 /* zero rtmp */ 1464 /* L part */ 1465 nz = bi[i+1] - bi[i]; 1466 bjtmp = bj + bi[i]; 1467 for (j=0; j<nz; j++) { 1468 col = bjtmp[j]; 1469 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1470 } 1471 1472 /* U part */ 1473 nz = bdiag[i]-bdiag[i+1]; 1474 bjtmp = bj + bdiag[i+1]+1; 1475 for (j=0; j<nz; j++) { 1476 col = bjtmp[j]; 1477 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1478 } 1479 1480 /* load in initial (unfactored row) */ 1481 nz = ai[r[i]+1] - ai[r[i]]; 1482 ajtmp = aj + ai[r[i]]; 1483 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; 1484 for (j=0; j<nz; j++) { 1485 col = ics[ajtmp[j]]; 1486 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; 1487 } 1488 /* ZeropivotApply(): shift the diagonal of the matrix */ 1489 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; 1490 1491 /* elimination */ 1492 bjtmp = bj + bi[i]; 1493 row = *bjtmp++; /* pivot row */ 1494 nzL = bi[i+1] - bi[i]; 1495 for (k=0; k < nzL; k++) { 1496 pc1 = rtmp1 + row; 1497 pc2 = rtmp2 + row; 1498 pc3 = rtmp3 + row; 1499 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1500 pv = b->a + bdiag[row]; 1501 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); 1502 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; 1503 1504 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1505 pv = b->a + bdiag[row+1]+1; 1506 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1507 for (j=0; j<nz; j++) { 1508 col = pj[j]; 1509 rtmp1[col] -= mul1 * pv[j]; 1510 rtmp2[col] -= mul2 * pv[j]; 1511 rtmp3[col] -= mul3 * pv[j]; 1512 } 1513 ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr); 1514 } 1515 row = *bjtmp++; 1516 } 1517 1518 /* finished row i; check zero pivot, then stick row i into b->a */ 1519 rs = 0.0; 1520 /* L part */ 1521 pc1 = b->a + bi[i]; 1522 pj = b->j + bi[i]; 1523 nz = bi[i+1] - bi[i]; 1524 for (j=0; j<nz; j++) { 1525 col = pj[j]; 1526 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1527 } 1528 /* U part */ 1529 pc1 = b->a + bdiag[i+1]+1; 1530 pj = b->j + bdiag[i+1]+1; 1531 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1532 for (j=0; j<nz; j++) { 1533 col = pj[j]; 1534 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1535 } 1536 1537 sctx.rs = rs; 1538 sctx.pv = rtmp1[i]; 1539 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr); 1540 if (sctx.newshift) break; 1541 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1542 *pc1 = 1.0/sctx.pv; 1543 1544 /* Now take care of 1st column of diagonal 3x3 block. */ 1545 pc2 = rtmp2 + i; 1546 pc3 = rtmp3 + i; 1547 if (*pc2 != 0.0 || *pc3 != 0.0) { 1548 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1549 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1550 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1551 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1552 for (j=0; j<nz; j++) { 1553 col = pj[j]; 1554 rtmp2[col] -= mul2 * rtmp1[col]; 1555 rtmp3[col] -= mul3 * rtmp1[col]; 1556 } 1557 ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr); 1558 } 1559 1560 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1561 rs = 0.0; 1562 /* L part */ 1563 pc2 = b->a + bi[i+1]; 1564 pj = b->j + bi[i+1]; 1565 nz = bi[i+2] - bi[i+1]; 1566 for (j=0; j<nz; j++) { 1567 col = pj[j]; 1568 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1569 } 1570 /* U part */ 1571 pc2 = b->a + bdiag[i+2]+1; 1572 pj = b->j + bdiag[i+2]+1; 1573 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1574 for (j=0; j<nz; j++) { 1575 col = pj[j]; 1576 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1577 } 1578 1579 sctx.rs = rs; 1580 sctx.pv = rtmp2[i+1]; 1581 ierr = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr); 1582 if (sctx.newshift) break; 1583 pc2 = b->a + bdiag[i+1]; 1584 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1585 1586 /* Now take care of 2nd column of diagonal 3x3 block. */ 1587 pc3 = rtmp3 + i+1; 1588 if (*pc3 != 0.0) { 1589 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1590 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1591 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1592 for (j=0; j<nz; j++) { 1593 col = pj[j]; 1594 rtmp3[col] -= mul3 * rtmp2[col]; 1595 } 1596 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1597 } 1598 1599 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1600 rs = 0.0; 1601 /* L part */ 1602 pc3 = b->a + bi[i+2]; 1603 pj = b->j + bi[i+2]; 1604 nz = bi[i+3] - bi[i+2]; 1605 for (j=0; j<nz; j++) { 1606 col = pj[j]; 1607 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1608 } 1609 /* U part */ 1610 pc3 = b->a + bdiag[i+3]+1; 1611 pj = b->j + bdiag[i+3]+1; 1612 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1613 for (j=0; j<nz; j++) { 1614 col = pj[j]; 1615 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1616 } 1617 1618 sctx.rs = rs; 1619 sctx.pv = rtmp3[i+2]; 1620 ierr = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr); 1621 if (sctx.newshift) break; 1622 pc3 = b->a + bdiag[i+2]; 1623 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1624 break; 1625 case 4: 1626 /*----------*/ 1627 /* zero rtmp */ 1628 /* L part */ 1629 nz = bi[i+1] - bi[i]; 1630 bjtmp = bj + bi[i]; 1631 for (j=0; j<nz; j++) { 1632 col = bjtmp[j]; 1633 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0; 1634 } 1635 1636 /* U part */ 1637 nz = bdiag[i]-bdiag[i+1]; 1638 bjtmp = bj + bdiag[i+1]+1; 1639 for (j=0; j<nz; j++) { 1640 col = bjtmp[j]; 1641 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0; 1642 } 1643 1644 /* load in initial (unfactored row) */ 1645 nz = ai[r[i]+1] - ai[r[i]]; 1646 ajtmp = aj + ai[r[i]]; 1647 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3]; 1648 for (j=0; j<nz; j++) { 1649 col = ics[ajtmp[j]]; 1650 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j]; 1651 } 1652 /* ZeropivotApply(): shift the diagonal of the matrix */ 1653 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount; 1654 1655 /* elimination */ 1656 bjtmp = bj + bi[i]; 1657 row = *bjtmp++; /* pivot row */ 1658 nzL = bi[i+1] - bi[i]; 1659 for (k=0; k < nzL; k++) { 1660 pc1 = rtmp1 + row; 1661 pc2 = rtmp2 + row; 1662 pc3 = rtmp3 + row; 1663 pc4 = rtmp4 + row; 1664 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1665 pv = b->a + bdiag[row]; 1666 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv); 1667 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4; 1668 1669 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1670 pv = b->a + bdiag[row+1]+1; 1671 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1672 for (j=0; j<nz; j++) { 1673 col = pj[j]; 1674 rtmp1[col] -= mul1 * pv[j]; 1675 rtmp2[col] -= mul2 * pv[j]; 1676 rtmp3[col] -= mul3 * pv[j]; 1677 rtmp4[col] -= mul4 * pv[j]; 1678 } 1679 ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr); 1680 } 1681 row = *bjtmp++; 1682 } 1683 1684 /* finished row i; check zero pivot, then stick row i into b->a */ 1685 rs = 0.0; 1686 /* L part */ 1687 pc1 = b->a + bi[i]; 1688 pj = b->j + bi[i]; 1689 nz = bi[i+1] - bi[i]; 1690 for (j=0; j<nz; j++) { 1691 col = pj[j]; 1692 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1693 } 1694 /* U part */ 1695 pc1 = b->a + bdiag[i+1]+1; 1696 pj = b->j + bdiag[i+1]+1; 1697 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1698 for (j=0; j<nz; j++) { 1699 col = pj[j]; 1700 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1701 } 1702 1703 sctx.rs = rs; 1704 sctx.pv = rtmp1[i]; 1705 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr); 1706 if (sctx.newshift) break; 1707 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1708 *pc1 = 1.0/sctx.pv; 1709 1710 /* Now take care of 1st column of diagonal 4x4 block. */ 1711 pc2 = rtmp2 + i; 1712 pc3 = rtmp3 + i; 1713 pc4 = rtmp4 + i; 1714 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1715 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1716 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1717 mul4 = (*pc4)*(*pc1); *pc4 = mul4; 1718 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1719 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1720 for (j=0; j<nz; j++) { 1721 col = pj[j]; 1722 rtmp2[col] -= mul2 * rtmp1[col]; 1723 rtmp3[col] -= mul3 * rtmp1[col]; 1724 rtmp4[col] -= mul4 * rtmp1[col]; 1725 } 1726 ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr); 1727 } 1728 1729 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1730 rs = 0.0; 1731 /* L part */ 1732 pc2 = b->a + bi[i+1]; 1733 pj = b->j + bi[i+1]; 1734 nz = bi[i+2] - bi[i+1]; 1735 for (j=0; j<nz; j++) { 1736 col = pj[j]; 1737 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1738 } 1739 /* U part */ 1740 pc2 = b->a + bdiag[i+2]+1; 1741 pj = b->j + bdiag[i+2]+1; 1742 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1743 for (j=0; j<nz; j++) { 1744 col = pj[j]; 1745 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1746 } 1747 1748 sctx.rs = rs; 1749 sctx.pv = rtmp2[i+1]; 1750 ierr = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr); 1751 if (sctx.newshift) break; 1752 pc2 = b->a + bdiag[i+1]; 1753 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1754 1755 /* Now take care of 2nd column of diagonal 4x4 block. */ 1756 pc3 = rtmp3 + i+1; 1757 pc4 = rtmp4 + i+1; 1758 if (*pc3 != 0.0 || *pc4 != 0.0) { 1759 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1760 mul4 = (*pc4)*(*pc2); *pc4 = mul4; 1761 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1762 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1763 for (j=0; j<nz; j++) { 1764 col = pj[j]; 1765 rtmp3[col] -= mul3 * rtmp2[col]; 1766 rtmp4[col] -= mul4 * rtmp2[col]; 1767 } 1768 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1769 } 1770 1771 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1772 rs = 0.0; 1773 /* L part */ 1774 pc3 = b->a + bi[i+2]; 1775 pj = b->j + bi[i+2]; 1776 nz = bi[i+3] - bi[i+2]; 1777 for (j=0; j<nz; j++) { 1778 col = pj[j]; 1779 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1780 } 1781 /* U part */ 1782 pc3 = b->a + bdiag[i+3]+1; 1783 pj = b->j + bdiag[i+3]+1; 1784 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1785 for (j=0; j<nz; j++) { 1786 col = pj[j]; 1787 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1788 } 1789 1790 sctx.rs = rs; 1791 sctx.pv = rtmp3[i+2]; 1792 ierr = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr); 1793 if (sctx.newshift) break; 1794 pc3 = b->a + bdiag[i+2]; 1795 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1796 1797 /* Now take care of 3rd column of diagonal 4x4 block. */ 1798 pc4 = rtmp4 + i+2; 1799 if (*pc4 != 0.0) { 1800 mul4 = (*pc4)*(*pc3); *pc4 = mul4; 1801 pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */ 1802 nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */ 1803 for (j=0; j<nz; j++) { 1804 col = pj[j]; 1805 rtmp4[col] -= mul4 * rtmp3[col]; 1806 } 1807 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1808 } 1809 1810 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1811 rs = 0.0; 1812 /* L part */ 1813 pc4 = b->a + bi[i+3]; 1814 pj = b->j + bi[i+3]; 1815 nz = bi[i+4] - bi[i+3]; 1816 for (j=0; j<nz; j++) { 1817 col = pj[j]; 1818 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1819 } 1820 /* U part */ 1821 pc4 = b->a + bdiag[i+4]+1; 1822 pj = b->j + bdiag[i+4]+1; 1823 nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */ 1824 for (j=0; j<nz; j++) { 1825 col = pj[j]; 1826 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1827 } 1828 1829 sctx.rs = rs; 1830 sctx.pv = rtmp4[i+3]; 1831 ierr = MatPivotCheck(B,A,info,&sctx,i+3);CHKERRQ(ierr); 1832 if (sctx.newshift) break; 1833 pc4 = b->a + bdiag[i+3]; 1834 *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */ 1835 break; 1836 1837 default: 1838 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 1839 } 1840 if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 1841 i += nodesz; /* Update the row */ 1842 } 1843 1844 /* MatPivotRefine() */ 1845 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) { 1846 /* 1847 * if no shift in this attempt & shifting & started shifting & can refine, 1848 * then try lower shift 1849 */ 1850 sctx.shift_hi = sctx.shift_fraction; 1851 sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.; 1852 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1853 sctx.newshift = PETSC_TRUE; 1854 sctx.nshift++; 1855 } 1856 } while (sctx.newshift); 1857 1858 ierr = PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);CHKERRQ(ierr); 1859 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1860 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1861 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1862 1863 if (b->inode.size) { 1864 C->ops->solve = MatSolve_SeqAIJ_Inode; 1865 } else { 1866 C->ops->solve = MatSolve_SeqAIJ; 1867 } 1868 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1869 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1870 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1871 C->ops->matsolve = MatMatSolve_SeqAIJ; 1872 C->assembled = PETSC_TRUE; 1873 C->preallocated = PETSC_TRUE; 1874 1875 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 1876 1877 /* MatShiftView(A,info,&sctx) */ 1878 if (sctx.nshift) { 1879 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) { 1880 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr); 1881 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1882 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr); 1883 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 1884 ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);CHKERRQ(ierr); 1885 } 1886 } 1887 PetscFunctionReturn(0); 1888 } 1889 1890 #undef __FUNCT__ 1891 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace" 1892 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info) 1893 { 1894 Mat C = B; 1895 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1896 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1897 PetscErrorCode ierr; 1898 const PetscInt *r,*ic,*c,*ics; 1899 PetscInt n = A->rmap->n,*bi = b->i; 1900 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1901 PetscInt i,j,idx,*bd = b->diag,node_max,nodesz; 1902 PetscInt *ai = a->i,*aj = a->j; 1903 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1904 PetscScalar mul1,mul2,mul3,tmp; 1905 MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; 1906 const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; 1907 PetscReal rs=0.0; 1908 FactorShiftCtx sctx; 1909 1910 PetscFunctionBegin; 1911 sctx.shift_top = 0; 1912 sctx.nshift_max = 0; 1913 sctx.shift_lo = 0; 1914 sctx.shift_hi = 0; 1915 sctx.shift_fraction = 0; 1916 1917 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1918 if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1919 sctx.shift_top = 0; 1920 for (i=0; i<n; i++) { 1921 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1922 rs = 0.0; 1923 ajtmp = aj + ai[i]; 1924 rtmp1 = aa + ai[i]; 1925 nz = ai[i+1] - ai[i]; 1926 for (j=0; j<nz; j++) { 1927 if (*ajtmp != i) { 1928 rs += PetscAbsScalar(*rtmp1++); 1929 } else { 1930 rs -= PetscRealPart(*rtmp1++); 1931 } 1932 ajtmp++; 1933 } 1934 if (rs>sctx.shift_top) sctx.shift_top = rs; 1935 } 1936 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1937 sctx.shift_top *= 1.1; 1938 sctx.nshift_max = 5; 1939 sctx.shift_lo = 0.; 1940 sctx.shift_hi = 1.; 1941 } 1942 sctx.shift_amount = 0; 1943 sctx.nshift = 0; 1944 1945 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1946 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1947 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1948 ierr = PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);CHKERRQ(ierr); 1949 ics = ic; 1950 1951 node_max = a->inode.node_count; 1952 ns = a->inode.size; 1953 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1954 1955 /* If max inode size > 3, split it into two inodes.*/ 1956 /* also map the inode sizes according to the ordering */ 1957 ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr); 1958 for (i=0,j=0; i<node_max; ++i,++j) { 1959 if (ns[i]>3) { 1960 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1961 ++j; 1962 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1963 } else { 1964 tmp_vec1[j] = ns[i]; 1965 } 1966 } 1967 /* Use the correct node_max */ 1968 node_max = j; 1969 1970 /* Now reorder the inode info based on mat re-ordering info */ 1971 /* First create a row -> inode_size_array_index map */ 1972 ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr); 1973 ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr); 1974 for (i=0,row=0; i<node_max; i++) { 1975 nodesz = tmp_vec1[i]; 1976 for (j=0; j<nodesz; j++,row++) { 1977 nsmap[row] = i; 1978 } 1979 } 1980 /* Using nsmap, create a reordered ns structure */ 1981 for (i=0,j=0; i< node_max; i++) { 1982 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1983 tmp_vec2[i] = nodesz; 1984 j += nodesz; 1985 } 1986 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1987 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1988 /* Now use the correct ns */ 1989 ns = tmp_vec2; 1990 1991 do { 1992 sctx.newshift = PETSC_FALSE; 1993 /* Now loop over each block-row, and do the factorization */ 1994 for (i=0,row=0; i<node_max; i++) { 1995 nodesz = ns[i]; 1996 nz = bi[row+1] - bi[row]; 1997 bjtmp = bj + bi[row]; 1998 1999 switch (nodesz) { 2000 case 1: 2001 for (j=0; j<nz; j++) { 2002 idx = bjtmp[j]; 2003 rtmp11[idx] = 0.0; 2004 } 2005 2006 /* load in initial (unfactored row) */ 2007 idx = r[row]; 2008 nz_tmp = ai[idx+1] - ai[idx]; 2009 ajtmp = aj + ai[idx]; 2010 v1 = aa + ai[idx]; 2011 2012 for (j=0; j<nz_tmp; j++) { 2013 idx = ics[ajtmp[j]]; 2014 rtmp11[idx] = v1[j]; 2015 } 2016 rtmp11[ics[r[row]]] += sctx.shift_amount; 2017 2018 prow = *bjtmp++; 2019 while (prow < row) { 2020 pc1 = rtmp11 + prow; 2021 if (*pc1 != 0.0) { 2022 pv = ba + bd[prow]; 2023 pj = nbj + bd[prow]; 2024 mul1 = *pc1 * *pv++; 2025 *pc1 = mul1; 2026 nz_tmp = bi[prow+1] - bd[prow] - 1; 2027 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2028 for (j=0; j<nz_tmp; j++) { 2029 tmp = pv[j]; 2030 idx = pj[j]; 2031 rtmp11[idx] -= mul1 * tmp; 2032 } 2033 } 2034 prow = *bjtmp++; 2035 } 2036 pj = bj + bi[row]; 2037 pc1 = ba + bi[row]; 2038 2039 sctx.pv = rtmp11[row]; 2040 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 2041 rs = 0.0; 2042 for (j=0; j<nz; j++) { 2043 idx = pj[j]; 2044 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2045 if (idx != row) rs += PetscAbsScalar(pc1[j]); 2046 } 2047 sctx.rs = rs; 2048 ierr = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr); 2049 if (sctx.newshift) goto endofwhile; 2050 break; 2051 2052 case 2: 2053 for (j=0; j<nz; j++) { 2054 idx = bjtmp[j]; 2055 rtmp11[idx] = 0.0; 2056 rtmp22[idx] = 0.0; 2057 } 2058 2059 /* load in initial (unfactored row) */ 2060 idx = r[row]; 2061 nz_tmp = ai[idx+1] - ai[idx]; 2062 ajtmp = aj + ai[idx]; 2063 v1 = aa + ai[idx]; 2064 v2 = aa + ai[idx+1]; 2065 for (j=0; j<nz_tmp; j++) { 2066 idx = ics[ajtmp[j]]; 2067 rtmp11[idx] = v1[j]; 2068 rtmp22[idx] = v2[j]; 2069 } 2070 rtmp11[ics[r[row]]] += sctx.shift_amount; 2071 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2072 2073 prow = *bjtmp++; 2074 while (prow < row) { 2075 pc1 = rtmp11 + prow; 2076 pc2 = rtmp22 + prow; 2077 if (*pc1 != 0.0 || *pc2 != 0.0) { 2078 pv = ba + bd[prow]; 2079 pj = nbj + bd[prow]; 2080 mul1 = *pc1 * *pv; 2081 mul2 = *pc2 * *pv; 2082 ++pv; 2083 *pc1 = mul1; 2084 *pc2 = mul2; 2085 2086 nz_tmp = bi[prow+1] - bd[prow] - 1; 2087 for (j=0; j<nz_tmp; j++) { 2088 tmp = pv[j]; 2089 idx = pj[j]; 2090 rtmp11[idx] -= mul1 * tmp; 2091 rtmp22[idx] -= mul2 * tmp; 2092 } 2093 ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr); 2094 } 2095 prow = *bjtmp++; 2096 } 2097 2098 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2099 pc1 = rtmp11 + prow; 2100 pc2 = rtmp22 + prow; 2101 2102 sctx.pv = *pc1; 2103 pj = bj + bi[prow]; 2104 rs = 0.0; 2105 for (j=0; j<nz; j++) { 2106 idx = pj[j]; 2107 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2108 } 2109 sctx.rs = rs; 2110 ierr = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr); 2111 if (sctx.newshift) goto endofwhile; 2112 2113 if (*pc2 != 0.0) { 2114 pj = nbj + bd[prow]; 2115 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 2116 *pc2 = mul2; 2117 nz_tmp = bi[prow+1] - bd[prow] - 1; 2118 for (j=0; j<nz_tmp; j++) { 2119 idx = pj[j]; 2120 tmp = rtmp11[idx]; 2121 rtmp22[idx] -= mul2 * tmp; 2122 } 2123 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2124 } 2125 2126 pj = bj + bi[row]; 2127 pc1 = ba + bi[row]; 2128 pc2 = ba + bi[row+1]; 2129 2130 sctx.pv = rtmp22[row+1]; 2131 rs = 0.0; 2132 rtmp11[row] = 1.0/rtmp11[row]; 2133 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2134 /* copy row entries from dense representation to sparse */ 2135 for (j=0; j<nz; j++) { 2136 idx = pj[j]; 2137 pc1[j] = rtmp11[idx]; 2138 pc2[j] = rtmp22[idx]; 2139 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 2140 } 2141 sctx.rs = rs; 2142 ierr = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr); 2143 if (sctx.newshift) goto endofwhile; 2144 break; 2145 2146 case 3: 2147 for (j=0; j<nz; j++) { 2148 idx = bjtmp[j]; 2149 rtmp11[idx] = 0.0; 2150 rtmp22[idx] = 0.0; 2151 rtmp33[idx] = 0.0; 2152 } 2153 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2154 idx = r[row]; 2155 nz_tmp = ai[idx+1] - ai[idx]; 2156 ajtmp = aj + ai[idx]; 2157 v1 = aa + ai[idx]; 2158 v2 = aa + ai[idx+1]; 2159 v3 = aa + ai[idx+2]; 2160 for (j=0; j<nz_tmp; j++) { 2161 idx = ics[ajtmp[j]]; 2162 rtmp11[idx] = v1[j]; 2163 rtmp22[idx] = v2[j]; 2164 rtmp33[idx] = v3[j]; 2165 } 2166 rtmp11[ics[r[row]]] += sctx.shift_amount; 2167 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2168 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 2169 2170 /* loop over all pivot row blocks above this row block */ 2171 prow = *bjtmp++; 2172 while (prow < row) { 2173 pc1 = rtmp11 + prow; 2174 pc2 = rtmp22 + prow; 2175 pc3 = rtmp33 + prow; 2176 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) { 2177 pv = ba + bd[prow]; 2178 pj = nbj + bd[prow]; 2179 mul1 = *pc1 * *pv; 2180 mul2 = *pc2 * *pv; 2181 mul3 = *pc3 * *pv; 2182 ++pv; 2183 *pc1 = mul1; 2184 *pc2 = mul2; 2185 *pc3 = mul3; 2186 2187 nz_tmp = bi[prow+1] - bd[prow] - 1; 2188 /* update this row based on pivot row */ 2189 for (j=0; j<nz_tmp; j++) { 2190 tmp = pv[j]; 2191 idx = pj[j]; 2192 rtmp11[idx] -= mul1 * tmp; 2193 rtmp22[idx] -= mul2 * tmp; 2194 rtmp33[idx] -= mul3 * tmp; 2195 } 2196 ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr); 2197 } 2198 prow = *bjtmp++; 2199 } 2200 2201 /* Now take care of diagonal 3x3 block in this set of rows */ 2202 /* note: prow = row here */ 2203 pc1 = rtmp11 + prow; 2204 pc2 = rtmp22 + prow; 2205 pc3 = rtmp33 + prow; 2206 2207 sctx.pv = *pc1; 2208 pj = bj + bi[prow]; 2209 rs = 0.0; 2210 for (j=0; j<nz; j++) { 2211 idx = pj[j]; 2212 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2213 } 2214 sctx.rs = rs; 2215 ierr = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr); 2216 if (sctx.newshift) goto endofwhile; 2217 2218 if (*pc2 != 0.0 || *pc3 != 0.0) { 2219 mul2 = (*pc2)/(*pc1); 2220 mul3 = (*pc3)/(*pc1); 2221 *pc2 = mul2; 2222 *pc3 = mul3; 2223 nz_tmp = bi[prow+1] - bd[prow] - 1; 2224 pj = nbj + bd[prow]; 2225 for (j=0; j<nz_tmp; j++) { 2226 idx = pj[j]; 2227 tmp = rtmp11[idx]; 2228 rtmp22[idx] -= mul2 * tmp; 2229 rtmp33[idx] -= mul3 * tmp; 2230 } 2231 ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr); 2232 } 2233 ++prow; 2234 2235 pc2 = rtmp22 + prow; 2236 pc3 = rtmp33 + prow; 2237 sctx.pv = *pc2; 2238 pj = bj + bi[prow]; 2239 rs = 0.0; 2240 for (j=0; j<nz; j++) { 2241 idx = pj[j]; 2242 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2243 } 2244 sctx.rs = rs; 2245 ierr = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr); 2246 if (sctx.newshift) goto endofwhile; 2247 2248 if (*pc3 != 0.0) { 2249 mul3 = (*pc3)/(*pc2); 2250 *pc3 = mul3; 2251 pj = nbj + bd[prow]; 2252 nz_tmp = bi[prow+1] - bd[prow] - 1; 2253 for (j=0; j<nz_tmp; j++) { 2254 idx = pj[j]; 2255 tmp = rtmp22[idx]; 2256 rtmp33[idx] -= mul3 * tmp; 2257 } 2258 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2259 } 2260 2261 pj = bj + bi[row]; 2262 pc1 = ba + bi[row]; 2263 pc2 = ba + bi[row+1]; 2264 pc3 = ba + bi[row+2]; 2265 2266 sctx.pv = rtmp33[row+2]; 2267 rs = 0.0; 2268 rtmp11[row] = 1.0/rtmp11[row]; 2269 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2270 rtmp33[row+2] = 1.0/rtmp33[row+2]; 2271 /* copy row entries from dense representation to sparse */ 2272 for (j=0; j<nz; j++) { 2273 idx = pj[j]; 2274 pc1[j] = rtmp11[idx]; 2275 pc2[j] = rtmp22[idx]; 2276 pc3[j] = rtmp33[idx]; 2277 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 2278 } 2279 2280 sctx.rs = rs; 2281 ierr = MatPivotCheck(B,A,info,&sctx,row+2);CHKERRQ(ierr); 2282 if (sctx.newshift) goto endofwhile; 2283 break; 2284 2285 default: 2286 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 2287 } 2288 row += nodesz; /* Update the row */ 2289 } 2290 endofwhile:; 2291 } while (sctx.newshift); 2292 ierr = PetscFree3(rtmp11,rtmp22,rtmp33);CHKERRQ(ierr); 2293 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 2294 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2295 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2296 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 2297 2298 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2299 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2300 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2301 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2302 C->assembled = PETSC_TRUE; 2303 C->preallocated = PETSC_TRUE; 2304 if (sctx.nshift) { 2305 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2306 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr); 2307 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2308 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr); 2309 } 2310 } 2311 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 2312 ierr = MatSeqAIJCheckInode(C);CHKERRQ(ierr); 2313 PetscFunctionReturn(0); 2314 } 2315 2316 2317 /* ----------------------------------------------------------- */ 2318 #undef __FUNCT__ 2319 #define __FUNCT__ "MatSolve_SeqAIJ_Inode" 2320 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 2321 { 2322 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2323 IS iscol = a->col,isrow = a->row; 2324 PetscErrorCode ierr; 2325 const PetscInt *r,*c,*rout,*cout; 2326 PetscInt i,j,n = A->rmap->n; 2327 PetscInt node_max,row,nsz,aii,i0,i1,nz; 2328 const PetscInt *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj; 2329 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 2330 PetscScalar sum1,sum2,sum3,sum4,sum5; 2331 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 2332 const PetscScalar *b; 2333 2334 PetscFunctionBegin; 2335 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 2336 node_max = a->inode.node_count; 2337 ns = a->inode.size; /* Node Size array */ 2338 2339 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2340 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2341 tmp = a->solve_work; 2342 2343 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2344 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2345 2346 /* forward solve the lower triangular */ 2347 tmps = tmp; 2348 aa = a_a; 2349 aj = a_j; 2350 ad = a->diag; 2351 2352 for (i = 0,row = 0; i< node_max; ++i) { 2353 nsz = ns[i]; 2354 aii = ai[row]; 2355 v1 = aa + aii; 2356 vi = aj + aii; 2357 nz = ai[row+1]- ai[row]; 2358 2359 if (i < node_max-1) { 2360 /* Prefetch the indices for the next block */ 2361 PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 2362 /* Prefetch the data for the next block */ 2363 PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 2364 } 2365 2366 switch (nsz) { /* Each loop in 'case' is unrolled */ 2367 case 1: 2368 sum1 = b[r[row]]; 2369 for (j=0; j<nz-1; j+=2) { 2370 i0 = vi[j]; 2371 i1 = vi[j+1]; 2372 tmp0 = tmps[i0]; 2373 tmp1 = tmps[i1]; 2374 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1; 2375 } 2376 if (j == nz-1) { 2377 tmp0 = tmps[vi[j]]; 2378 sum1 -= v1[j]*tmp0; 2379 } 2380 tmp[row++]=sum1; 2381 break; 2382 case 2: 2383 sum1 = b[r[row]]; 2384 sum2 = b[r[row+1]]; 2385 v2 = aa + ai[row+1]; 2386 2387 for (j=0; j<nz-1; j+=2) { 2388 i0 = vi[j]; 2389 i1 = vi[j+1]; 2390 tmp0 = tmps[i0]; 2391 tmp1 = tmps[i1]; 2392 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2393 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2394 } 2395 if (j == nz-1) { 2396 tmp0 = tmps[vi[j]]; 2397 sum1 -= v1[j] *tmp0; 2398 sum2 -= v2[j] *tmp0; 2399 } 2400 sum2 -= v2[nz] * sum1; 2401 tmp[row++]=sum1; 2402 tmp[row++]=sum2; 2403 break; 2404 case 3: 2405 sum1 = b[r[row]]; 2406 sum2 = b[r[row+1]]; 2407 sum3 = b[r[row+2]]; 2408 v2 = aa + ai[row+1]; 2409 v3 = aa + ai[row+2]; 2410 2411 for (j=0; j<nz-1; j+=2) { 2412 i0 = vi[j]; 2413 i1 = vi[j+1]; 2414 tmp0 = tmps[i0]; 2415 tmp1 = tmps[i1]; 2416 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2417 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2418 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2419 } 2420 if (j == nz-1) { 2421 tmp0 = tmps[vi[j]]; 2422 sum1 -= v1[j] *tmp0; 2423 sum2 -= v2[j] *tmp0; 2424 sum3 -= v3[j] *tmp0; 2425 } 2426 sum2 -= v2[nz] * sum1; 2427 sum3 -= v3[nz] * sum1; 2428 sum3 -= v3[nz+1] * sum2; 2429 tmp[row++]=sum1; 2430 tmp[row++]=sum2; 2431 tmp[row++]=sum3; 2432 break; 2433 2434 case 4: 2435 sum1 = b[r[row]]; 2436 sum2 = b[r[row+1]]; 2437 sum3 = b[r[row+2]]; 2438 sum4 = b[r[row+3]]; 2439 v2 = aa + ai[row+1]; 2440 v3 = aa + ai[row+2]; 2441 v4 = aa + ai[row+3]; 2442 2443 for (j=0; j<nz-1; j+=2) { 2444 i0 = vi[j]; 2445 i1 = vi[j+1]; 2446 tmp0 = tmps[i0]; 2447 tmp1 = tmps[i1]; 2448 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2449 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2450 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2451 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2452 } 2453 if (j == nz-1) { 2454 tmp0 = tmps[vi[j]]; 2455 sum1 -= v1[j] *tmp0; 2456 sum2 -= v2[j] *tmp0; 2457 sum3 -= v3[j] *tmp0; 2458 sum4 -= v4[j] *tmp0; 2459 } 2460 sum2 -= v2[nz] * sum1; 2461 sum3 -= v3[nz] * sum1; 2462 sum4 -= v4[nz] * sum1; 2463 sum3 -= v3[nz+1] * sum2; 2464 sum4 -= v4[nz+1] * sum2; 2465 sum4 -= v4[nz+2] * sum3; 2466 2467 tmp[row++]=sum1; 2468 tmp[row++]=sum2; 2469 tmp[row++]=sum3; 2470 tmp[row++]=sum4; 2471 break; 2472 case 5: 2473 sum1 = b[r[row]]; 2474 sum2 = b[r[row+1]]; 2475 sum3 = b[r[row+2]]; 2476 sum4 = b[r[row+3]]; 2477 sum5 = b[r[row+4]]; 2478 v2 = aa + ai[row+1]; 2479 v3 = aa + ai[row+2]; 2480 v4 = aa + ai[row+3]; 2481 v5 = aa + ai[row+4]; 2482 2483 for (j=0; j<nz-1; j+=2) { 2484 i0 = vi[j]; 2485 i1 = vi[j+1]; 2486 tmp0 = tmps[i0]; 2487 tmp1 = tmps[i1]; 2488 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2489 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2490 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2491 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2492 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1; 2493 } 2494 if (j == nz-1) { 2495 tmp0 = tmps[vi[j]]; 2496 sum1 -= v1[j] *tmp0; 2497 sum2 -= v2[j] *tmp0; 2498 sum3 -= v3[j] *tmp0; 2499 sum4 -= v4[j] *tmp0; 2500 sum5 -= v5[j] *tmp0; 2501 } 2502 2503 sum2 -= v2[nz] * sum1; 2504 sum3 -= v3[nz] * sum1; 2505 sum4 -= v4[nz] * sum1; 2506 sum5 -= v5[nz] * sum1; 2507 sum3 -= v3[nz+1] * sum2; 2508 sum4 -= v4[nz+1] * sum2; 2509 sum5 -= v5[nz+1] * sum2; 2510 sum4 -= v4[nz+2] * sum3; 2511 sum5 -= v5[nz+2] * sum3; 2512 sum5 -= v5[nz+3] * sum4; 2513 2514 tmp[row++]=sum1; 2515 tmp[row++]=sum2; 2516 tmp[row++]=sum3; 2517 tmp[row++]=sum4; 2518 tmp[row++]=sum5; 2519 break; 2520 default: 2521 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2522 } 2523 } 2524 /* backward solve the upper triangular */ 2525 for (i=node_max -1,row = n-1; i>=0; i--) { 2526 nsz = ns[i]; 2527 aii = ad[row+1] + 1; 2528 v1 = aa + aii; 2529 vi = aj + aii; 2530 nz = ad[row]- ad[row+1] - 1; 2531 2532 if (i > 0) { 2533 /* Prefetch the indices for the next block */ 2534 PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2535 /* Prefetch the data for the next block */ 2536 PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2537 } 2538 2539 switch (nsz) { /* Each loop in 'case' is unrolled */ 2540 case 1: 2541 sum1 = tmp[row]; 2542 2543 for (j=0; j<nz-1; j+=2) { 2544 i0 = vi[j]; 2545 i1 = vi[j+1]; 2546 tmp0 = tmps[i0]; 2547 tmp1 = tmps[i1]; 2548 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2549 } 2550 if (j == nz-1) { 2551 tmp0 = tmps[vi[j]]; 2552 sum1 -= v1[j]*tmp0; 2553 } 2554 x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2555 break; 2556 case 2: 2557 sum1 = tmp[row]; 2558 sum2 = tmp[row-1]; 2559 v2 = aa + ad[row] + 1; 2560 for (j=0; j<nz-1; j+=2) { 2561 i0 = vi[j]; 2562 i1 = vi[j+1]; 2563 tmp0 = tmps[i0]; 2564 tmp1 = tmps[i1]; 2565 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2566 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2567 } 2568 if (j == nz-1) { 2569 tmp0 = tmps[vi[j]]; 2570 sum1 -= v1[j]* tmp0; 2571 sum2 -= v2[j+1]* tmp0; 2572 } 2573 2574 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2575 sum2 -= v2[0] * tmp0; 2576 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2577 break; 2578 case 3: 2579 sum1 = tmp[row]; 2580 sum2 = tmp[row -1]; 2581 sum3 = tmp[row -2]; 2582 v2 = aa + ad[row] + 1; 2583 v3 = aa + ad[row -1] + 1; 2584 for (j=0; j<nz-1; j+=2) { 2585 i0 = vi[j]; 2586 i1 = vi[j+1]; 2587 tmp0 = tmps[i0]; 2588 tmp1 = tmps[i1]; 2589 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2590 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2591 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2592 } 2593 if (j== nz-1) { 2594 tmp0 = tmps[vi[j]]; 2595 sum1 -= v1[j] * tmp0; 2596 sum2 -= v2[j+1] * tmp0; 2597 sum3 -= v3[j+2] * tmp0; 2598 } 2599 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2600 sum2 -= v2[0]* tmp0; 2601 sum3 -= v3[1] * tmp0; 2602 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2603 sum3 -= v3[0]* tmp0; 2604 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2605 2606 break; 2607 case 4: 2608 sum1 = tmp[row]; 2609 sum2 = tmp[row -1]; 2610 sum3 = tmp[row -2]; 2611 sum4 = tmp[row -3]; 2612 v2 = aa + ad[row]+1; 2613 v3 = aa + ad[row -1]+1; 2614 v4 = aa + ad[row -2]+1; 2615 2616 for (j=0; j<nz-1; j+=2) { 2617 i0 = vi[j]; 2618 i1 = vi[j+1]; 2619 tmp0 = tmps[i0]; 2620 tmp1 = tmps[i1]; 2621 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2622 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2623 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2624 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2625 } 2626 if (j== nz-1) { 2627 tmp0 = tmps[vi[j]]; 2628 sum1 -= v1[j] * tmp0; 2629 sum2 -= v2[j+1] * tmp0; 2630 sum3 -= v3[j+2] * tmp0; 2631 sum4 -= v4[j+3] * tmp0; 2632 } 2633 2634 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2635 sum2 -= v2[0] * tmp0; 2636 sum3 -= v3[1] * tmp0; 2637 sum4 -= v4[2] * tmp0; 2638 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2639 sum3 -= v3[0] * tmp0; 2640 sum4 -= v4[1] * tmp0; 2641 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2642 sum4 -= v4[0] * tmp0; 2643 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2644 break; 2645 case 5: 2646 sum1 = tmp[row]; 2647 sum2 = tmp[row -1]; 2648 sum3 = tmp[row -2]; 2649 sum4 = tmp[row -3]; 2650 sum5 = tmp[row -4]; 2651 v2 = aa + ad[row]+1; 2652 v3 = aa + ad[row -1]+1; 2653 v4 = aa + ad[row -2]+1; 2654 v5 = aa + ad[row -3]+1; 2655 for (j=0; j<nz-1; j+=2) { 2656 i0 = vi[j]; 2657 i1 = vi[j+1]; 2658 tmp0 = tmps[i0]; 2659 tmp1 = tmps[i1]; 2660 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2661 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2662 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2663 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2664 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1; 2665 } 2666 if (j==nz-1) { 2667 tmp0 = tmps[vi[j]]; 2668 sum1 -= v1[j] * tmp0; 2669 sum2 -= v2[j+1] * tmp0; 2670 sum3 -= v3[j+2] * tmp0; 2671 sum4 -= v4[j+3] * tmp0; 2672 sum5 -= v5[j+4] * tmp0; 2673 } 2674 2675 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2676 sum2 -= v2[0] * tmp0; 2677 sum3 -= v3[1] * tmp0; 2678 sum4 -= v4[2] * tmp0; 2679 sum5 -= v5[3] * tmp0; 2680 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2681 sum3 -= v3[0] * tmp0; 2682 sum4 -= v4[1] * tmp0; 2683 sum5 -= v5[2] * tmp0; 2684 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2685 sum4 -= v4[0] * tmp0; 2686 sum5 -= v5[1] * tmp0; 2687 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2688 sum5 -= v5[0] * tmp0; 2689 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--; 2690 break; 2691 default: 2692 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2693 } 2694 } 2695 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2696 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2697 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2698 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2699 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 2700 PetscFunctionReturn(0); 2701 } 2702 2703 2704 /* 2705 Makes a longer coloring[] array and calls the usual code with that 2706 */ 2707 #undef __FUNCT__ 2708 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode" 2709 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 2710 { 2711 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 2712 PetscErrorCode ierr; 2713 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 2714 PetscInt *colorused,i; 2715 ISColoringValue *newcolor; 2716 2717 PetscFunctionBegin; 2718 ierr = PetscMalloc1(n+1,&newcolor);CHKERRQ(ierr); 2719 /* loop over inodes, marking a color for each column*/ 2720 row = 0; 2721 for (i=0; i<m; i++) { 2722 for (j=0; j<ns[i]; j++) { 2723 newcolor[row++] = coloring[i] + j*ncolors; 2724 } 2725 } 2726 2727 /* eliminate unneeded colors */ 2728 ierr = PetscCalloc1(5*ncolors,&colorused);CHKERRQ(ierr); 2729 for (i=0; i<n; i++) { 2730 colorused[newcolor[i]] = 1; 2731 } 2732 2733 for (i=1; i<5*ncolors; i++) { 2734 colorused[i] += colorused[i-1]; 2735 } 2736 ncolors = colorused[5*ncolors-1]; 2737 for (i=0; i<n; i++) { 2738 newcolor[i] = colorused[newcolor[i]]-1; 2739 } 2740 ierr = PetscFree(colorused);CHKERRQ(ierr); 2741 ierr = ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);CHKERRQ(ierr); 2742 ierr = PetscFree(coloring);CHKERRQ(ierr); 2743 PetscFunctionReturn(0); 2744 } 2745 2746 #include <petsc/private/kernels/blockinvert.h> 2747 2748 #undef __FUNCT__ 2749 #define __FUNCT__ "MatSOR_SeqAIJ_Inode" 2750 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 2751 { 2752 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2753 PetscScalar sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3; 2754 MatScalar *ibdiag,*bdiag,work[25],*t; 2755 PetscScalar *x,tmp4,tmp5,x1,x2,x3,x4,x5; 2756 const MatScalar *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL; 2757 const PetscScalar *xb, *b; 2758 PetscReal zeropivot = 1.0e-15, shift = 0.0; 2759 PetscErrorCode ierr; 2760 PetscInt n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2; 2761 PetscInt sz,k,ipvt[5]; 2762 PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE; 2763 const PetscInt *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i; 2764 2765 PetscFunctionBegin; 2766 allowzeropivot = PetscNot(A->erroriffailure); 2767 if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 2768 if (fshift == -1.0) fshift = 0.0; /* negative fshift indicates do not error on zero diagonal; this code never errors on zero diagonal */ 2769 if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 2770 2771 if (!a->inode.ibdiagvalid) { 2772 if (!a->inode.ibdiag) { 2773 /* calculate space needed for diagonal blocks */ 2774 for (i=0; i<m; i++) { 2775 cnt += sizes[i]*sizes[i]; 2776 } 2777 a->inode.bdiagsize = cnt; 2778 2779 ierr = PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);CHKERRQ(ierr); 2780 } 2781 2782 /* copy over the diagonal blocks and invert them */ 2783 ibdiag = a->inode.ibdiag; 2784 bdiag = a->inode.bdiag; 2785 cnt = 0; 2786 for (i=0, row = 0; i<m; i++) { 2787 for (j=0; j<sizes[i]; j++) { 2788 for (k=0; k<sizes[i]; k++) { 2789 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 2790 } 2791 } 2792 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 2793 2794 switch (sizes[i]) { 2795 case 1: 2796 /* Create matrix data structure */ 2797 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 2798 ibdiag[cnt] = 1.0/ibdiag[cnt]; 2799 break; 2800 case 2: 2801 ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 2802 break; 2803 case 3: 2804 ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 2805 break; 2806 case 4: 2807 ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 2808 break; 2809 case 5: 2810 ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 2811 break; 2812 default: 2813 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2814 } 2815 if (zeropivotdetected) { 2816 A->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2817 } 2818 2819 cnt += sizes[i]*sizes[i]; 2820 row += sizes[i]; 2821 } 2822 a->inode.ibdiagvalid = PETSC_TRUE; 2823 } 2824 ibdiag = a->inode.ibdiag; 2825 bdiag = a->inode.bdiag; 2826 t = a->inode.ssor_work; 2827 2828 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2829 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2830 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2831 if (flag & SOR_ZERO_INITIAL_GUESS) { 2832 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 2833 2834 for (i=0, row=0; i<m; i++) { 2835 sz = diag[row] - ii[row]; 2836 v1 = a->a + ii[row]; 2837 idx = a->j + ii[row]; 2838 2839 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2840 switch (sizes[i]) { 2841 case 1: 2842 2843 sum1 = b[row]; 2844 for (n = 0; n<sz-1; n+=2) { 2845 i1 = idx[0]; 2846 i2 = idx[1]; 2847 idx += 2; 2848 tmp0 = x[i1]; 2849 tmp1 = x[i2]; 2850 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2851 } 2852 2853 if (n == sz-1) { 2854 tmp0 = x[*idx]; 2855 sum1 -= *v1 * tmp0; 2856 } 2857 t[row] = sum1; 2858 x[row++] = sum1*(*ibdiag++); 2859 break; 2860 case 2: 2861 v2 = a->a + ii[row+1]; 2862 sum1 = b[row]; 2863 sum2 = b[row+1]; 2864 for (n = 0; n<sz-1; n+=2) { 2865 i1 = idx[0]; 2866 i2 = idx[1]; 2867 idx += 2; 2868 tmp0 = x[i1]; 2869 tmp1 = x[i2]; 2870 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2871 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2872 } 2873 2874 if (n == sz-1) { 2875 tmp0 = x[*idx]; 2876 sum1 -= v1[0] * tmp0; 2877 sum2 -= v2[0] * tmp0; 2878 } 2879 t[row] = sum1; 2880 t[row+1] = sum2; 2881 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2882 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2883 ibdiag += 4; 2884 break; 2885 case 3: 2886 v2 = a->a + ii[row+1]; 2887 v3 = a->a + ii[row+2]; 2888 sum1 = b[row]; 2889 sum2 = b[row+1]; 2890 sum3 = b[row+2]; 2891 for (n = 0; n<sz-1; n+=2) { 2892 i1 = idx[0]; 2893 i2 = idx[1]; 2894 idx += 2; 2895 tmp0 = x[i1]; 2896 tmp1 = x[i2]; 2897 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2898 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2899 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2900 } 2901 2902 if (n == sz-1) { 2903 tmp0 = x[*idx]; 2904 sum1 -= v1[0] * tmp0; 2905 sum2 -= v2[0] * tmp0; 2906 sum3 -= v3[0] * tmp0; 2907 } 2908 t[row] = sum1; 2909 t[row+1] = sum2; 2910 t[row+2] = sum3; 2911 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2912 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2913 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2914 ibdiag += 9; 2915 break; 2916 case 4: 2917 v2 = a->a + ii[row+1]; 2918 v3 = a->a + ii[row+2]; 2919 v4 = a->a + ii[row+3]; 2920 sum1 = b[row]; 2921 sum2 = b[row+1]; 2922 sum3 = b[row+2]; 2923 sum4 = b[row+3]; 2924 for (n = 0; n<sz-1; n+=2) { 2925 i1 = idx[0]; 2926 i2 = idx[1]; 2927 idx += 2; 2928 tmp0 = x[i1]; 2929 tmp1 = x[i2]; 2930 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2931 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2932 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2933 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2934 } 2935 2936 if (n == sz-1) { 2937 tmp0 = x[*idx]; 2938 sum1 -= v1[0] * tmp0; 2939 sum2 -= v2[0] * tmp0; 2940 sum3 -= v3[0] * tmp0; 2941 sum4 -= v4[0] * tmp0; 2942 } 2943 t[row] = sum1; 2944 t[row+1] = sum2; 2945 t[row+2] = sum3; 2946 t[row+3] = sum4; 2947 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2948 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2949 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2950 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2951 ibdiag += 16; 2952 break; 2953 case 5: 2954 v2 = a->a + ii[row+1]; 2955 v3 = a->a + ii[row+2]; 2956 v4 = a->a + ii[row+3]; 2957 v5 = a->a + ii[row+4]; 2958 sum1 = b[row]; 2959 sum2 = b[row+1]; 2960 sum3 = b[row+2]; 2961 sum4 = b[row+3]; 2962 sum5 = b[row+4]; 2963 for (n = 0; n<sz-1; n+=2) { 2964 i1 = idx[0]; 2965 i2 = idx[1]; 2966 idx += 2; 2967 tmp0 = x[i1]; 2968 tmp1 = x[i2]; 2969 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2970 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2971 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2972 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2973 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2974 } 2975 2976 if (n == sz-1) { 2977 tmp0 = x[*idx]; 2978 sum1 -= v1[0] * tmp0; 2979 sum2 -= v2[0] * tmp0; 2980 sum3 -= v3[0] * tmp0; 2981 sum4 -= v4[0] * tmp0; 2982 sum5 -= v5[0] * tmp0; 2983 } 2984 t[row] = sum1; 2985 t[row+1] = sum2; 2986 t[row+2] = sum3; 2987 t[row+3] = sum4; 2988 t[row+4] = sum5; 2989 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2990 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2991 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2992 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2993 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2994 ibdiag += 25; 2995 break; 2996 default: 2997 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2998 } 2999 } 3000 3001 xb = t; 3002 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3003 } else xb = b; 3004 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3005 3006 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3007 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3008 ibdiag -= sizes[i]*sizes[i]; 3009 sz = ii[row+1] - diag[row] - 1; 3010 v1 = a->a + diag[row] + 1; 3011 idx = a->j + diag[row] + 1; 3012 3013 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3014 switch (sizes[i]) { 3015 case 1: 3016 3017 sum1 = xb[row]; 3018 for (n = 0; n<sz-1; n+=2) { 3019 i1 = idx[0]; 3020 i2 = idx[1]; 3021 idx += 2; 3022 tmp0 = x[i1]; 3023 tmp1 = x[i2]; 3024 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3025 } 3026 3027 if (n == sz-1) { 3028 tmp0 = x[*idx]; 3029 sum1 -= *v1*tmp0; 3030 } 3031 x[row--] = sum1*(*ibdiag); 3032 break; 3033 3034 case 2: 3035 3036 sum1 = xb[row]; 3037 sum2 = xb[row-1]; 3038 /* note that sum1 is associated with the second of the two rows */ 3039 v2 = a->a + diag[row-1] + 2; 3040 for (n = 0; n<sz-1; n+=2) { 3041 i1 = idx[0]; 3042 i2 = idx[1]; 3043 idx += 2; 3044 tmp0 = x[i1]; 3045 tmp1 = x[i2]; 3046 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3047 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3048 } 3049 3050 if (n == sz-1) { 3051 tmp0 = x[*idx]; 3052 sum1 -= *v1*tmp0; 3053 sum2 -= *v2*tmp0; 3054 } 3055 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3056 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3057 break; 3058 case 3: 3059 3060 sum1 = xb[row]; 3061 sum2 = xb[row-1]; 3062 sum3 = xb[row-2]; 3063 v2 = a->a + diag[row-1] + 2; 3064 v3 = a->a + diag[row-2] + 3; 3065 for (n = 0; n<sz-1; n+=2) { 3066 i1 = idx[0]; 3067 i2 = idx[1]; 3068 idx += 2; 3069 tmp0 = x[i1]; 3070 tmp1 = x[i2]; 3071 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3072 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3073 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3074 } 3075 3076 if (n == sz-1) { 3077 tmp0 = x[*idx]; 3078 sum1 -= *v1*tmp0; 3079 sum2 -= *v2*tmp0; 3080 sum3 -= *v3*tmp0; 3081 } 3082 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3083 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3084 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3085 break; 3086 case 4: 3087 3088 sum1 = xb[row]; 3089 sum2 = xb[row-1]; 3090 sum3 = xb[row-2]; 3091 sum4 = xb[row-3]; 3092 v2 = a->a + diag[row-1] + 2; 3093 v3 = a->a + diag[row-2] + 3; 3094 v4 = a->a + diag[row-3] + 4; 3095 for (n = 0; n<sz-1; n+=2) { 3096 i1 = idx[0]; 3097 i2 = idx[1]; 3098 idx += 2; 3099 tmp0 = x[i1]; 3100 tmp1 = x[i2]; 3101 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3102 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3103 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3104 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3105 } 3106 3107 if (n == sz-1) { 3108 tmp0 = x[*idx]; 3109 sum1 -= *v1*tmp0; 3110 sum2 -= *v2*tmp0; 3111 sum3 -= *v3*tmp0; 3112 sum4 -= *v4*tmp0; 3113 } 3114 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3115 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3116 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3117 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3118 break; 3119 case 5: 3120 3121 sum1 = xb[row]; 3122 sum2 = xb[row-1]; 3123 sum3 = xb[row-2]; 3124 sum4 = xb[row-3]; 3125 sum5 = xb[row-4]; 3126 v2 = a->a + diag[row-1] + 2; 3127 v3 = a->a + diag[row-2] + 3; 3128 v4 = a->a + diag[row-3] + 4; 3129 v5 = a->a + diag[row-4] + 5; 3130 for (n = 0; n<sz-1; n+=2) { 3131 i1 = idx[0]; 3132 i2 = idx[1]; 3133 idx += 2; 3134 tmp0 = x[i1]; 3135 tmp1 = x[i2]; 3136 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3137 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3138 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3139 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3140 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3141 } 3142 3143 if (n == sz-1) { 3144 tmp0 = x[*idx]; 3145 sum1 -= *v1*tmp0; 3146 sum2 -= *v2*tmp0; 3147 sum3 -= *v3*tmp0; 3148 sum4 -= *v4*tmp0; 3149 sum5 -= *v5*tmp0; 3150 } 3151 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3152 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3153 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3154 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3155 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3156 break; 3157 default: 3158 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3159 } 3160 } 3161 3162 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3163 } 3164 its--; 3165 } 3166 while (its--) { 3167 3168 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 3169 for (i=0, row=0, ibdiag = a->inode.ibdiag; 3170 i<m; 3171 row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) { 3172 3173 sz = diag[row] - ii[row]; 3174 v1 = a->a + ii[row]; 3175 idx = a->j + ii[row]; 3176 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3177 switch (sizes[i]) { 3178 case 1: 3179 sum1 = b[row]; 3180 for (n = 0; n<sz-1; n+=2) { 3181 i1 = idx[0]; 3182 i2 = idx[1]; 3183 idx += 2; 3184 tmp0 = x[i1]; 3185 tmp1 = x[i2]; 3186 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3187 } 3188 if (n == sz-1) { 3189 tmp0 = x[*idx++]; 3190 sum1 -= *v1 * tmp0; 3191 v1++; 3192 } 3193 t[row] = sum1; 3194 sz = ii[row+1] - diag[row] - 1; 3195 idx = a->j + diag[row] + 1; 3196 v1 += 1; 3197 for (n = 0; n<sz-1; n+=2) { 3198 i1 = idx[0]; 3199 i2 = idx[1]; 3200 idx += 2; 3201 tmp0 = x[i1]; 3202 tmp1 = x[i2]; 3203 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3204 } 3205 if (n == sz-1) { 3206 tmp0 = x[*idx++]; 3207 sum1 -= *v1 * tmp0; 3208 } 3209 /* in MatSOR_SeqAIJ this line would be 3210 * 3211 * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++); 3212 * 3213 * but omega == 1, so this becomes 3214 * 3215 * x[row] = sum1*(*ibdiag++); 3216 * 3217 */ 3218 x[row] = sum1*(*ibdiag); 3219 break; 3220 case 2: 3221 v2 = a->a + ii[row+1]; 3222 sum1 = b[row]; 3223 sum2 = b[row+1]; 3224 for (n = 0; n<sz-1; n+=2) { 3225 i1 = idx[0]; 3226 i2 = idx[1]; 3227 idx += 2; 3228 tmp0 = x[i1]; 3229 tmp1 = x[i2]; 3230 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3231 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3232 } 3233 if (n == sz-1) { 3234 tmp0 = x[*idx++]; 3235 sum1 -= v1[0] * tmp0; 3236 sum2 -= v2[0] * tmp0; 3237 v1++; v2++; 3238 } 3239 t[row] = sum1; 3240 t[row+1] = sum2; 3241 sz = ii[row+1] - diag[row] - 2; 3242 idx = a->j + diag[row] + 2; 3243 v1 += 2; 3244 v2 += 2; 3245 for (n = 0; n<sz-1; n+=2) { 3246 i1 = idx[0]; 3247 i2 = idx[1]; 3248 idx += 2; 3249 tmp0 = x[i1]; 3250 tmp1 = x[i2]; 3251 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3252 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3253 } 3254 if (n == sz-1) { 3255 tmp0 = x[*idx]; 3256 sum1 -= v1[0] * tmp0; 3257 sum2 -= v2[0] * tmp0; 3258 } 3259 x[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3260 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3261 break; 3262 case 3: 3263 v2 = a->a + ii[row+1]; 3264 v3 = a->a + ii[row+2]; 3265 sum1 = b[row]; 3266 sum2 = b[row+1]; 3267 sum3 = b[row+2]; 3268 for (n = 0; n<sz-1; n+=2) { 3269 i1 = idx[0]; 3270 i2 = idx[1]; 3271 idx += 2; 3272 tmp0 = x[i1]; 3273 tmp1 = x[i2]; 3274 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3275 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3276 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3277 } 3278 if (n == sz-1) { 3279 tmp0 = x[*idx++]; 3280 sum1 -= v1[0] * tmp0; 3281 sum2 -= v2[0] * tmp0; 3282 sum3 -= v3[0] * tmp0; 3283 v1++; v2++; v3++; 3284 } 3285 t[row] = sum1; 3286 t[row+1] = sum2; 3287 t[row+2] = sum3; 3288 sz = ii[row+1] - diag[row] - 3; 3289 idx = a->j + diag[row] + 3; 3290 v1 += 3; 3291 v2 += 3; 3292 v3 += 3; 3293 for (n = 0; n<sz-1; n+=2) { 3294 i1 = idx[0]; 3295 i2 = idx[1]; 3296 idx += 2; 3297 tmp0 = x[i1]; 3298 tmp1 = x[i2]; 3299 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3300 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3301 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3302 } 3303 if (n == sz-1) { 3304 tmp0 = x[*idx]; 3305 sum1 -= v1[0] * tmp0; 3306 sum2 -= v2[0] * tmp0; 3307 sum3 -= v3[0] * tmp0; 3308 } 3309 x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3310 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3311 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3312 break; 3313 case 4: 3314 v2 = a->a + ii[row+1]; 3315 v3 = a->a + ii[row+2]; 3316 v4 = a->a + ii[row+3]; 3317 sum1 = b[row]; 3318 sum2 = b[row+1]; 3319 sum3 = b[row+2]; 3320 sum4 = b[row+3]; 3321 for (n = 0; n<sz-1; n+=2) { 3322 i1 = idx[0]; 3323 i2 = idx[1]; 3324 idx += 2; 3325 tmp0 = x[i1]; 3326 tmp1 = x[i2]; 3327 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3328 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3329 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3330 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3331 } 3332 if (n == sz-1) { 3333 tmp0 = x[*idx++]; 3334 sum1 -= v1[0] * tmp0; 3335 sum2 -= v2[0] * tmp0; 3336 sum3 -= v3[0] * tmp0; 3337 sum4 -= v4[0] * tmp0; 3338 v1++; v2++; v3++; v4++; 3339 } 3340 t[row] = sum1; 3341 t[row+1] = sum2; 3342 t[row+2] = sum3; 3343 t[row+3] = sum4; 3344 sz = ii[row+1] - diag[row] - 4; 3345 idx = a->j + diag[row] + 4; 3346 v1 += 4; 3347 v2 += 4; 3348 v3 += 4; 3349 v4 += 4; 3350 for (n = 0; n<sz-1; n+=2) { 3351 i1 = idx[0]; 3352 i2 = idx[1]; 3353 idx += 2; 3354 tmp0 = x[i1]; 3355 tmp1 = x[i2]; 3356 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3357 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3358 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3359 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3360 } 3361 if (n == sz-1) { 3362 tmp0 = x[*idx]; 3363 sum1 -= v1[0] * tmp0; 3364 sum2 -= v2[0] * tmp0; 3365 sum3 -= v3[0] * tmp0; 3366 sum4 -= v4[0] * tmp0; 3367 } 3368 x[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3369 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3370 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3371 x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3372 break; 3373 case 5: 3374 v2 = a->a + ii[row+1]; 3375 v3 = a->a + ii[row+2]; 3376 v4 = a->a + ii[row+3]; 3377 v5 = a->a + ii[row+4]; 3378 sum1 = b[row]; 3379 sum2 = b[row+1]; 3380 sum3 = b[row+2]; 3381 sum4 = b[row+3]; 3382 sum5 = b[row+4]; 3383 for (n = 0; n<sz-1; n+=2) { 3384 i1 = idx[0]; 3385 i2 = idx[1]; 3386 idx += 2; 3387 tmp0 = x[i1]; 3388 tmp1 = x[i2]; 3389 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3390 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3391 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3392 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3393 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3394 } 3395 if (n == sz-1) { 3396 tmp0 = x[*idx++]; 3397 sum1 -= v1[0] * tmp0; 3398 sum2 -= v2[0] * tmp0; 3399 sum3 -= v3[0] * tmp0; 3400 sum4 -= v4[0] * tmp0; 3401 sum5 -= v5[0] * tmp0; 3402 v1++; v2++; v3++; v4++; v5++; 3403 } 3404 t[row] = sum1; 3405 t[row+1] = sum2; 3406 t[row+2] = sum3; 3407 t[row+3] = sum4; 3408 t[row+4] = sum5; 3409 sz = ii[row+1] - diag[row] - 5; 3410 idx = a->j + diag[row] + 5; 3411 v1 += 5; 3412 v2 += 5; 3413 v3 += 5; 3414 v4 += 5; 3415 v5 += 5; 3416 for (n = 0; n<sz-1; n+=2) { 3417 i1 = idx[0]; 3418 i2 = idx[1]; 3419 idx += 2; 3420 tmp0 = x[i1]; 3421 tmp1 = x[i2]; 3422 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3423 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3424 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3425 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3426 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3427 } 3428 if (n == sz-1) { 3429 tmp0 = x[*idx]; 3430 sum1 -= v1[0] * tmp0; 3431 sum2 -= v2[0] * tmp0; 3432 sum3 -= v3[0] * tmp0; 3433 sum4 -= v4[0] * tmp0; 3434 sum5 -= v5[0] * tmp0; 3435 } 3436 x[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3437 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3438 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3439 x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3440 x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3441 break; 3442 default: 3443 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3444 } 3445 } 3446 xb = t; 3447 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); /* undercounts diag inverse */ 3448 } else xb = b; 3449 3450 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3451 3452 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3453 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3454 ibdiag -= sizes[i]*sizes[i]; 3455 3456 /* set RHS */ 3457 if (xb == b) { 3458 /* whole (old way) */ 3459 sz = ii[row+1] - ii[row]; 3460 idx = a->j + ii[row]; 3461 switch (sizes[i]) { 3462 case 5: 3463 v5 = a->a + ii[row-4]; 3464 case 4: /* fall through */ 3465 v4 = a->a + ii[row-3]; 3466 case 3: 3467 v3 = a->a + ii[row-2]; 3468 case 2: 3469 v2 = a->a + ii[row-1]; 3470 case 1: 3471 v1 = a->a + ii[row]; 3472 break; 3473 default: 3474 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3475 } 3476 } else { 3477 /* upper, no diag */ 3478 sz = ii[row+1] - diag[row] - 1; 3479 idx = a->j + diag[row] + 1; 3480 switch (sizes[i]) { 3481 case 5: 3482 v5 = a->a + diag[row-4] + 5; 3483 case 4: /* fall through */ 3484 v4 = a->a + diag[row-3] + 4; 3485 case 3: 3486 v3 = a->a + diag[row-2] + 3; 3487 case 2: 3488 v2 = a->a + diag[row-1] + 2; 3489 case 1: 3490 v1 = a->a + diag[row] + 1; 3491 } 3492 } 3493 /* set sum */ 3494 switch (sizes[i]) { 3495 case 5: 3496 sum5 = xb[row-4]; 3497 case 4: /* fall through */ 3498 sum4 = xb[row-3]; 3499 case 3: 3500 sum3 = xb[row-2]; 3501 case 2: 3502 sum2 = xb[row-1]; 3503 case 1: 3504 /* note that sum1 is associated with the last row */ 3505 sum1 = xb[row]; 3506 } 3507 /* do sums */ 3508 for (n = 0; n<sz-1; n+=2) { 3509 i1 = idx[0]; 3510 i2 = idx[1]; 3511 idx += 2; 3512 tmp0 = x[i1]; 3513 tmp1 = x[i2]; 3514 switch (sizes[i]) { 3515 case 5: 3516 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3517 case 4: /* fall through */ 3518 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3519 case 3: 3520 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3521 case 2: 3522 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3523 case 1: 3524 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3525 } 3526 } 3527 /* ragged edge */ 3528 if (n == sz-1) { 3529 tmp0 = x[*idx]; 3530 switch (sizes[i]) { 3531 case 5: 3532 sum5 -= *v5*tmp0; 3533 case 4: /* fall through */ 3534 sum4 -= *v4*tmp0; 3535 case 3: 3536 sum3 -= *v3*tmp0; 3537 case 2: 3538 sum2 -= *v2*tmp0; 3539 case 1: 3540 sum1 -= *v1*tmp0; 3541 } 3542 } 3543 /* update */ 3544 if (xb == b) { 3545 /* whole (old way) w/ diag */ 3546 switch (sizes[i]) { 3547 case 5: 3548 x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3549 x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3550 x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3551 x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3552 x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3553 break; 3554 case 4: 3555 x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3556 x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3557 x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3558 x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3559 break; 3560 case 3: 3561 x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3562 x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3563 x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3564 break; 3565 case 2: 3566 x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3]; 3567 x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2]; 3568 break; 3569 case 1: 3570 x[row--] += sum1*(*ibdiag); 3571 break; 3572 } 3573 } else { 3574 /* no diag so set = */ 3575 switch (sizes[i]) { 3576 case 5: 3577 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3578 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3579 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3580 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3581 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3582 break; 3583 case 4: 3584 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3585 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3586 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3587 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3588 break; 3589 case 3: 3590 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3591 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3592 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3593 break; 3594 case 2: 3595 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3596 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3597 break; 3598 case 1: 3599 x[row--] = sum1*(*ibdiag); 3600 break; 3601 } 3602 } 3603 } 3604 if (xb == b) { 3605 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 3606 } else { 3607 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper, undercounts diag inverse */ 3608 } 3609 } 3610 } 3611 if (flag & SOR_EISENSTAT) { 3612 /* 3613 Apply (U + D)^-1 where D is now the block diagonal 3614 */ 3615 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3616 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3617 ibdiag -= sizes[i]*sizes[i]; 3618 sz = ii[row+1] - diag[row] - 1; 3619 v1 = a->a + diag[row] + 1; 3620 idx = a->j + diag[row] + 1; 3621 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3622 switch (sizes[i]) { 3623 case 1: 3624 3625 sum1 = b[row]; 3626 for (n = 0; n<sz-1; n+=2) { 3627 i1 = idx[0]; 3628 i2 = idx[1]; 3629 idx += 2; 3630 tmp0 = x[i1]; 3631 tmp1 = x[i2]; 3632 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3633 } 3634 3635 if (n == sz-1) { 3636 tmp0 = x[*idx]; 3637 sum1 -= *v1*tmp0; 3638 } 3639 x[row] = sum1*(*ibdiag);row--; 3640 break; 3641 3642 case 2: 3643 3644 sum1 = b[row]; 3645 sum2 = b[row-1]; 3646 /* note that sum1 is associated with the second of the two rows */ 3647 v2 = a->a + diag[row-1] + 2; 3648 for (n = 0; n<sz-1; n+=2) { 3649 i1 = idx[0]; 3650 i2 = idx[1]; 3651 idx += 2; 3652 tmp0 = x[i1]; 3653 tmp1 = x[i2]; 3654 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3655 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3656 } 3657 3658 if (n == sz-1) { 3659 tmp0 = x[*idx]; 3660 sum1 -= *v1*tmp0; 3661 sum2 -= *v2*tmp0; 3662 } 3663 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3664 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3665 row -= 2; 3666 break; 3667 case 3: 3668 3669 sum1 = b[row]; 3670 sum2 = b[row-1]; 3671 sum3 = b[row-2]; 3672 v2 = a->a + diag[row-1] + 2; 3673 v3 = a->a + diag[row-2] + 3; 3674 for (n = 0; n<sz-1; n+=2) { 3675 i1 = idx[0]; 3676 i2 = idx[1]; 3677 idx += 2; 3678 tmp0 = x[i1]; 3679 tmp1 = x[i2]; 3680 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3681 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3682 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3683 } 3684 3685 if (n == sz-1) { 3686 tmp0 = x[*idx]; 3687 sum1 -= *v1*tmp0; 3688 sum2 -= *v2*tmp0; 3689 sum3 -= *v3*tmp0; 3690 } 3691 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3692 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3693 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3694 row -= 3; 3695 break; 3696 case 4: 3697 3698 sum1 = b[row]; 3699 sum2 = b[row-1]; 3700 sum3 = b[row-2]; 3701 sum4 = b[row-3]; 3702 v2 = a->a + diag[row-1] + 2; 3703 v3 = a->a + diag[row-2] + 3; 3704 v4 = a->a + diag[row-3] + 4; 3705 for (n = 0; n<sz-1; n+=2) { 3706 i1 = idx[0]; 3707 i2 = idx[1]; 3708 idx += 2; 3709 tmp0 = x[i1]; 3710 tmp1 = x[i2]; 3711 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3712 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3713 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3714 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3715 } 3716 3717 if (n == sz-1) { 3718 tmp0 = x[*idx]; 3719 sum1 -= *v1*tmp0; 3720 sum2 -= *v2*tmp0; 3721 sum3 -= *v3*tmp0; 3722 sum4 -= *v4*tmp0; 3723 } 3724 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3725 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3726 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3727 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3728 row -= 4; 3729 break; 3730 case 5: 3731 3732 sum1 = b[row]; 3733 sum2 = b[row-1]; 3734 sum3 = b[row-2]; 3735 sum4 = b[row-3]; 3736 sum5 = b[row-4]; 3737 v2 = a->a + diag[row-1] + 2; 3738 v3 = a->a + diag[row-2] + 3; 3739 v4 = a->a + diag[row-3] + 4; 3740 v5 = a->a + diag[row-4] + 5; 3741 for (n = 0; n<sz-1; n+=2) { 3742 i1 = idx[0]; 3743 i2 = idx[1]; 3744 idx += 2; 3745 tmp0 = x[i1]; 3746 tmp1 = x[i2]; 3747 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3748 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3749 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3750 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3751 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3752 } 3753 3754 if (n == sz-1) { 3755 tmp0 = x[*idx]; 3756 sum1 -= *v1*tmp0; 3757 sum2 -= *v2*tmp0; 3758 sum3 -= *v3*tmp0; 3759 sum4 -= *v4*tmp0; 3760 sum5 -= *v5*tmp0; 3761 } 3762 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3763 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3764 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3765 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3766 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3767 row -= 5; 3768 break; 3769 default: 3770 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3771 } 3772 } 3773 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3774 3775 /* 3776 t = b - D x where D is the block diagonal 3777 */ 3778 cnt = 0; 3779 for (i=0, row=0; i<m; i++) { 3780 switch (sizes[i]) { 3781 case 1: 3782 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 3783 break; 3784 case 2: 3785 x1 = x[row]; x2 = x[row+1]; 3786 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3787 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3788 t[row] = b[row] - tmp1; 3789 t[row+1] = b[row+1] - tmp2; row += 2; 3790 cnt += 4; 3791 break; 3792 case 3: 3793 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3794 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3795 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3796 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3797 t[row] = b[row] - tmp1; 3798 t[row+1] = b[row+1] - tmp2; 3799 t[row+2] = b[row+2] - tmp3; row += 3; 3800 cnt += 9; 3801 break; 3802 case 4: 3803 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3804 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3805 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3806 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3807 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3808 t[row] = b[row] - tmp1; 3809 t[row+1] = b[row+1] - tmp2; 3810 t[row+2] = b[row+2] - tmp3; 3811 t[row+3] = b[row+3] - tmp4; row += 4; 3812 cnt += 16; 3813 break; 3814 case 5: 3815 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3816 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3817 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3818 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3819 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3820 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3821 t[row] = b[row] - tmp1; 3822 t[row+1] = b[row+1] - tmp2; 3823 t[row+2] = b[row+2] - tmp3; 3824 t[row+3] = b[row+3] - tmp4; 3825 t[row+4] = b[row+4] - tmp5;row += 5; 3826 cnt += 25; 3827 break; 3828 default: 3829 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3830 } 3831 } 3832 ierr = PetscLogFlops(m);CHKERRQ(ierr); 3833 3834 3835 3836 /* 3837 Apply (L + D)^-1 where D is the block diagonal 3838 */ 3839 for (i=0, row=0; i<m; i++) { 3840 sz = diag[row] - ii[row]; 3841 v1 = a->a + ii[row]; 3842 idx = a->j + ii[row]; 3843 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3844 switch (sizes[i]) { 3845 case 1: 3846 3847 sum1 = t[row]; 3848 for (n = 0; n<sz-1; n+=2) { 3849 i1 = idx[0]; 3850 i2 = idx[1]; 3851 idx += 2; 3852 tmp0 = t[i1]; 3853 tmp1 = t[i2]; 3854 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3855 } 3856 3857 if (n == sz-1) { 3858 tmp0 = t[*idx]; 3859 sum1 -= *v1 * tmp0; 3860 } 3861 x[row] += t[row] = sum1*(*ibdiag++); row++; 3862 break; 3863 case 2: 3864 v2 = a->a + ii[row+1]; 3865 sum1 = t[row]; 3866 sum2 = t[row+1]; 3867 for (n = 0; n<sz-1; n+=2) { 3868 i1 = idx[0]; 3869 i2 = idx[1]; 3870 idx += 2; 3871 tmp0 = t[i1]; 3872 tmp1 = t[i2]; 3873 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3874 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3875 } 3876 3877 if (n == sz-1) { 3878 tmp0 = t[*idx]; 3879 sum1 -= v1[0] * tmp0; 3880 sum2 -= v2[0] * tmp0; 3881 } 3882 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3883 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3884 ibdiag += 4; row += 2; 3885 break; 3886 case 3: 3887 v2 = a->a + ii[row+1]; 3888 v3 = a->a + ii[row+2]; 3889 sum1 = t[row]; 3890 sum2 = t[row+1]; 3891 sum3 = t[row+2]; 3892 for (n = 0; n<sz-1; n+=2) { 3893 i1 = idx[0]; 3894 i2 = idx[1]; 3895 idx += 2; 3896 tmp0 = t[i1]; 3897 tmp1 = t[i2]; 3898 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3899 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3900 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3901 } 3902 3903 if (n == sz-1) { 3904 tmp0 = t[*idx]; 3905 sum1 -= v1[0] * tmp0; 3906 sum2 -= v2[0] * tmp0; 3907 sum3 -= v3[0] * tmp0; 3908 } 3909 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3910 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3911 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3912 ibdiag += 9; row += 3; 3913 break; 3914 case 4: 3915 v2 = a->a + ii[row+1]; 3916 v3 = a->a + ii[row+2]; 3917 v4 = a->a + ii[row+3]; 3918 sum1 = t[row]; 3919 sum2 = t[row+1]; 3920 sum3 = t[row+2]; 3921 sum4 = t[row+3]; 3922 for (n = 0; n<sz-1; n+=2) { 3923 i1 = idx[0]; 3924 i2 = idx[1]; 3925 idx += 2; 3926 tmp0 = t[i1]; 3927 tmp1 = t[i2]; 3928 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3929 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3930 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3931 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3932 } 3933 3934 if (n == sz-1) { 3935 tmp0 = t[*idx]; 3936 sum1 -= v1[0] * tmp0; 3937 sum2 -= v2[0] * tmp0; 3938 sum3 -= v3[0] * tmp0; 3939 sum4 -= v4[0] * tmp0; 3940 } 3941 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3942 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3943 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3944 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3945 ibdiag += 16; row += 4; 3946 break; 3947 case 5: 3948 v2 = a->a + ii[row+1]; 3949 v3 = a->a + ii[row+2]; 3950 v4 = a->a + ii[row+3]; 3951 v5 = a->a + ii[row+4]; 3952 sum1 = t[row]; 3953 sum2 = t[row+1]; 3954 sum3 = t[row+2]; 3955 sum4 = t[row+3]; 3956 sum5 = t[row+4]; 3957 for (n = 0; n<sz-1; n+=2) { 3958 i1 = idx[0]; 3959 i2 = idx[1]; 3960 idx += 2; 3961 tmp0 = t[i1]; 3962 tmp1 = t[i2]; 3963 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3964 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3965 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3966 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3967 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3968 } 3969 3970 if (n == sz-1) { 3971 tmp0 = t[*idx]; 3972 sum1 -= v1[0] * tmp0; 3973 sum2 -= v2[0] * tmp0; 3974 sum3 -= v3[0] * tmp0; 3975 sum4 -= v4[0] * tmp0; 3976 sum5 -= v5[0] * tmp0; 3977 } 3978 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3979 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3980 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3981 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3982 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3983 ibdiag += 25; row += 5; 3984 break; 3985 default: 3986 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3987 } 3988 } 3989 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3990 } 3991 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3992 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3993 PetscFunctionReturn(0); 3994 } 3995 3996 #undef __FUNCT__ 3997 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode" 3998 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 3999 { 4000 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4001 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 4002 const MatScalar *bdiag = a->inode.bdiag; 4003 const PetscScalar *b; 4004 PetscErrorCode ierr; 4005 PetscInt m = a->inode.node_count,cnt = 0,i,row; 4006 const PetscInt *sizes = a->inode.size; 4007 4008 PetscFunctionBegin; 4009 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4010 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4011 cnt = 0; 4012 for (i=0, row=0; i<m; i++) { 4013 switch (sizes[i]) { 4014 case 1: 4015 x[row] = b[row]*bdiag[cnt++];row++; 4016 break; 4017 case 2: 4018 x1 = b[row]; x2 = b[row+1]; 4019 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 4020 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 4021 x[row++] = tmp1; 4022 x[row++] = tmp2; 4023 cnt += 4; 4024 break; 4025 case 3: 4026 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 4027 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 4028 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 4029 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 4030 x[row++] = tmp1; 4031 x[row++] = tmp2; 4032 x[row++] = tmp3; 4033 cnt += 9; 4034 break; 4035 case 4: 4036 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 4037 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 4038 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 4039 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 4040 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 4041 x[row++] = tmp1; 4042 x[row++] = tmp2; 4043 x[row++] = tmp3; 4044 x[row++] = tmp4; 4045 cnt += 16; 4046 break; 4047 case 5: 4048 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 4049 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 4050 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 4051 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 4052 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 4053 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 4054 x[row++] = tmp1; 4055 x[row++] = tmp2; 4056 x[row++] = tmp3; 4057 x[row++] = tmp4; 4058 x[row++] = tmp5; 4059 cnt += 25; 4060 break; 4061 default: 4062 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 4063 } 4064 } 4065 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 4066 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4067 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4068 PetscFunctionReturn(0); 4069 } 4070 4071 /* 4072 samestructure indicates that the matrix has not changed its nonzero structure so we 4073 do not need to recompute the inodes 4074 */ 4075 #undef __FUNCT__ 4076 #define __FUNCT__ "MatSeqAIJCheckInode" 4077 PetscErrorCode MatSeqAIJCheckInode(Mat A) 4078 { 4079 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4080 PetscErrorCode ierr; 4081 PetscInt i,j,m,nzx,nzy,*ns,node_count,blk_size; 4082 PetscBool flag; 4083 const PetscInt *idx,*idy,*ii; 4084 4085 PetscFunctionBegin; 4086 if (!a->inode.use) PetscFunctionReturn(0); 4087 if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(0); 4088 4089 m = A->rmap->n; 4090 if (a->inode.size) ns = a->inode.size; 4091 else { 4092 ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr); 4093 } 4094 4095 i = 0; 4096 node_count = 0; 4097 idx = a->j; 4098 ii = a->i; 4099 while (i < m) { /* For each row */ 4100 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 4101 /* Limits the number of elements in a node to 'a->inode.limit' */ 4102 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 4103 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 4104 if (nzy != nzx) break; 4105 idy += nzx; /* Same nonzero pattern */ 4106 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 4107 if (!flag) break; 4108 } 4109 ns[node_count++] = blk_size; 4110 idx += blk_size*nzx; 4111 i = j; 4112 } 4113 /* If not enough inodes found,, do not use inode version of the routines */ 4114 if (!m || node_count > .8*m) { 4115 ierr = PetscFree(ns);CHKERRQ(ierr); 4116 4117 a->inode.node_count = 0; 4118 a->inode.size = NULL; 4119 a->inode.use = PETSC_FALSE; 4120 A->ops->mult = MatMult_SeqAIJ; 4121 A->ops->sor = MatSOR_SeqAIJ; 4122 A->ops->multadd = MatMultAdd_SeqAIJ; 4123 A->ops->getrowij = MatGetRowIJ_SeqAIJ; 4124 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 4125 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 4126 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 4127 A->ops->coloringpatch = 0; 4128 A->ops->multdiagonalblock = 0; 4129 4130 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4131 } else { 4132 if (!A->factortype) { 4133 A->ops->mult = MatMult_SeqAIJ_Inode; 4134 A->ops->sor = MatSOR_SeqAIJ_Inode; 4135 A->ops->multadd = MatMultAdd_SeqAIJ_Inode; 4136 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4137 if (A->rmap->n == A->cmap->n) { 4138 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4139 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4140 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4141 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4142 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4143 } 4144 } else { 4145 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4146 } 4147 a->inode.node_count = node_count; 4148 a->inode.size = ns; 4149 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4150 } 4151 a->inode.checked = PETSC_TRUE; 4152 a->inode.mat_nonzerostate = A->nonzerostate; 4153 PetscFunctionReturn(0); 4154 } 4155 4156 #undef __FUNCT__ 4157 #define __FUNCT__ "MatDuplicate_SeqAIJ_Inode" 4158 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C) 4159 { 4160 Mat B =*C; 4161 Mat_SeqAIJ *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data; 4162 PetscErrorCode ierr; 4163 PetscInt m=A->rmap->n; 4164 4165 PetscFunctionBegin; 4166 c->inode.use = a->inode.use; 4167 c->inode.limit = a->inode.limit; 4168 c->inode.max_limit = a->inode.max_limit; 4169 if (a->inode.size) { 4170 ierr = PetscMalloc1(m+1,&c->inode.size);CHKERRQ(ierr); 4171 c->inode.node_count = a->inode.node_count; 4172 ierr = PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));CHKERRQ(ierr); 4173 /* note the table of functions below should match that in MatSeqAIJCheckInode() */ 4174 if (!B->factortype) { 4175 B->ops->mult = MatMult_SeqAIJ_Inode; 4176 B->ops->sor = MatSOR_SeqAIJ_Inode; 4177 B->ops->multadd = MatMultAdd_SeqAIJ_Inode; 4178 B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4179 B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4180 B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4181 B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4182 B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4183 B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4184 } else { 4185 B->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4186 } 4187 } else { 4188 c->inode.size = 0; 4189 c->inode.node_count = 0; 4190 } 4191 c->inode.ibdiagvalid = PETSC_FALSE; 4192 c->inode.ibdiag = 0; 4193 c->inode.bdiag = 0; 4194 PetscFunctionReturn(0); 4195 } 4196 4197 #undef __FUNCT__ 4198 #define __FUNCT__ "MatGetRow_FactoredLU" 4199 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row) 4200 { 4201 PetscInt k; 4202 const PetscInt *vi; 4203 4204 PetscFunctionBegin; 4205 vi = aj + ai[row]; 4206 for (k=0; k<nzl; k++) cols[k] = vi[k]; 4207 vi = aj + adiag[row]; 4208 cols[nzl] = vi[0]; 4209 vi = aj + adiag[row+1]+1; 4210 for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k]; 4211 PetscFunctionReturn(0); 4212 } 4213 /* 4214 MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix. 4215 Modified from MatSeqAIJCheckInode(). 4216 4217 Input Parameters: 4218 . Mat A - ILU or LU matrix factor 4219 4220 */ 4221 #undef __FUNCT__ 4222 #define __FUNCT__ "MatSeqAIJCheckInode_FactorLU" 4223 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A) 4224 { 4225 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4226 PetscErrorCode ierr; 4227 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; 4228 PetscInt *cols1,*cols2,*ns; 4229 const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag; 4230 PetscBool flag; 4231 4232 PetscFunctionBegin; 4233 if (!a->inode.use) PetscFunctionReturn(0); 4234 if (a->inode.checked) PetscFunctionReturn(0); 4235 4236 m = A->rmap->n; 4237 if (a->inode.size) ns = a->inode.size; 4238 else { 4239 ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr); 4240 } 4241 4242 i = 0; 4243 node_count = 0; 4244 ierr = PetscMalloc2(m,&cols1,m,&cols2);CHKERRQ(ierr); 4245 while (i < m) { /* For each row */ 4246 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ 4247 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ 4248 nzx = nzl1 + nzu1 + 1; 4249 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); 4250 4251 /* Limits the number of elements in a node to 'a->inode.limit' */ 4252 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 4253 nzl2 = ai[j+1] - ai[j]; 4254 nzu2 = adiag[j] - adiag[j+1] - 1; 4255 nzy = nzl2 + nzu2 + 1; 4256 if (nzy != nzx) break; 4257 ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr); 4258 ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 4259 if (!flag) break; 4260 } 4261 ns[node_count++] = blk_size; 4262 i = j; 4263 } 4264 ierr = PetscFree2(cols1,cols2);CHKERRQ(ierr); 4265 /* If not enough inodes found,, do not use inode version of the routines */ 4266 if (!m || node_count > .8*m) { 4267 ierr = PetscFree(ns);CHKERRQ(ierr); 4268 4269 a->inode.node_count = 0; 4270 a->inode.size = NULL; 4271 a->inode.use = PETSC_FALSE; 4272 4273 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4274 } else { 4275 A->ops->mult = 0; 4276 A->ops->sor = 0; 4277 A->ops->multadd = 0; 4278 A->ops->getrowij = 0; 4279 A->ops->restorerowij = 0; 4280 A->ops->getcolumnij = 0; 4281 A->ops->restorecolumnij = 0; 4282 A->ops->coloringpatch = 0; 4283 A->ops->multdiagonalblock = 0; 4284 a->inode.node_count = node_count; 4285 a->inode.size = ns; 4286 4287 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4288 } 4289 a->inode.checked = PETSC_TRUE; 4290 PetscFunctionReturn(0); 4291 } 4292 4293 #undef __FUNCT__ 4294 #define __FUNCT__ "MatSeqAIJInvalidateDiagonal_Inode" 4295 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A) 4296 { 4297 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4298 4299 PetscFunctionBegin; 4300 a->inode.ibdiagvalid = PETSC_FALSE; 4301 PetscFunctionReturn(0); 4302 } 4303 4304 /* 4305 This is really ugly. if inodes are used this replaces the 4306 permutations with ones that correspond to rows/cols of the matrix 4307 rather then inode blocks 4308 */ 4309 #undef __FUNCT__ 4310 #define __FUNCT__ "MatInodeAdjustForInodes" 4311 PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 4312 { 4313 PetscErrorCode ierr; 4314 4315 PetscFunctionBegin; 4316 ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr); 4317 PetscFunctionReturn(0); 4318 } 4319 4320 #undef __FUNCT__ 4321 #define __FUNCT__ "MatInodeAdjustForInodes_SeqAIJ_Inode" 4322 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 4323 { 4324 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4325 PetscErrorCode ierr; 4326 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 4327 const PetscInt *ridx,*cidx; 4328 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 4329 PetscInt nslim_col,*ns_col; 4330 IS ris = *rperm,cis = *cperm; 4331 4332 PetscFunctionBegin; 4333 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 4334 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 4335 4336 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 4337 ierr = PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);CHKERRQ(ierr); 4338 ierr = PetscMalloc2(m,&permr,n,&permc);CHKERRQ(ierr); 4339 4340 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 4341 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 4342 4343 /* Form the inode structure for the rows of permuted matric using inv perm*/ 4344 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 4345 4346 /* Construct the permutations for rows*/ 4347 for (i=0,row = 0; i<nslim_row; ++i) { 4348 indx = ridx[i]; 4349 start_val = tns[indx]; 4350 end_val = tns[indx + 1]; 4351 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 4352 } 4353 4354 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 4355 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 4356 4357 /* Construct permutations for columns */ 4358 for (i=0,col=0; i<nslim_col; ++i) { 4359 indx = cidx[i]; 4360 start_val = tns[indx]; 4361 end_val = tns[indx + 1]; 4362 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 4363 } 4364 4365 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr); 4366 ierr = ISSetPermutation(*rperm);CHKERRQ(ierr); 4367 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr); 4368 ierr = ISSetPermutation(*cperm);CHKERRQ(ierr); 4369 4370 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 4371 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 4372 4373 ierr = PetscFree(ns_col);CHKERRQ(ierr); 4374 ierr = PetscFree2(permr,permc);CHKERRQ(ierr); 4375 ierr = ISDestroy(&cis);CHKERRQ(ierr); 4376 ierr = ISDestroy(&ris);CHKERRQ(ierr); 4377 ierr = PetscFree(tns);CHKERRQ(ierr); 4378 PetscFunctionReturn(0); 4379 } 4380 4381 #undef __FUNCT__ 4382 #define __FUNCT__ "MatInodeGetInodeSizes" 4383 /*@C 4384 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 4385 4386 Not Collective 4387 4388 Input Parameter: 4389 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 4390 4391 Output Parameter: 4392 + node_count - no of inodes present in the matrix. 4393 . sizes - an array of size node_count,with sizes of each inode. 4394 - limit - the max size used to generate the inodes. 4395 4396 Level: advanced 4397 4398 Notes: This routine returns some internal storage information 4399 of the matrix, it is intended to be used by advanced users. 4400 It should be called after the matrix is assembled. 4401 The contents of the sizes[] array should not be changed. 4402 NULL may be passed for information not requested. 4403 4404 .keywords: matrix, seqaij, get, inode 4405 4406 .seealso: MatGetInfo() 4407 @*/ 4408 PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4409 { 4410 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 4411 4412 PetscFunctionBegin; 4413 if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 4414 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);CHKERRQ(ierr); 4415 if (f) { 4416 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 4417 } 4418 PetscFunctionReturn(0); 4419 } 4420 4421 #undef __FUNCT__ 4422 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode" 4423 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4424 { 4425 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4426 4427 PetscFunctionBegin; 4428 if (node_count) *node_count = a->inode.node_count; 4429 if (sizes) *sizes = a->inode.size; 4430 if (limit) *limit = a->inode.limit; 4431 PetscFunctionReturn(0); 4432 } 4433