1 2 /* 3 This file provides high performance routines for the Inode format (compressed sparse row) 4 by taking advantage of rows with identical nonzero structure (I-nodes). 5 */ 6 #include <../src/mat/impls/aij/seq/aij.h> 7 8 static PetscErrorCode MatCreateColInode_Private(Mat A,PetscInt *size,PetscInt **ns) 9 { 10 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 11 PetscErrorCode ierr; 12 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 13 14 PetscFunctionBegin; 15 n = A->cmap->n; 16 m = A->rmap->n; 17 ns_row = a->inode.size; 18 19 min_mn = (m < n) ? m : n; 20 if (!ns) { 21 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ; 22 for (; count+1 < n; count++,i++) ; 23 if (count < n) { 24 i++; 25 } 26 *size = i; 27 PetscFunctionReturn(0); 28 } 29 ierr = PetscMalloc1(n+1,&ns_col);CHKERRQ(ierr); 30 31 /* Use the same row structure wherever feasible. */ 32 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 33 ns_col[i] = ns_row[i]; 34 } 35 36 /* if m < n; pad up the remainder with inode_limit */ 37 for (; count+1 < n; count++,i++) { 38 ns_col[i] = 1; 39 } 40 /* The last node is the odd ball. padd it up with the remaining rows; */ 41 if (count < n) { 42 ns_col[i] = n - count; 43 i++; 44 } else if (count > n) { 45 /* Adjust for the over estimation */ 46 ns_col[i-1] += n - count; 47 } 48 *size = i; 49 *ns = ns_col; 50 PetscFunctionReturn(0); 51 } 52 53 54 /* 55 This builds symmetric version of nonzero structure, 56 */ 57 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 58 { 59 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 60 PetscErrorCode ierr; 61 PetscInt *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n; 62 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2; 63 const PetscInt *j,*jmax,*ai= a->i,*aj = a->j; 64 65 PetscFunctionBegin; 66 nslim_row = a->inode.node_count; 67 m = A->rmap->n; 68 n = A->cmap->n; 69 if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 70 71 /* Use the row_inode as column_inode */ 72 nslim_col = nslim_row; 73 ns_col = ns_row; 74 75 /* allocate space for reformated inode structure */ 76 ierr = PetscMalloc2(nslim_col+1,&tns,n+1,&tvc);CHKERRQ(ierr); 77 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 78 79 for (i1=0,col=0; i1<nslim_col; ++i1) { 80 nsz = ns_col[i1]; 81 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 82 } 83 /* allocate space for row pointers */ 84 ierr = PetscCalloc1(nslim_row+1,&ia);CHKERRQ(ierr); 85 *iia = ia; 86 ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr); 87 88 /* determine the number of columns in each row */ 89 ia[0] = oshift; 90 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 91 92 j = aj + ai[row] + ishift; 93 jmax = aj + ai[row+1] + ishift; 94 if (j==jmax) continue; /* empty row */ 95 col = *j++ + ishift; 96 i2 = tvc[col]; 97 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 98 ia[i1+1]++; 99 ia[i2+1]++; 100 i2++; /* Start col of next node */ 101 while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j; 102 i2 = tvc[col]; 103 } 104 if (i2 == i1) ia[i2+1]++; /* now the diagonal element */ 105 } 106 107 /* shift ia[i] to point to next row */ 108 for (i1=1; i1<nslim_row+1; i1++) { 109 row = ia[i1-1]; 110 ia[i1] += row; 111 work[i1-1] = row - oshift; 112 } 113 114 /* allocate space for column pointers */ 115 nz = ia[nslim_row] + (!ishift); 116 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr); 117 *jja = ja; 118 119 /* loop over lower triangular part putting into ja */ 120 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 121 j = aj + ai[row] + ishift; 122 jmax = aj + ai[row+1] + ishift; 123 if (j==jmax) continue; /* empty row */ 124 col = *j++ + ishift; 125 i2 = tvc[col]; 126 while (i2<i1 && j<jmax) { 127 ja[work[i2]++] = i1 + oshift; 128 ja[work[i1]++] = i2 + oshift; 129 ++i2; 130 while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */ 131 i2 = tvc[col]; 132 } 133 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 134 135 } 136 ierr = PetscFree(work);CHKERRQ(ierr); 137 ierr = PetscFree2(tns,tvc);CHKERRQ(ierr); 138 PetscFunctionReturn(0); 139 } 140 141 /* 142 This builds nonsymmetric version of nonzero structure, 143 */ 144 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 145 { 146 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 147 PetscErrorCode ierr; 148 PetscInt *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col; 149 PetscInt *tns,*tvc,nsz,i1,i2; 150 const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size; 151 152 PetscFunctionBegin; 153 nslim_row = a->inode.node_count; 154 n = A->cmap->n; 155 156 /* Create The column_inode for this matrix */ 157 ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr); 158 159 /* allocate space for reformated column_inode structure */ 160 ierr = PetscMalloc2(nslim_col +1,&tns,n + 1,&tvc);CHKERRQ(ierr); 161 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 162 163 for (i1=0,col=0; i1<nslim_col; ++i1) { 164 nsz = ns_col[i1]; 165 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 166 } 167 /* allocate space for row pointers */ 168 ierr = PetscCalloc1(nslim_row+1,&ia);CHKERRQ(ierr); 169 *iia = ia; 170 ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr); 171 172 /* determine the number of columns in each row */ 173 ia[0] = oshift; 174 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 175 j = aj + ai[row] + ishift; 176 nz = ai[row+1] - ai[row]; 177 if (!nz) continue; /* empty row */ 178 col = *j++ + ishift; 179 i2 = tvc[col]; 180 while (nz-- > 0) { /* off-diagonal elemets */ 181 ia[i1+1]++; 182 i2++; /* Start col of next node */ 183 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 184 if (nz > 0) i2 = tvc[col]; 185 } 186 } 187 188 /* shift ia[i] to point to next row */ 189 for (i1=1; i1<nslim_row+1; i1++) { 190 row = ia[i1-1]; 191 ia[i1] += row; 192 work[i1-1] = row - oshift; 193 } 194 195 /* allocate space for column pointers */ 196 nz = ia[nslim_row] + (!ishift); 197 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr); 198 *jja = ja; 199 200 /* loop over matrix putting into ja */ 201 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 202 j = aj + ai[row] + ishift; 203 nz = ai[row+1] - ai[row]; 204 if (!nz) continue; /* empty row */ 205 col = *j++ + ishift; 206 i2 = tvc[col]; 207 while (nz-- > 0) { 208 ja[work[i1]++] = i2 + oshift; 209 ++i2; 210 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 211 if (nz > 0) i2 = tvc[col]; 212 } 213 } 214 ierr = PetscFree(ns_col);CHKERRQ(ierr); 215 ierr = PetscFree(work);CHKERRQ(ierr); 216 ierr = PetscFree2(tns,tvc);CHKERRQ(ierr); 217 PetscFunctionReturn(0); 218 } 219 220 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 221 { 222 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 223 PetscErrorCode ierr; 224 225 PetscFunctionBegin; 226 *n = a->inode.node_count; 227 if (!ia) PetscFunctionReturn(0); 228 if (!blockcompressed) { 229 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 230 } else if (symmetric) { 231 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 232 } else { 233 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 234 } 235 PetscFunctionReturn(0); 236 } 237 238 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 239 { 240 PetscErrorCode ierr; 241 242 PetscFunctionBegin; 243 if (!ia) PetscFunctionReturn(0); 244 245 if (!blockcompressed) { 246 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 247 } else { 248 ierr = PetscFree(*ia);CHKERRQ(ierr); 249 ierr = PetscFree(*ja);CHKERRQ(ierr); 250 } 251 PetscFunctionReturn(0); 252 } 253 254 /* ----------------------------------------------------------- */ 255 256 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 257 { 258 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 259 PetscErrorCode ierr; 260 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 261 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 262 263 PetscFunctionBegin; 264 nslim_row = a->inode.node_count; 265 n = A->cmap->n; 266 267 /* Create The column_inode for this matrix */ 268 ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr); 269 270 /* allocate space for reformated column_inode structure */ 271 ierr = PetscMalloc2(nslim_col + 1,&tns,n + 1,&tvc);CHKERRQ(ierr); 272 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 273 274 for (i1=0,col=0; i1<nslim_col; ++i1) { 275 nsz = ns_col[i1]; 276 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 277 } 278 /* allocate space for column pointers */ 279 ierr = PetscCalloc1(nslim_col+1,&ia);CHKERRQ(ierr); 280 *iia = ia; 281 ierr = PetscMalloc1(nslim_col+1,&work);CHKERRQ(ierr); 282 283 /* determine the number of columns in each row */ 284 ia[0] = oshift; 285 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 286 j = aj + ai[row] + ishift; 287 col = *j++ + ishift; 288 i2 = tvc[col]; 289 nz = ai[row+1] - ai[row]; 290 while (nz-- > 0) { /* off-diagonal elemets */ 291 /* ia[i1+1]++; */ 292 ia[i2+1]++; 293 i2++; 294 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 295 if (nz > 0) i2 = tvc[col]; 296 } 297 } 298 299 /* shift ia[i] to point to next col */ 300 for (i1=1; i1<nslim_col+1; i1++) { 301 col = ia[i1-1]; 302 ia[i1] += col; 303 work[i1-1] = col - oshift; 304 } 305 306 /* allocate space for column pointers */ 307 nz = ia[nslim_col] + (!ishift); 308 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr); 309 *jja = ja; 310 311 /* loop over matrix putting into ja */ 312 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 313 j = aj + ai[row] + ishift; 314 col = *j++ + ishift; 315 i2 = tvc[col]; 316 nz = ai[row+1] - ai[row]; 317 while (nz-- > 0) { 318 /* ja[work[i1]++] = i2 + oshift; */ 319 ja[work[i2]++] = i1 + oshift; 320 i2++; 321 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 322 if (nz > 0) i2 = tvc[col]; 323 } 324 } 325 ierr = PetscFree(ns_col);CHKERRQ(ierr); 326 ierr = PetscFree(work);CHKERRQ(ierr); 327 ierr = PetscFree2(tns,tvc);CHKERRQ(ierr); 328 PetscFunctionReturn(0); 329 } 330 331 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 332 { 333 PetscErrorCode ierr; 334 335 PetscFunctionBegin; 336 ierr = MatCreateColInode_Private(A,n,NULL);CHKERRQ(ierr); 337 if (!ia) PetscFunctionReturn(0); 338 339 if (!blockcompressed) { 340 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 341 } else if (symmetric) { 342 /* Since the indices are symmetric it does'nt matter */ 343 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 344 } else { 345 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 346 } 347 PetscFunctionReturn(0); 348 } 349 350 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 351 { 352 PetscErrorCode ierr; 353 354 PetscFunctionBegin; 355 if (!ia) PetscFunctionReturn(0); 356 if (!blockcompressed) { 357 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 358 } else { 359 ierr = PetscFree(*ia);CHKERRQ(ierr); 360 ierr = PetscFree(*ja);CHKERRQ(ierr); 361 } 362 PetscFunctionReturn(0); 363 } 364 365 /* ----------------------------------------------------------- */ 366 367 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 368 { 369 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 370 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 371 PetscScalar *y; 372 const PetscScalar *x; 373 const MatScalar *v1,*v2,*v3,*v4,*v5; 374 PetscErrorCode ierr; 375 PetscInt i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0; 376 const PetscInt *idx,*ns,*ii; 377 378 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 379 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 380 #endif 381 382 PetscFunctionBegin; 383 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 384 node_max = a->inode.node_count; 385 ns = a->inode.size; /* Node Size array */ 386 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 387 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 388 idx = a->j; 389 v1 = a->a; 390 ii = a->i; 391 392 for (i = 0,row = 0; i< node_max; ++i) { 393 nsz = ns[i]; 394 n = ii[1] - ii[0]; 395 nonzerorow += (n>0)*nsz; 396 ii += nsz; 397 PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ 398 PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ 399 sz = n; /* No of non zeros in this row */ 400 /* Switch on the size of Node */ 401 switch (nsz) { /* Each loop in 'case' is unrolled */ 402 case 1: 403 sum1 = 0.; 404 405 for (n = 0; n< sz-1; n+=2) { 406 i1 = idx[0]; /* The instructions are ordered to */ 407 i2 = idx[1]; /* make the compiler's job easy */ 408 idx += 2; 409 tmp0 = x[i1]; 410 tmp1 = x[i2]; 411 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 412 } 413 414 if (n == sz-1) { /* Take care of the last nonzero */ 415 tmp0 = x[*idx++]; 416 sum1 += *v1++ *tmp0; 417 } 418 y[row++]=sum1; 419 break; 420 case 2: 421 sum1 = 0.; 422 sum2 = 0.; 423 v2 = v1 + n; 424 425 for (n = 0; n< sz-1; n+=2) { 426 i1 = idx[0]; 427 i2 = idx[1]; 428 idx += 2; 429 tmp0 = x[i1]; 430 tmp1 = x[i2]; 431 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 432 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 433 } 434 if (n == sz-1) { 435 tmp0 = x[*idx++]; 436 sum1 += *v1++ * tmp0; 437 sum2 += *v2++ * tmp0; 438 } 439 y[row++]=sum1; 440 y[row++]=sum2; 441 v1 =v2; /* Since the next block to be processed starts there*/ 442 idx +=sz; 443 break; 444 case 3: 445 sum1 = 0.; 446 sum2 = 0.; 447 sum3 = 0.; 448 v2 = v1 + n; 449 v3 = v2 + n; 450 451 for (n = 0; n< sz-1; n+=2) { 452 i1 = idx[0]; 453 i2 = idx[1]; 454 idx += 2; 455 tmp0 = x[i1]; 456 tmp1 = x[i2]; 457 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 458 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 459 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 460 } 461 if (n == sz-1) { 462 tmp0 = x[*idx++]; 463 sum1 += *v1++ * tmp0; 464 sum2 += *v2++ * tmp0; 465 sum3 += *v3++ * tmp0; 466 } 467 y[row++]=sum1; 468 y[row++]=sum2; 469 y[row++]=sum3; 470 v1 =v3; /* Since the next block to be processed starts there*/ 471 idx +=2*sz; 472 break; 473 case 4: 474 sum1 = 0.; 475 sum2 = 0.; 476 sum3 = 0.; 477 sum4 = 0.; 478 v2 = v1 + n; 479 v3 = v2 + n; 480 v4 = v3 + n; 481 482 for (n = 0; n< sz-1; n+=2) { 483 i1 = idx[0]; 484 i2 = idx[1]; 485 idx += 2; 486 tmp0 = x[i1]; 487 tmp1 = x[i2]; 488 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 489 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 490 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 491 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 492 } 493 if (n == sz-1) { 494 tmp0 = x[*idx++]; 495 sum1 += *v1++ * tmp0; 496 sum2 += *v2++ * tmp0; 497 sum3 += *v3++ * tmp0; 498 sum4 += *v4++ * tmp0; 499 } 500 y[row++]=sum1; 501 y[row++]=sum2; 502 y[row++]=sum3; 503 y[row++]=sum4; 504 v1 =v4; /* Since the next block to be processed starts there*/ 505 idx +=3*sz; 506 break; 507 case 5: 508 sum1 = 0.; 509 sum2 = 0.; 510 sum3 = 0.; 511 sum4 = 0.; 512 sum5 = 0.; 513 v2 = v1 + n; 514 v3 = v2 + n; 515 v4 = v3 + n; 516 v5 = v4 + n; 517 518 for (n = 0; n<sz-1; n+=2) { 519 i1 = idx[0]; 520 i2 = idx[1]; 521 idx += 2; 522 tmp0 = x[i1]; 523 tmp1 = x[i2]; 524 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 525 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 526 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 527 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 528 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 529 } 530 if (n == sz-1) { 531 tmp0 = x[*idx++]; 532 sum1 += *v1++ * tmp0; 533 sum2 += *v2++ * tmp0; 534 sum3 += *v3++ * tmp0; 535 sum4 += *v4++ * tmp0; 536 sum5 += *v5++ * tmp0; 537 } 538 y[row++]=sum1; 539 y[row++]=sum2; 540 y[row++]=sum3; 541 y[row++]=sum4; 542 y[row++]=sum5; 543 v1 =v5; /* Since the next block to be processed starts there */ 544 idx +=4*sz; 545 break; 546 default: 547 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 548 } 549 } 550 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 551 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 552 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 553 PetscFunctionReturn(0); 554 } 555 /* ----------------------------------------------------------- */ 556 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 557 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 558 { 559 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 560 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 561 const MatScalar *v1,*v2,*v3,*v4,*v5; 562 const PetscScalar *x; 563 PetscScalar *y,*z,*zt; 564 PetscErrorCode ierr; 565 PetscInt i1,i2,n,i,row,node_max,nsz,sz; 566 const PetscInt *idx,*ns,*ii; 567 568 PetscFunctionBegin; 569 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 570 node_max = a->inode.node_count; 571 ns = a->inode.size; /* Node Size array */ 572 573 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 574 ierr = VecGetArrayPair(zz,yy,&z,&y);CHKERRQ(ierr); 575 zt = z; 576 577 idx = a->j; 578 v1 = a->a; 579 ii = a->i; 580 581 for (i = 0,row = 0; i< node_max; ++i) { 582 nsz = ns[i]; 583 n = ii[1] - ii[0]; 584 ii += nsz; 585 sz = n; /* No of non zeros in this row */ 586 /* Switch on the size of Node */ 587 switch (nsz) { /* Each loop in 'case' is unrolled */ 588 case 1: 589 sum1 = *zt++; 590 591 for (n = 0; n< sz-1; n+=2) { 592 i1 = idx[0]; /* The instructions are ordered to */ 593 i2 = idx[1]; /* make the compiler's job easy */ 594 idx += 2; 595 tmp0 = x[i1]; 596 tmp1 = x[i2]; 597 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 598 } 599 600 if (n == sz-1) { /* Take care of the last nonzero */ 601 tmp0 = x[*idx++]; 602 sum1 += *v1++ * tmp0; 603 } 604 y[row++]=sum1; 605 break; 606 case 2: 607 sum1 = *zt++; 608 sum2 = *zt++; 609 v2 = v1 + n; 610 611 for (n = 0; n< sz-1; n+=2) { 612 i1 = idx[0]; 613 i2 = idx[1]; 614 idx += 2; 615 tmp0 = x[i1]; 616 tmp1 = x[i2]; 617 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 618 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 619 } 620 if (n == sz-1) { 621 tmp0 = x[*idx++]; 622 sum1 += *v1++ * tmp0; 623 sum2 += *v2++ * tmp0; 624 } 625 y[row++]=sum1; 626 y[row++]=sum2; 627 v1 =v2; /* Since the next block to be processed starts there*/ 628 idx +=sz; 629 break; 630 case 3: 631 sum1 = *zt++; 632 sum2 = *zt++; 633 sum3 = *zt++; 634 v2 = v1 + n; 635 v3 = v2 + n; 636 637 for (n = 0; n< sz-1; n+=2) { 638 i1 = idx[0]; 639 i2 = idx[1]; 640 idx += 2; 641 tmp0 = x[i1]; 642 tmp1 = x[i2]; 643 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 644 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 645 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 646 } 647 if (n == sz-1) { 648 tmp0 = x[*idx++]; 649 sum1 += *v1++ * tmp0; 650 sum2 += *v2++ * tmp0; 651 sum3 += *v3++ * tmp0; 652 } 653 y[row++]=sum1; 654 y[row++]=sum2; 655 y[row++]=sum3; 656 v1 =v3; /* Since the next block to be processed starts there*/ 657 idx +=2*sz; 658 break; 659 case 4: 660 sum1 = *zt++; 661 sum2 = *zt++; 662 sum3 = *zt++; 663 sum4 = *zt++; 664 v2 = v1 + n; 665 v3 = v2 + n; 666 v4 = v3 + n; 667 668 for (n = 0; n< sz-1; n+=2) { 669 i1 = idx[0]; 670 i2 = idx[1]; 671 idx += 2; 672 tmp0 = x[i1]; 673 tmp1 = x[i2]; 674 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 675 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 676 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 677 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 678 } 679 if (n == sz-1) { 680 tmp0 = x[*idx++]; 681 sum1 += *v1++ * tmp0; 682 sum2 += *v2++ * tmp0; 683 sum3 += *v3++ * tmp0; 684 sum4 += *v4++ * tmp0; 685 } 686 y[row++]=sum1; 687 y[row++]=sum2; 688 y[row++]=sum3; 689 y[row++]=sum4; 690 v1 =v4; /* Since the next block to be processed starts there*/ 691 idx +=3*sz; 692 break; 693 case 5: 694 sum1 = *zt++; 695 sum2 = *zt++; 696 sum3 = *zt++; 697 sum4 = *zt++; 698 sum5 = *zt++; 699 v2 = v1 + n; 700 v3 = v2 + n; 701 v4 = v3 + n; 702 v5 = v4 + n; 703 704 for (n = 0; n<sz-1; n+=2) { 705 i1 = idx[0]; 706 i2 = idx[1]; 707 idx += 2; 708 tmp0 = x[i1]; 709 tmp1 = x[i2]; 710 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 711 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 712 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 713 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 714 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 715 } 716 if (n == sz-1) { 717 tmp0 = x[*idx++]; 718 sum1 += *v1++ * tmp0; 719 sum2 += *v2++ * tmp0; 720 sum3 += *v3++ * tmp0; 721 sum4 += *v4++ * tmp0; 722 sum5 += *v5++ * tmp0; 723 } 724 y[row++]=sum1; 725 y[row++]=sum2; 726 y[row++]=sum3; 727 y[row++]=sum4; 728 y[row++]=sum5; 729 v1 =v5; /* Since the next block to be processed starts there */ 730 idx +=4*sz; 731 break; 732 default: 733 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 734 } 735 } 736 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 737 ierr = VecRestoreArrayPair(zz,yy,&z,&y);CHKERRQ(ierr); 738 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 739 PetscFunctionReturn(0); 740 } 741 742 /* ----------------------------------------------------------- */ 743 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx) 744 { 745 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 746 IS iscol = a->col,isrow = a->row; 747 PetscErrorCode ierr; 748 const PetscInt *r,*c,*rout,*cout; 749 PetscInt i,j,n = A->rmap->n,nz; 750 PetscInt node_max,*ns,row,nsz,aii,i0,i1; 751 const PetscInt *ai = a->i,*a_j = a->j,*vi,*ad,*aj; 752 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 753 PetscScalar sum1,sum2,sum3,sum4,sum5; 754 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 755 const PetscScalar *b; 756 757 PetscFunctionBegin; 758 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 759 node_max = a->inode.node_count; 760 ns = a->inode.size; /* Node Size array */ 761 762 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 763 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 764 tmp = a->solve_work; 765 766 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 767 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 768 769 /* forward solve the lower triangular */ 770 tmps = tmp; 771 aa = a_a; 772 aj = a_j; 773 ad = a->diag; 774 775 for (i = 0,row = 0; i< node_max; ++i) { 776 nsz = ns[i]; 777 aii = ai[row]; 778 v1 = aa + aii; 779 vi = aj + aii; 780 nz = ad[row]- aii; 781 if (i < node_max-1) { 782 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 783 * but our indexing to determine it's size could. */ 784 PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 785 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 786 PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 787 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 788 } 789 790 switch (nsz) { /* Each loop in 'case' is unrolled */ 791 case 1: 792 sum1 = b[*r++]; 793 for (j=0; j<nz-1; j+=2) { 794 i0 = vi[0]; 795 i1 = vi[1]; 796 vi +=2; 797 tmp0 = tmps[i0]; 798 tmp1 = tmps[i1]; 799 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 800 } 801 if (j == nz-1) { 802 tmp0 = tmps[*vi++]; 803 sum1 -= *v1++ *tmp0; 804 } 805 tmp[row++]=sum1; 806 break; 807 case 2: 808 sum1 = b[*r++]; 809 sum2 = b[*r++]; 810 v2 = aa + ai[row+1]; 811 812 for (j=0; j<nz-1; j+=2) { 813 i0 = vi[0]; 814 i1 = vi[1]; 815 vi +=2; 816 tmp0 = tmps[i0]; 817 tmp1 = tmps[i1]; 818 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 819 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 820 } 821 if (j == nz-1) { 822 tmp0 = tmps[*vi++]; 823 sum1 -= *v1++ *tmp0; 824 sum2 -= *v2++ *tmp0; 825 } 826 sum2 -= *v2++ *sum1; 827 tmp[row++]=sum1; 828 tmp[row++]=sum2; 829 break; 830 case 3: 831 sum1 = b[*r++]; 832 sum2 = b[*r++]; 833 sum3 = b[*r++]; 834 v2 = aa + ai[row+1]; 835 v3 = aa + ai[row+2]; 836 837 for (j=0; j<nz-1; j+=2) { 838 i0 = vi[0]; 839 i1 = vi[1]; 840 vi +=2; 841 tmp0 = tmps[i0]; 842 tmp1 = tmps[i1]; 843 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 844 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 845 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 846 } 847 if (j == nz-1) { 848 tmp0 = tmps[*vi++]; 849 sum1 -= *v1++ *tmp0; 850 sum2 -= *v2++ *tmp0; 851 sum3 -= *v3++ *tmp0; 852 } 853 sum2 -= *v2++ * sum1; 854 sum3 -= *v3++ * sum1; 855 sum3 -= *v3++ * sum2; 856 857 tmp[row++]=sum1; 858 tmp[row++]=sum2; 859 tmp[row++]=sum3; 860 break; 861 862 case 4: 863 sum1 = b[*r++]; 864 sum2 = b[*r++]; 865 sum3 = b[*r++]; 866 sum4 = b[*r++]; 867 v2 = aa + ai[row+1]; 868 v3 = aa + ai[row+2]; 869 v4 = aa + ai[row+3]; 870 871 for (j=0; j<nz-1; j+=2) { 872 i0 = vi[0]; 873 i1 = vi[1]; 874 vi +=2; 875 tmp0 = tmps[i0]; 876 tmp1 = tmps[i1]; 877 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 878 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 879 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 880 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 881 } 882 if (j == nz-1) { 883 tmp0 = tmps[*vi++]; 884 sum1 -= *v1++ *tmp0; 885 sum2 -= *v2++ *tmp0; 886 sum3 -= *v3++ *tmp0; 887 sum4 -= *v4++ *tmp0; 888 } 889 sum2 -= *v2++ * sum1; 890 sum3 -= *v3++ * sum1; 891 sum4 -= *v4++ * sum1; 892 sum3 -= *v3++ * sum2; 893 sum4 -= *v4++ * sum2; 894 sum4 -= *v4++ * sum3; 895 896 tmp[row++]=sum1; 897 tmp[row++]=sum2; 898 tmp[row++]=sum3; 899 tmp[row++]=sum4; 900 break; 901 case 5: 902 sum1 = b[*r++]; 903 sum2 = b[*r++]; 904 sum3 = b[*r++]; 905 sum4 = b[*r++]; 906 sum5 = b[*r++]; 907 v2 = aa + ai[row+1]; 908 v3 = aa + ai[row+2]; 909 v4 = aa + ai[row+3]; 910 v5 = aa + ai[row+4]; 911 912 for (j=0; j<nz-1; j+=2) { 913 i0 = vi[0]; 914 i1 = vi[1]; 915 vi +=2; 916 tmp0 = tmps[i0]; 917 tmp1 = tmps[i1]; 918 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 919 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 920 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 921 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 922 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 923 } 924 if (j == nz-1) { 925 tmp0 = tmps[*vi++]; 926 sum1 -= *v1++ *tmp0; 927 sum2 -= *v2++ *tmp0; 928 sum3 -= *v3++ *tmp0; 929 sum4 -= *v4++ *tmp0; 930 sum5 -= *v5++ *tmp0; 931 } 932 933 sum2 -= *v2++ * sum1; 934 sum3 -= *v3++ * sum1; 935 sum4 -= *v4++ * sum1; 936 sum5 -= *v5++ * sum1; 937 sum3 -= *v3++ * sum2; 938 sum4 -= *v4++ * sum2; 939 sum5 -= *v5++ * sum2; 940 sum4 -= *v4++ * sum3; 941 sum5 -= *v5++ * sum3; 942 sum5 -= *v5++ * sum4; 943 944 tmp[row++]=sum1; 945 tmp[row++]=sum2; 946 tmp[row++]=sum3; 947 tmp[row++]=sum4; 948 tmp[row++]=sum5; 949 break; 950 default: 951 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 952 } 953 } 954 /* backward solve the upper triangular */ 955 for (i=node_max -1,row = n-1; i>=0; i--) { 956 nsz = ns[i]; 957 aii = ai[row+1] -1; 958 v1 = aa + aii; 959 vi = aj + aii; 960 nz = aii- ad[row]; 961 switch (nsz) { /* Each loop in 'case' is unrolled */ 962 case 1: 963 sum1 = tmp[row]; 964 965 for (j=nz; j>1; j-=2) { 966 vi -=2; 967 i0 = vi[2]; 968 i1 = vi[1]; 969 tmp0 = tmps[i0]; 970 tmp1 = tmps[i1]; 971 v1 -= 2; 972 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 973 } 974 if (j==1) { 975 tmp0 = tmps[*vi--]; 976 sum1 -= *v1-- * tmp0; 977 } 978 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 979 break; 980 case 2: 981 sum1 = tmp[row]; 982 sum2 = tmp[row -1]; 983 v2 = aa + ai[row]-1; 984 for (j=nz; j>1; j-=2) { 985 vi -=2; 986 i0 = vi[2]; 987 i1 = vi[1]; 988 tmp0 = tmps[i0]; 989 tmp1 = tmps[i1]; 990 v1 -= 2; 991 v2 -= 2; 992 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 993 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 994 } 995 if (j==1) { 996 tmp0 = tmps[*vi--]; 997 sum1 -= *v1-- * tmp0; 998 sum2 -= *v2-- * tmp0; 999 } 1000 1001 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1002 sum2 -= *v2-- * tmp0; 1003 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1004 break; 1005 case 3: 1006 sum1 = tmp[row]; 1007 sum2 = tmp[row -1]; 1008 sum3 = tmp[row -2]; 1009 v2 = aa + ai[row]-1; 1010 v3 = aa + ai[row -1]-1; 1011 for (j=nz; j>1; j-=2) { 1012 vi -=2; 1013 i0 = vi[2]; 1014 i1 = vi[1]; 1015 tmp0 = tmps[i0]; 1016 tmp1 = tmps[i1]; 1017 v1 -= 2; 1018 v2 -= 2; 1019 v3 -= 2; 1020 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1021 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1022 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1023 } 1024 if (j==1) { 1025 tmp0 = tmps[*vi--]; 1026 sum1 -= *v1-- * tmp0; 1027 sum2 -= *v2-- * tmp0; 1028 sum3 -= *v3-- * tmp0; 1029 } 1030 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1031 sum2 -= *v2-- * tmp0; 1032 sum3 -= *v3-- * tmp0; 1033 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1034 sum3 -= *v3-- * tmp0; 1035 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1036 1037 break; 1038 case 4: 1039 sum1 = tmp[row]; 1040 sum2 = tmp[row -1]; 1041 sum3 = tmp[row -2]; 1042 sum4 = tmp[row -3]; 1043 v2 = aa + ai[row]-1; 1044 v3 = aa + ai[row -1]-1; 1045 v4 = aa + ai[row -2]-1; 1046 1047 for (j=nz; j>1; j-=2) { 1048 vi -=2; 1049 i0 = vi[2]; 1050 i1 = vi[1]; 1051 tmp0 = tmps[i0]; 1052 tmp1 = tmps[i1]; 1053 v1 -= 2; 1054 v2 -= 2; 1055 v3 -= 2; 1056 v4 -= 2; 1057 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1058 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1059 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1060 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1061 } 1062 if (j==1) { 1063 tmp0 = tmps[*vi--]; 1064 sum1 -= *v1-- * tmp0; 1065 sum2 -= *v2-- * tmp0; 1066 sum3 -= *v3-- * tmp0; 1067 sum4 -= *v4-- * tmp0; 1068 } 1069 1070 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1071 sum2 -= *v2-- * tmp0; 1072 sum3 -= *v3-- * tmp0; 1073 sum4 -= *v4-- * tmp0; 1074 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1075 sum3 -= *v3-- * tmp0; 1076 sum4 -= *v4-- * tmp0; 1077 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1078 sum4 -= *v4-- * tmp0; 1079 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1080 break; 1081 case 5: 1082 sum1 = tmp[row]; 1083 sum2 = tmp[row -1]; 1084 sum3 = tmp[row -2]; 1085 sum4 = tmp[row -3]; 1086 sum5 = tmp[row -4]; 1087 v2 = aa + ai[row]-1; 1088 v3 = aa + ai[row -1]-1; 1089 v4 = aa + ai[row -2]-1; 1090 v5 = aa + ai[row -3]-1; 1091 for (j=nz; j>1; j-=2) { 1092 vi -= 2; 1093 i0 = vi[2]; 1094 i1 = vi[1]; 1095 tmp0 = tmps[i0]; 1096 tmp1 = tmps[i1]; 1097 v1 -= 2; 1098 v2 -= 2; 1099 v3 -= 2; 1100 v4 -= 2; 1101 v5 -= 2; 1102 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1103 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1104 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1105 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1106 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1107 } 1108 if (j==1) { 1109 tmp0 = tmps[*vi--]; 1110 sum1 -= *v1-- * tmp0; 1111 sum2 -= *v2-- * tmp0; 1112 sum3 -= *v3-- * tmp0; 1113 sum4 -= *v4-- * tmp0; 1114 sum5 -= *v5-- * tmp0; 1115 } 1116 1117 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1118 sum2 -= *v2-- * tmp0; 1119 sum3 -= *v3-- * tmp0; 1120 sum4 -= *v4-- * tmp0; 1121 sum5 -= *v5-- * tmp0; 1122 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1123 sum3 -= *v3-- * tmp0; 1124 sum4 -= *v4-- * tmp0; 1125 sum5 -= *v5-- * tmp0; 1126 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1127 sum4 -= *v4-- * tmp0; 1128 sum5 -= *v5-- * tmp0; 1129 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1130 sum5 -= *v5-- * tmp0; 1131 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1132 break; 1133 default: 1134 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 1135 } 1136 } 1137 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1138 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1139 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1140 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1141 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1142 PetscFunctionReturn(0); 1143 } 1144 1145 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info) 1146 { 1147 Mat C =B; 1148 Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data; 1149 IS isrow = b->row,isicol = b->icol; 1150 PetscErrorCode ierr; 1151 const PetscInt *r,*ic,*ics; 1152 const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag; 1153 PetscInt i,j,k,nz,nzL,row,*pj; 1154 const PetscInt *ajtmp,*bjtmp; 1155 MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4; 1156 const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4; 1157 FactorShiftCtx sctx; 1158 const PetscInt *ddiag; 1159 PetscReal rs; 1160 MatScalar d; 1161 PetscInt inod,nodesz,node_max,col; 1162 const PetscInt *ns; 1163 PetscInt *tmp_vec1,*tmp_vec2,*nsmap; 1164 1165 PetscFunctionBegin; 1166 /* MatPivotSetUp(): initialize shift context sctx */ 1167 ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr); 1168 1169 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1170 ddiag = a->diag; 1171 sctx.shift_top = info->zeropivot; 1172 for (i=0; i<n; i++) { 1173 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1174 d = (aa)[ddiag[i]]; 1175 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1176 v = aa+ai[i]; 1177 nz = ai[i+1] - ai[i]; 1178 for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]); 1179 if (rs>sctx.shift_top) sctx.shift_top = rs; 1180 } 1181 sctx.shift_top *= 1.1; 1182 sctx.nshift_max = 5; 1183 sctx.shift_lo = 0.; 1184 sctx.shift_hi = 1.; 1185 } 1186 1187 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1188 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1189 1190 ierr = PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);CHKERRQ(ierr); 1191 ics = ic; 1192 1193 node_max = a->inode.node_count; 1194 ns = a->inode.size; 1195 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1196 1197 /* If max inode size > 4, split it into two inodes.*/ 1198 /* also map the inode sizes according to the ordering */ 1199 ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr); 1200 for (i=0,j=0; i<node_max; ++i,++j) { 1201 if (ns[i] > 4) { 1202 tmp_vec1[j] = 4; 1203 ++j; 1204 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1205 } else { 1206 tmp_vec1[j] = ns[i]; 1207 } 1208 } 1209 /* Use the correct node_max */ 1210 node_max = j; 1211 1212 /* Now reorder the inode info based on mat re-ordering info */ 1213 /* First create a row -> inode_size_array_index map */ 1214 ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr); 1215 ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr); 1216 for (i=0,row=0; i<node_max; i++) { 1217 nodesz = tmp_vec1[i]; 1218 for (j=0; j<nodesz; j++,row++) { 1219 nsmap[row] = i; 1220 } 1221 } 1222 /* Using nsmap, create a reordered ns structure */ 1223 for (i=0,j=0; i< node_max; i++) { 1224 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1225 tmp_vec2[i] = nodesz; 1226 j += nodesz; 1227 } 1228 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1229 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1230 1231 /* Now use the correct ns */ 1232 ns = tmp_vec2; 1233 1234 do { 1235 sctx.newshift = PETSC_FALSE; 1236 /* Now loop over each block-row, and do the factorization */ 1237 for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */ 1238 nodesz = ns[inod]; 1239 1240 switch (nodesz) { 1241 case 1: 1242 /*----------*/ 1243 /* zero rtmp1 */ 1244 /* L part */ 1245 nz = bi[i+1] - bi[i]; 1246 bjtmp = bj + bi[i]; 1247 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1248 1249 /* U part */ 1250 nz = bdiag[i]-bdiag[i+1]; 1251 bjtmp = bj + bdiag[i+1]+1; 1252 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1253 1254 /* load in initial (unfactored row) */ 1255 nz = ai[r[i]+1] - ai[r[i]]; 1256 ajtmp = aj + ai[r[i]]; 1257 v = aa + ai[r[i]]; 1258 for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j]; 1259 1260 /* ZeropivotApply() */ 1261 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1262 1263 /* elimination */ 1264 bjtmp = bj + bi[i]; 1265 row = *bjtmp++; 1266 nzL = bi[i+1] - bi[i]; 1267 for (k=0; k < nzL; k++) { 1268 pc = rtmp1 + row; 1269 if (*pc != 0.0) { 1270 pv = b->a + bdiag[row]; 1271 mul1 = *pc * (*pv); 1272 *pc = mul1; 1273 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1274 pv = b->a + bdiag[row+1]+1; 1275 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1276 for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1277 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1278 } 1279 row = *bjtmp++; 1280 } 1281 1282 /* finished row so stick it into b->a */ 1283 rs = 0.0; 1284 /* L part */ 1285 pv = b->a + bi[i]; 1286 pj = b->j + bi[i]; 1287 nz = bi[i+1] - bi[i]; 1288 for (j=0; j<nz; j++) { 1289 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1290 } 1291 1292 /* U part */ 1293 pv = b->a + bdiag[i+1]+1; 1294 pj = b->j + bdiag[i+1]+1; 1295 nz = bdiag[i] - bdiag[i+1]-1; 1296 for (j=0; j<nz; j++) { 1297 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1298 } 1299 1300 /* Check zero pivot */ 1301 sctx.rs = rs; 1302 sctx.pv = rtmp1[i]; 1303 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr); 1304 if (sctx.newshift) break; 1305 1306 /* Mark diagonal and invert diagonal for simplier triangular solves */ 1307 pv = b->a + bdiag[i]; 1308 *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1309 break; 1310 1311 case 2: 1312 /*----------*/ 1313 /* zero rtmp1 and rtmp2 */ 1314 /* L part */ 1315 nz = bi[i+1] - bi[i]; 1316 bjtmp = bj + bi[i]; 1317 for (j=0; j<nz; j++) { 1318 col = bjtmp[j]; 1319 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1320 } 1321 1322 /* U part */ 1323 nz = bdiag[i]-bdiag[i+1]; 1324 bjtmp = bj + bdiag[i+1]+1; 1325 for (j=0; j<nz; j++) { 1326 col = bjtmp[j]; 1327 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1328 } 1329 1330 /* load in initial (unfactored row) */ 1331 nz = ai[r[i]+1] - ai[r[i]]; 1332 ajtmp = aj + ai[r[i]]; 1333 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; 1334 for (j=0; j<nz; j++) { 1335 col = ics[ajtmp[j]]; 1336 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; 1337 } 1338 /* ZeropivotApply(): shift the diagonal of the matrix */ 1339 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; 1340 1341 /* elimination */ 1342 bjtmp = bj + bi[i]; 1343 row = *bjtmp++; /* pivot row */ 1344 nzL = bi[i+1] - bi[i]; 1345 for (k=0; k < nzL; k++) { 1346 pc1 = rtmp1 + row; 1347 pc2 = rtmp2 + row; 1348 if (*pc1 != 0.0 || *pc2 != 0.0) { 1349 pv = b->a + bdiag[row]; 1350 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); 1351 *pc1 = mul1; *pc2 = mul2; 1352 1353 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1354 pv = b->a + bdiag[row+1]+1; 1355 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1356 for (j=0; j<nz; j++) { 1357 col = pj[j]; 1358 rtmp1[col] -= mul1 * pv[j]; 1359 rtmp2[col] -= mul2 * pv[j]; 1360 } 1361 ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr); 1362 } 1363 row = *bjtmp++; 1364 } 1365 1366 /* finished row i; check zero pivot, then stick row i into b->a */ 1367 rs = 0.0; 1368 /* L part */ 1369 pc1 = b->a + bi[i]; 1370 pj = b->j + bi[i]; 1371 nz = bi[i+1] - bi[i]; 1372 for (j=0; j<nz; j++) { 1373 col = pj[j]; 1374 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1375 } 1376 /* U part */ 1377 pc1 = b->a + bdiag[i+1]+1; 1378 pj = b->j + bdiag[i+1]+1; 1379 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1380 for (j=0; j<nz; j++) { 1381 col = pj[j]; 1382 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1383 } 1384 1385 sctx.rs = rs; 1386 sctx.pv = rtmp1[i]; 1387 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr); 1388 if (sctx.newshift) break; 1389 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1390 *pc1 = 1.0/sctx.pv; 1391 1392 /* Now take care of diagonal 2x2 block. */ 1393 pc2 = rtmp2 + i; 1394 if (*pc2 != 0.0) { 1395 mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */ 1396 *pc2 = mul1; /* insert L entry */ 1397 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1398 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1399 for (j=0; j<nz; j++) { 1400 col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col]; 1401 } 1402 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1403 } 1404 1405 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1406 rs = 0.0; 1407 /* L part */ 1408 pc2 = b->a + bi[i+1]; 1409 pj = b->j + bi[i+1]; 1410 nz = bi[i+2] - bi[i+1]; 1411 for (j=0; j<nz; j++) { 1412 col = pj[j]; 1413 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1414 } 1415 /* U part */ 1416 pc2 = b->a + bdiag[i+2]+1; 1417 pj = b->j + bdiag[i+2]+1; 1418 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1419 for (j=0; j<nz; j++) { 1420 col = pj[j]; 1421 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1422 } 1423 1424 sctx.rs = rs; 1425 sctx.pv = rtmp2[i+1]; 1426 ierr = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr); 1427 if (sctx.newshift) break; 1428 pc2 = b->a + bdiag[i+1]; 1429 *pc2 = 1.0/sctx.pv; 1430 break; 1431 1432 case 3: 1433 /*----------*/ 1434 /* zero rtmp */ 1435 /* L part */ 1436 nz = bi[i+1] - bi[i]; 1437 bjtmp = bj + bi[i]; 1438 for (j=0; j<nz; j++) { 1439 col = bjtmp[j]; 1440 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1441 } 1442 1443 /* U part */ 1444 nz = bdiag[i]-bdiag[i+1]; 1445 bjtmp = bj + bdiag[i+1]+1; 1446 for (j=0; j<nz; j++) { 1447 col = bjtmp[j]; 1448 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1449 } 1450 1451 /* load in initial (unfactored row) */ 1452 nz = ai[r[i]+1] - ai[r[i]]; 1453 ajtmp = aj + ai[r[i]]; 1454 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; 1455 for (j=0; j<nz; j++) { 1456 col = ics[ajtmp[j]]; 1457 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; 1458 } 1459 /* ZeropivotApply(): shift the diagonal of the matrix */ 1460 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; 1461 1462 /* elimination */ 1463 bjtmp = bj + bi[i]; 1464 row = *bjtmp++; /* pivot row */ 1465 nzL = bi[i+1] - bi[i]; 1466 for (k=0; k < nzL; k++) { 1467 pc1 = rtmp1 + row; 1468 pc2 = rtmp2 + row; 1469 pc3 = rtmp3 + row; 1470 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1471 pv = b->a + bdiag[row]; 1472 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); 1473 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; 1474 1475 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1476 pv = b->a + bdiag[row+1]+1; 1477 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1478 for (j=0; j<nz; j++) { 1479 col = pj[j]; 1480 rtmp1[col] -= mul1 * pv[j]; 1481 rtmp2[col] -= mul2 * pv[j]; 1482 rtmp3[col] -= mul3 * pv[j]; 1483 } 1484 ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr); 1485 } 1486 row = *bjtmp++; 1487 } 1488 1489 /* finished row i; check zero pivot, then stick row i into b->a */ 1490 rs = 0.0; 1491 /* L part */ 1492 pc1 = b->a + bi[i]; 1493 pj = b->j + bi[i]; 1494 nz = bi[i+1] - bi[i]; 1495 for (j=0; j<nz; j++) { 1496 col = pj[j]; 1497 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1498 } 1499 /* U part */ 1500 pc1 = b->a + bdiag[i+1]+1; 1501 pj = b->j + bdiag[i+1]+1; 1502 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1503 for (j=0; j<nz; j++) { 1504 col = pj[j]; 1505 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1506 } 1507 1508 sctx.rs = rs; 1509 sctx.pv = rtmp1[i]; 1510 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr); 1511 if (sctx.newshift) break; 1512 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1513 *pc1 = 1.0/sctx.pv; 1514 1515 /* Now take care of 1st column of diagonal 3x3 block. */ 1516 pc2 = rtmp2 + i; 1517 pc3 = rtmp3 + i; 1518 if (*pc2 != 0.0 || *pc3 != 0.0) { 1519 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1520 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1521 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1522 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1523 for (j=0; j<nz; j++) { 1524 col = pj[j]; 1525 rtmp2[col] -= mul2 * rtmp1[col]; 1526 rtmp3[col] -= mul3 * rtmp1[col]; 1527 } 1528 ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr); 1529 } 1530 1531 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1532 rs = 0.0; 1533 /* L part */ 1534 pc2 = b->a + bi[i+1]; 1535 pj = b->j + bi[i+1]; 1536 nz = bi[i+2] - bi[i+1]; 1537 for (j=0; j<nz; j++) { 1538 col = pj[j]; 1539 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1540 } 1541 /* U part */ 1542 pc2 = b->a + bdiag[i+2]+1; 1543 pj = b->j + bdiag[i+2]+1; 1544 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1545 for (j=0; j<nz; j++) { 1546 col = pj[j]; 1547 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1548 } 1549 1550 sctx.rs = rs; 1551 sctx.pv = rtmp2[i+1]; 1552 ierr = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr); 1553 if (sctx.newshift) break; 1554 pc2 = b->a + bdiag[i+1]; 1555 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1556 1557 /* Now take care of 2nd column of diagonal 3x3 block. */ 1558 pc3 = rtmp3 + i+1; 1559 if (*pc3 != 0.0) { 1560 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1561 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1562 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1563 for (j=0; j<nz; j++) { 1564 col = pj[j]; 1565 rtmp3[col] -= mul3 * rtmp2[col]; 1566 } 1567 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1568 } 1569 1570 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1571 rs = 0.0; 1572 /* L part */ 1573 pc3 = b->a + bi[i+2]; 1574 pj = b->j + bi[i+2]; 1575 nz = bi[i+3] - bi[i+2]; 1576 for (j=0; j<nz; j++) { 1577 col = pj[j]; 1578 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1579 } 1580 /* U part */ 1581 pc3 = b->a + bdiag[i+3]+1; 1582 pj = b->j + bdiag[i+3]+1; 1583 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1584 for (j=0; j<nz; j++) { 1585 col = pj[j]; 1586 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1587 } 1588 1589 sctx.rs = rs; 1590 sctx.pv = rtmp3[i+2]; 1591 ierr = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr); 1592 if (sctx.newshift) break; 1593 pc3 = b->a + bdiag[i+2]; 1594 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1595 break; 1596 case 4: 1597 /*----------*/ 1598 /* zero rtmp */ 1599 /* L part */ 1600 nz = bi[i+1] - bi[i]; 1601 bjtmp = bj + bi[i]; 1602 for (j=0; j<nz; j++) { 1603 col = bjtmp[j]; 1604 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0; 1605 } 1606 1607 /* U part */ 1608 nz = bdiag[i]-bdiag[i+1]; 1609 bjtmp = bj + bdiag[i+1]+1; 1610 for (j=0; j<nz; j++) { 1611 col = bjtmp[j]; 1612 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0; 1613 } 1614 1615 /* load in initial (unfactored row) */ 1616 nz = ai[r[i]+1] - ai[r[i]]; 1617 ajtmp = aj + ai[r[i]]; 1618 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3]; 1619 for (j=0; j<nz; j++) { 1620 col = ics[ajtmp[j]]; 1621 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j]; 1622 } 1623 /* ZeropivotApply(): shift the diagonal of the matrix */ 1624 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount; 1625 1626 /* elimination */ 1627 bjtmp = bj + bi[i]; 1628 row = *bjtmp++; /* pivot row */ 1629 nzL = bi[i+1] - bi[i]; 1630 for (k=0; k < nzL; k++) { 1631 pc1 = rtmp1 + row; 1632 pc2 = rtmp2 + row; 1633 pc3 = rtmp3 + row; 1634 pc4 = rtmp4 + row; 1635 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1636 pv = b->a + bdiag[row]; 1637 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv); 1638 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4; 1639 1640 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1641 pv = b->a + bdiag[row+1]+1; 1642 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1643 for (j=0; j<nz; j++) { 1644 col = pj[j]; 1645 rtmp1[col] -= mul1 * pv[j]; 1646 rtmp2[col] -= mul2 * pv[j]; 1647 rtmp3[col] -= mul3 * pv[j]; 1648 rtmp4[col] -= mul4 * pv[j]; 1649 } 1650 ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr); 1651 } 1652 row = *bjtmp++; 1653 } 1654 1655 /* finished row i; check zero pivot, then stick row i into b->a */ 1656 rs = 0.0; 1657 /* L part */ 1658 pc1 = b->a + bi[i]; 1659 pj = b->j + bi[i]; 1660 nz = bi[i+1] - bi[i]; 1661 for (j=0; j<nz; j++) { 1662 col = pj[j]; 1663 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1664 } 1665 /* U part */ 1666 pc1 = b->a + bdiag[i+1]+1; 1667 pj = b->j + bdiag[i+1]+1; 1668 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1669 for (j=0; j<nz; j++) { 1670 col = pj[j]; 1671 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1672 } 1673 1674 sctx.rs = rs; 1675 sctx.pv = rtmp1[i]; 1676 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr); 1677 if (sctx.newshift) break; 1678 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1679 *pc1 = 1.0/sctx.pv; 1680 1681 /* Now take care of 1st column of diagonal 4x4 block. */ 1682 pc2 = rtmp2 + i; 1683 pc3 = rtmp3 + i; 1684 pc4 = rtmp4 + i; 1685 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1686 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1687 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1688 mul4 = (*pc4)*(*pc1); *pc4 = mul4; 1689 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1690 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1691 for (j=0; j<nz; j++) { 1692 col = pj[j]; 1693 rtmp2[col] -= mul2 * rtmp1[col]; 1694 rtmp3[col] -= mul3 * rtmp1[col]; 1695 rtmp4[col] -= mul4 * rtmp1[col]; 1696 } 1697 ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr); 1698 } 1699 1700 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1701 rs = 0.0; 1702 /* L part */ 1703 pc2 = b->a + bi[i+1]; 1704 pj = b->j + bi[i+1]; 1705 nz = bi[i+2] - bi[i+1]; 1706 for (j=0; j<nz; j++) { 1707 col = pj[j]; 1708 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1709 } 1710 /* U part */ 1711 pc2 = b->a + bdiag[i+2]+1; 1712 pj = b->j + bdiag[i+2]+1; 1713 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1714 for (j=0; j<nz; j++) { 1715 col = pj[j]; 1716 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1717 } 1718 1719 sctx.rs = rs; 1720 sctx.pv = rtmp2[i+1]; 1721 ierr = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr); 1722 if (sctx.newshift) break; 1723 pc2 = b->a + bdiag[i+1]; 1724 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1725 1726 /* Now take care of 2nd column of diagonal 4x4 block. */ 1727 pc3 = rtmp3 + i+1; 1728 pc4 = rtmp4 + i+1; 1729 if (*pc3 != 0.0 || *pc4 != 0.0) { 1730 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1731 mul4 = (*pc4)*(*pc2); *pc4 = mul4; 1732 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1733 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1734 for (j=0; j<nz; j++) { 1735 col = pj[j]; 1736 rtmp3[col] -= mul3 * rtmp2[col]; 1737 rtmp4[col] -= mul4 * rtmp2[col]; 1738 } 1739 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1740 } 1741 1742 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1743 rs = 0.0; 1744 /* L part */ 1745 pc3 = b->a + bi[i+2]; 1746 pj = b->j + bi[i+2]; 1747 nz = bi[i+3] - bi[i+2]; 1748 for (j=0; j<nz; j++) { 1749 col = pj[j]; 1750 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1751 } 1752 /* U part */ 1753 pc3 = b->a + bdiag[i+3]+1; 1754 pj = b->j + bdiag[i+3]+1; 1755 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1756 for (j=0; j<nz; j++) { 1757 col = pj[j]; 1758 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1759 } 1760 1761 sctx.rs = rs; 1762 sctx.pv = rtmp3[i+2]; 1763 ierr = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr); 1764 if (sctx.newshift) break; 1765 pc3 = b->a + bdiag[i+2]; 1766 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1767 1768 /* Now take care of 3rd column of diagonal 4x4 block. */ 1769 pc4 = rtmp4 + i+2; 1770 if (*pc4 != 0.0) { 1771 mul4 = (*pc4)*(*pc3); *pc4 = mul4; 1772 pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */ 1773 nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */ 1774 for (j=0; j<nz; j++) { 1775 col = pj[j]; 1776 rtmp4[col] -= mul4 * rtmp3[col]; 1777 } 1778 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1779 } 1780 1781 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1782 rs = 0.0; 1783 /* L part */ 1784 pc4 = b->a + bi[i+3]; 1785 pj = b->j + bi[i+3]; 1786 nz = bi[i+4] - bi[i+3]; 1787 for (j=0; j<nz; j++) { 1788 col = pj[j]; 1789 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1790 } 1791 /* U part */ 1792 pc4 = b->a + bdiag[i+4]+1; 1793 pj = b->j + bdiag[i+4]+1; 1794 nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */ 1795 for (j=0; j<nz; j++) { 1796 col = pj[j]; 1797 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1798 } 1799 1800 sctx.rs = rs; 1801 sctx.pv = rtmp4[i+3]; 1802 ierr = MatPivotCheck(B,A,info,&sctx,i+3);CHKERRQ(ierr); 1803 if (sctx.newshift) break; 1804 pc4 = b->a + bdiag[i+3]; 1805 *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */ 1806 break; 1807 1808 default: 1809 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 1810 } 1811 if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 1812 i += nodesz; /* Update the row */ 1813 } 1814 1815 /* MatPivotRefine() */ 1816 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) { 1817 /* 1818 * if no shift in this attempt & shifting & started shifting & can refine, 1819 * then try lower shift 1820 */ 1821 sctx.shift_hi = sctx.shift_fraction; 1822 sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.; 1823 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1824 sctx.newshift = PETSC_TRUE; 1825 sctx.nshift++; 1826 } 1827 } while (sctx.newshift); 1828 1829 ierr = PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);CHKERRQ(ierr); 1830 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1831 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1832 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1833 1834 if (b->inode.size) { 1835 C->ops->solve = MatSolve_SeqAIJ_Inode; 1836 } else { 1837 C->ops->solve = MatSolve_SeqAIJ; 1838 } 1839 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1840 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1841 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1842 C->ops->matsolve = MatMatSolve_SeqAIJ; 1843 C->assembled = PETSC_TRUE; 1844 C->preallocated = PETSC_TRUE; 1845 1846 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 1847 1848 /* MatShiftView(A,info,&sctx) */ 1849 if (sctx.nshift) { 1850 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) { 1851 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr); 1852 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1853 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr); 1854 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 1855 ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);CHKERRQ(ierr); 1856 } 1857 } 1858 PetscFunctionReturn(0); 1859 } 1860 1861 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info) 1862 { 1863 Mat C = B; 1864 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1865 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1866 PetscErrorCode ierr; 1867 const PetscInt *r,*ic,*c,*ics; 1868 PetscInt n = A->rmap->n,*bi = b->i; 1869 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1870 PetscInt i,j,idx,*bd = b->diag,node_max,nodesz; 1871 PetscInt *ai = a->i,*aj = a->j; 1872 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1873 PetscScalar mul1,mul2,mul3,tmp; 1874 MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; 1875 const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; 1876 PetscReal rs=0.0; 1877 FactorShiftCtx sctx; 1878 1879 PetscFunctionBegin; 1880 sctx.shift_top = 0; 1881 sctx.nshift_max = 0; 1882 sctx.shift_lo = 0; 1883 sctx.shift_hi = 0; 1884 sctx.shift_fraction = 0; 1885 1886 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1887 if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1888 sctx.shift_top = 0; 1889 for (i=0; i<n; i++) { 1890 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1891 rs = 0.0; 1892 ajtmp = aj + ai[i]; 1893 rtmp1 = aa + ai[i]; 1894 nz = ai[i+1] - ai[i]; 1895 for (j=0; j<nz; j++) { 1896 if (*ajtmp != i) { 1897 rs += PetscAbsScalar(*rtmp1++); 1898 } else { 1899 rs -= PetscRealPart(*rtmp1++); 1900 } 1901 ajtmp++; 1902 } 1903 if (rs>sctx.shift_top) sctx.shift_top = rs; 1904 } 1905 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1906 sctx.shift_top *= 1.1; 1907 sctx.nshift_max = 5; 1908 sctx.shift_lo = 0.; 1909 sctx.shift_hi = 1.; 1910 } 1911 sctx.shift_amount = 0; 1912 sctx.nshift = 0; 1913 1914 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1915 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1916 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1917 ierr = PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);CHKERRQ(ierr); 1918 ics = ic; 1919 1920 node_max = a->inode.node_count; 1921 ns = a->inode.size; 1922 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1923 1924 /* If max inode size > 3, split it into two inodes.*/ 1925 /* also map the inode sizes according to the ordering */ 1926 ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr); 1927 for (i=0,j=0; i<node_max; ++i,++j) { 1928 if (ns[i]>3) { 1929 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1930 ++j; 1931 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1932 } else { 1933 tmp_vec1[j] = ns[i]; 1934 } 1935 } 1936 /* Use the correct node_max */ 1937 node_max = j; 1938 1939 /* Now reorder the inode info based on mat re-ordering info */ 1940 /* First create a row -> inode_size_array_index map */ 1941 ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr); 1942 ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr); 1943 for (i=0,row=0; i<node_max; i++) { 1944 nodesz = tmp_vec1[i]; 1945 for (j=0; j<nodesz; j++,row++) { 1946 nsmap[row] = i; 1947 } 1948 } 1949 /* Using nsmap, create a reordered ns structure */ 1950 for (i=0,j=0; i< node_max; i++) { 1951 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1952 tmp_vec2[i] = nodesz; 1953 j += nodesz; 1954 } 1955 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1956 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1957 /* Now use the correct ns */ 1958 ns = tmp_vec2; 1959 1960 do { 1961 sctx.newshift = PETSC_FALSE; 1962 /* Now loop over each block-row, and do the factorization */ 1963 for (i=0,row=0; i<node_max; i++) { 1964 nodesz = ns[i]; 1965 nz = bi[row+1] - bi[row]; 1966 bjtmp = bj + bi[row]; 1967 1968 switch (nodesz) { 1969 case 1: 1970 for (j=0; j<nz; j++) { 1971 idx = bjtmp[j]; 1972 rtmp11[idx] = 0.0; 1973 } 1974 1975 /* load in initial (unfactored row) */ 1976 idx = r[row]; 1977 nz_tmp = ai[idx+1] - ai[idx]; 1978 ajtmp = aj + ai[idx]; 1979 v1 = aa + ai[idx]; 1980 1981 for (j=0; j<nz_tmp; j++) { 1982 idx = ics[ajtmp[j]]; 1983 rtmp11[idx] = v1[j]; 1984 } 1985 rtmp11[ics[r[row]]] += sctx.shift_amount; 1986 1987 prow = *bjtmp++; 1988 while (prow < row) { 1989 pc1 = rtmp11 + prow; 1990 if (*pc1 != 0.0) { 1991 pv = ba + bd[prow]; 1992 pj = nbj + bd[prow]; 1993 mul1 = *pc1 * *pv++; 1994 *pc1 = mul1; 1995 nz_tmp = bi[prow+1] - bd[prow] - 1; 1996 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 1997 for (j=0; j<nz_tmp; j++) { 1998 tmp = pv[j]; 1999 idx = pj[j]; 2000 rtmp11[idx] -= mul1 * tmp; 2001 } 2002 } 2003 prow = *bjtmp++; 2004 } 2005 pj = bj + bi[row]; 2006 pc1 = ba + bi[row]; 2007 2008 sctx.pv = rtmp11[row]; 2009 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 2010 rs = 0.0; 2011 for (j=0; j<nz; j++) { 2012 idx = pj[j]; 2013 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2014 if (idx != row) rs += PetscAbsScalar(pc1[j]); 2015 } 2016 sctx.rs = rs; 2017 ierr = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr); 2018 if (sctx.newshift) goto endofwhile; 2019 break; 2020 2021 case 2: 2022 for (j=0; j<nz; j++) { 2023 idx = bjtmp[j]; 2024 rtmp11[idx] = 0.0; 2025 rtmp22[idx] = 0.0; 2026 } 2027 2028 /* load in initial (unfactored row) */ 2029 idx = r[row]; 2030 nz_tmp = ai[idx+1] - ai[idx]; 2031 ajtmp = aj + ai[idx]; 2032 v1 = aa + ai[idx]; 2033 v2 = aa + ai[idx+1]; 2034 for (j=0; j<nz_tmp; j++) { 2035 idx = ics[ajtmp[j]]; 2036 rtmp11[idx] = v1[j]; 2037 rtmp22[idx] = v2[j]; 2038 } 2039 rtmp11[ics[r[row]]] += sctx.shift_amount; 2040 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2041 2042 prow = *bjtmp++; 2043 while (prow < row) { 2044 pc1 = rtmp11 + prow; 2045 pc2 = rtmp22 + prow; 2046 if (*pc1 != 0.0 || *pc2 != 0.0) { 2047 pv = ba + bd[prow]; 2048 pj = nbj + bd[prow]; 2049 mul1 = *pc1 * *pv; 2050 mul2 = *pc2 * *pv; 2051 ++pv; 2052 *pc1 = mul1; 2053 *pc2 = mul2; 2054 2055 nz_tmp = bi[prow+1] - bd[prow] - 1; 2056 for (j=0; j<nz_tmp; j++) { 2057 tmp = pv[j]; 2058 idx = pj[j]; 2059 rtmp11[idx] -= mul1 * tmp; 2060 rtmp22[idx] -= mul2 * tmp; 2061 } 2062 ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr); 2063 } 2064 prow = *bjtmp++; 2065 } 2066 2067 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2068 pc1 = rtmp11 + prow; 2069 pc2 = rtmp22 + prow; 2070 2071 sctx.pv = *pc1; 2072 pj = bj + bi[prow]; 2073 rs = 0.0; 2074 for (j=0; j<nz; j++) { 2075 idx = pj[j]; 2076 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2077 } 2078 sctx.rs = rs; 2079 ierr = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr); 2080 if (sctx.newshift) goto endofwhile; 2081 2082 if (*pc2 != 0.0) { 2083 pj = nbj + bd[prow]; 2084 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 2085 *pc2 = mul2; 2086 nz_tmp = bi[prow+1] - bd[prow] - 1; 2087 for (j=0; j<nz_tmp; j++) { 2088 idx = pj[j]; 2089 tmp = rtmp11[idx]; 2090 rtmp22[idx] -= mul2 * tmp; 2091 } 2092 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2093 } 2094 2095 pj = bj + bi[row]; 2096 pc1 = ba + bi[row]; 2097 pc2 = ba + bi[row+1]; 2098 2099 sctx.pv = rtmp22[row+1]; 2100 rs = 0.0; 2101 rtmp11[row] = 1.0/rtmp11[row]; 2102 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2103 /* copy row entries from dense representation to sparse */ 2104 for (j=0; j<nz; j++) { 2105 idx = pj[j]; 2106 pc1[j] = rtmp11[idx]; 2107 pc2[j] = rtmp22[idx]; 2108 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 2109 } 2110 sctx.rs = rs; 2111 ierr = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr); 2112 if (sctx.newshift) goto endofwhile; 2113 break; 2114 2115 case 3: 2116 for (j=0; j<nz; j++) { 2117 idx = bjtmp[j]; 2118 rtmp11[idx] = 0.0; 2119 rtmp22[idx] = 0.0; 2120 rtmp33[idx] = 0.0; 2121 } 2122 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2123 idx = r[row]; 2124 nz_tmp = ai[idx+1] - ai[idx]; 2125 ajtmp = aj + ai[idx]; 2126 v1 = aa + ai[idx]; 2127 v2 = aa + ai[idx+1]; 2128 v3 = aa + ai[idx+2]; 2129 for (j=0; j<nz_tmp; j++) { 2130 idx = ics[ajtmp[j]]; 2131 rtmp11[idx] = v1[j]; 2132 rtmp22[idx] = v2[j]; 2133 rtmp33[idx] = v3[j]; 2134 } 2135 rtmp11[ics[r[row]]] += sctx.shift_amount; 2136 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2137 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 2138 2139 /* loop over all pivot row blocks above this row block */ 2140 prow = *bjtmp++; 2141 while (prow < row) { 2142 pc1 = rtmp11 + prow; 2143 pc2 = rtmp22 + prow; 2144 pc3 = rtmp33 + prow; 2145 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) { 2146 pv = ba + bd[prow]; 2147 pj = nbj + bd[prow]; 2148 mul1 = *pc1 * *pv; 2149 mul2 = *pc2 * *pv; 2150 mul3 = *pc3 * *pv; 2151 ++pv; 2152 *pc1 = mul1; 2153 *pc2 = mul2; 2154 *pc3 = mul3; 2155 2156 nz_tmp = bi[prow+1] - bd[prow] - 1; 2157 /* update this row based on pivot row */ 2158 for (j=0; j<nz_tmp; j++) { 2159 tmp = pv[j]; 2160 idx = pj[j]; 2161 rtmp11[idx] -= mul1 * tmp; 2162 rtmp22[idx] -= mul2 * tmp; 2163 rtmp33[idx] -= mul3 * tmp; 2164 } 2165 ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr); 2166 } 2167 prow = *bjtmp++; 2168 } 2169 2170 /* Now take care of diagonal 3x3 block in this set of rows */ 2171 /* note: prow = row here */ 2172 pc1 = rtmp11 + prow; 2173 pc2 = rtmp22 + prow; 2174 pc3 = rtmp33 + prow; 2175 2176 sctx.pv = *pc1; 2177 pj = bj + bi[prow]; 2178 rs = 0.0; 2179 for (j=0; j<nz; j++) { 2180 idx = pj[j]; 2181 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2182 } 2183 sctx.rs = rs; 2184 ierr = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr); 2185 if (sctx.newshift) goto endofwhile; 2186 2187 if (*pc2 != 0.0 || *pc3 != 0.0) { 2188 mul2 = (*pc2)/(*pc1); 2189 mul3 = (*pc3)/(*pc1); 2190 *pc2 = mul2; 2191 *pc3 = mul3; 2192 nz_tmp = bi[prow+1] - bd[prow] - 1; 2193 pj = nbj + bd[prow]; 2194 for (j=0; j<nz_tmp; j++) { 2195 idx = pj[j]; 2196 tmp = rtmp11[idx]; 2197 rtmp22[idx] -= mul2 * tmp; 2198 rtmp33[idx] -= mul3 * tmp; 2199 } 2200 ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr); 2201 } 2202 ++prow; 2203 2204 pc2 = rtmp22 + prow; 2205 pc3 = rtmp33 + prow; 2206 sctx.pv = *pc2; 2207 pj = bj + bi[prow]; 2208 rs = 0.0; 2209 for (j=0; j<nz; j++) { 2210 idx = pj[j]; 2211 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2212 } 2213 sctx.rs = rs; 2214 ierr = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr); 2215 if (sctx.newshift) goto endofwhile; 2216 2217 if (*pc3 != 0.0) { 2218 mul3 = (*pc3)/(*pc2); 2219 *pc3 = mul3; 2220 pj = nbj + bd[prow]; 2221 nz_tmp = bi[prow+1] - bd[prow] - 1; 2222 for (j=0; j<nz_tmp; j++) { 2223 idx = pj[j]; 2224 tmp = rtmp22[idx]; 2225 rtmp33[idx] -= mul3 * tmp; 2226 } 2227 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2228 } 2229 2230 pj = bj + bi[row]; 2231 pc1 = ba + bi[row]; 2232 pc2 = ba + bi[row+1]; 2233 pc3 = ba + bi[row+2]; 2234 2235 sctx.pv = rtmp33[row+2]; 2236 rs = 0.0; 2237 rtmp11[row] = 1.0/rtmp11[row]; 2238 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2239 rtmp33[row+2] = 1.0/rtmp33[row+2]; 2240 /* copy row entries from dense representation to sparse */ 2241 for (j=0; j<nz; j++) { 2242 idx = pj[j]; 2243 pc1[j] = rtmp11[idx]; 2244 pc2[j] = rtmp22[idx]; 2245 pc3[j] = rtmp33[idx]; 2246 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 2247 } 2248 2249 sctx.rs = rs; 2250 ierr = MatPivotCheck(B,A,info,&sctx,row+2);CHKERRQ(ierr); 2251 if (sctx.newshift) goto endofwhile; 2252 break; 2253 2254 default: 2255 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 2256 } 2257 row += nodesz; /* Update the row */ 2258 } 2259 endofwhile:; 2260 } while (sctx.newshift); 2261 ierr = PetscFree3(rtmp11,rtmp22,rtmp33);CHKERRQ(ierr); 2262 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 2263 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2264 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2265 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 2266 2267 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2268 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2269 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2270 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2271 C->assembled = PETSC_TRUE; 2272 C->preallocated = PETSC_TRUE; 2273 if (sctx.nshift) { 2274 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2275 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr); 2276 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2277 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr); 2278 } 2279 } 2280 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 2281 ierr = MatSeqAIJCheckInode(C);CHKERRQ(ierr); 2282 PetscFunctionReturn(0); 2283 } 2284 2285 2286 /* ----------------------------------------------------------- */ 2287 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 2288 { 2289 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2290 IS iscol = a->col,isrow = a->row; 2291 PetscErrorCode ierr; 2292 const PetscInt *r,*c,*rout,*cout; 2293 PetscInt i,j,n = A->rmap->n; 2294 PetscInt node_max,row,nsz,aii,i0,i1,nz; 2295 const PetscInt *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj; 2296 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 2297 PetscScalar sum1,sum2,sum3,sum4,sum5; 2298 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 2299 const PetscScalar *b; 2300 2301 PetscFunctionBegin; 2302 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 2303 node_max = a->inode.node_count; 2304 ns = a->inode.size; /* Node Size array */ 2305 2306 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2307 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2308 tmp = a->solve_work; 2309 2310 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2311 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2312 2313 /* forward solve the lower triangular */ 2314 tmps = tmp; 2315 aa = a_a; 2316 aj = a_j; 2317 ad = a->diag; 2318 2319 for (i = 0,row = 0; i< node_max; ++i) { 2320 nsz = ns[i]; 2321 aii = ai[row]; 2322 v1 = aa + aii; 2323 vi = aj + aii; 2324 nz = ai[row+1]- ai[row]; 2325 2326 if (i < node_max-1) { 2327 /* Prefetch the indices for the next block */ 2328 PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 2329 /* Prefetch the data for the next block */ 2330 PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 2331 } 2332 2333 switch (nsz) { /* Each loop in 'case' is unrolled */ 2334 case 1: 2335 sum1 = b[r[row]]; 2336 for (j=0; j<nz-1; j+=2) { 2337 i0 = vi[j]; 2338 i1 = vi[j+1]; 2339 tmp0 = tmps[i0]; 2340 tmp1 = tmps[i1]; 2341 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1; 2342 } 2343 if (j == nz-1) { 2344 tmp0 = tmps[vi[j]]; 2345 sum1 -= v1[j]*tmp0; 2346 } 2347 tmp[row++]=sum1; 2348 break; 2349 case 2: 2350 sum1 = b[r[row]]; 2351 sum2 = b[r[row+1]]; 2352 v2 = aa + ai[row+1]; 2353 2354 for (j=0; j<nz-1; j+=2) { 2355 i0 = vi[j]; 2356 i1 = vi[j+1]; 2357 tmp0 = tmps[i0]; 2358 tmp1 = tmps[i1]; 2359 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2360 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2361 } 2362 if (j == nz-1) { 2363 tmp0 = tmps[vi[j]]; 2364 sum1 -= v1[j] *tmp0; 2365 sum2 -= v2[j] *tmp0; 2366 } 2367 sum2 -= v2[nz] * sum1; 2368 tmp[row++]=sum1; 2369 tmp[row++]=sum2; 2370 break; 2371 case 3: 2372 sum1 = b[r[row]]; 2373 sum2 = b[r[row+1]]; 2374 sum3 = b[r[row+2]]; 2375 v2 = aa + ai[row+1]; 2376 v3 = aa + ai[row+2]; 2377 2378 for (j=0; j<nz-1; j+=2) { 2379 i0 = vi[j]; 2380 i1 = vi[j+1]; 2381 tmp0 = tmps[i0]; 2382 tmp1 = tmps[i1]; 2383 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2384 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2385 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2386 } 2387 if (j == nz-1) { 2388 tmp0 = tmps[vi[j]]; 2389 sum1 -= v1[j] *tmp0; 2390 sum2 -= v2[j] *tmp0; 2391 sum3 -= v3[j] *tmp0; 2392 } 2393 sum2 -= v2[nz] * sum1; 2394 sum3 -= v3[nz] * sum1; 2395 sum3 -= v3[nz+1] * sum2; 2396 tmp[row++]=sum1; 2397 tmp[row++]=sum2; 2398 tmp[row++]=sum3; 2399 break; 2400 2401 case 4: 2402 sum1 = b[r[row]]; 2403 sum2 = b[r[row+1]]; 2404 sum3 = b[r[row+2]]; 2405 sum4 = b[r[row+3]]; 2406 v2 = aa + ai[row+1]; 2407 v3 = aa + ai[row+2]; 2408 v4 = aa + ai[row+3]; 2409 2410 for (j=0; j<nz-1; j+=2) { 2411 i0 = vi[j]; 2412 i1 = vi[j+1]; 2413 tmp0 = tmps[i0]; 2414 tmp1 = tmps[i1]; 2415 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2416 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2417 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2418 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2419 } 2420 if (j == nz-1) { 2421 tmp0 = tmps[vi[j]]; 2422 sum1 -= v1[j] *tmp0; 2423 sum2 -= v2[j] *tmp0; 2424 sum3 -= v3[j] *tmp0; 2425 sum4 -= v4[j] *tmp0; 2426 } 2427 sum2 -= v2[nz] * sum1; 2428 sum3 -= v3[nz] * sum1; 2429 sum4 -= v4[nz] * sum1; 2430 sum3 -= v3[nz+1] * sum2; 2431 sum4 -= v4[nz+1] * sum2; 2432 sum4 -= v4[nz+2] * sum3; 2433 2434 tmp[row++]=sum1; 2435 tmp[row++]=sum2; 2436 tmp[row++]=sum3; 2437 tmp[row++]=sum4; 2438 break; 2439 case 5: 2440 sum1 = b[r[row]]; 2441 sum2 = b[r[row+1]]; 2442 sum3 = b[r[row+2]]; 2443 sum4 = b[r[row+3]]; 2444 sum5 = b[r[row+4]]; 2445 v2 = aa + ai[row+1]; 2446 v3 = aa + ai[row+2]; 2447 v4 = aa + ai[row+3]; 2448 v5 = aa + ai[row+4]; 2449 2450 for (j=0; j<nz-1; j+=2) { 2451 i0 = vi[j]; 2452 i1 = vi[j+1]; 2453 tmp0 = tmps[i0]; 2454 tmp1 = tmps[i1]; 2455 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2456 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2457 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2458 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2459 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1; 2460 } 2461 if (j == nz-1) { 2462 tmp0 = tmps[vi[j]]; 2463 sum1 -= v1[j] *tmp0; 2464 sum2 -= v2[j] *tmp0; 2465 sum3 -= v3[j] *tmp0; 2466 sum4 -= v4[j] *tmp0; 2467 sum5 -= v5[j] *tmp0; 2468 } 2469 2470 sum2 -= v2[nz] * sum1; 2471 sum3 -= v3[nz] * sum1; 2472 sum4 -= v4[nz] * sum1; 2473 sum5 -= v5[nz] * sum1; 2474 sum3 -= v3[nz+1] * sum2; 2475 sum4 -= v4[nz+1] * sum2; 2476 sum5 -= v5[nz+1] * sum2; 2477 sum4 -= v4[nz+2] * sum3; 2478 sum5 -= v5[nz+2] * sum3; 2479 sum5 -= v5[nz+3] * sum4; 2480 2481 tmp[row++]=sum1; 2482 tmp[row++]=sum2; 2483 tmp[row++]=sum3; 2484 tmp[row++]=sum4; 2485 tmp[row++]=sum5; 2486 break; 2487 default: 2488 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2489 } 2490 } 2491 /* backward solve the upper triangular */ 2492 for (i=node_max -1,row = n-1; i>=0; i--) { 2493 nsz = ns[i]; 2494 aii = ad[row+1] + 1; 2495 v1 = aa + aii; 2496 vi = aj + aii; 2497 nz = ad[row]- ad[row+1] - 1; 2498 2499 if (i > 0) { 2500 /* Prefetch the indices for the next block */ 2501 PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2502 /* Prefetch the data for the next block */ 2503 PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2504 } 2505 2506 switch (nsz) { /* Each loop in 'case' is unrolled */ 2507 case 1: 2508 sum1 = tmp[row]; 2509 2510 for (j=0; j<nz-1; j+=2) { 2511 i0 = vi[j]; 2512 i1 = vi[j+1]; 2513 tmp0 = tmps[i0]; 2514 tmp1 = tmps[i1]; 2515 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2516 } 2517 if (j == nz-1) { 2518 tmp0 = tmps[vi[j]]; 2519 sum1 -= v1[j]*tmp0; 2520 } 2521 x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2522 break; 2523 case 2: 2524 sum1 = tmp[row]; 2525 sum2 = tmp[row-1]; 2526 v2 = aa + ad[row] + 1; 2527 for (j=0; j<nz-1; j+=2) { 2528 i0 = vi[j]; 2529 i1 = vi[j+1]; 2530 tmp0 = tmps[i0]; 2531 tmp1 = tmps[i1]; 2532 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2533 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2534 } 2535 if (j == nz-1) { 2536 tmp0 = tmps[vi[j]]; 2537 sum1 -= v1[j]* tmp0; 2538 sum2 -= v2[j+1]* tmp0; 2539 } 2540 2541 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2542 sum2 -= v2[0] * tmp0; 2543 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2544 break; 2545 case 3: 2546 sum1 = tmp[row]; 2547 sum2 = tmp[row -1]; 2548 sum3 = tmp[row -2]; 2549 v2 = aa + ad[row] + 1; 2550 v3 = aa + ad[row -1] + 1; 2551 for (j=0; j<nz-1; j+=2) { 2552 i0 = vi[j]; 2553 i1 = vi[j+1]; 2554 tmp0 = tmps[i0]; 2555 tmp1 = tmps[i1]; 2556 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2557 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2558 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2559 } 2560 if (j== nz-1) { 2561 tmp0 = tmps[vi[j]]; 2562 sum1 -= v1[j] * tmp0; 2563 sum2 -= v2[j+1] * tmp0; 2564 sum3 -= v3[j+2] * tmp0; 2565 } 2566 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2567 sum2 -= v2[0]* tmp0; 2568 sum3 -= v3[1] * tmp0; 2569 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2570 sum3 -= v3[0]* tmp0; 2571 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2572 2573 break; 2574 case 4: 2575 sum1 = tmp[row]; 2576 sum2 = tmp[row -1]; 2577 sum3 = tmp[row -2]; 2578 sum4 = tmp[row -3]; 2579 v2 = aa + ad[row]+1; 2580 v3 = aa + ad[row -1]+1; 2581 v4 = aa + ad[row -2]+1; 2582 2583 for (j=0; j<nz-1; j+=2) { 2584 i0 = vi[j]; 2585 i1 = vi[j+1]; 2586 tmp0 = tmps[i0]; 2587 tmp1 = tmps[i1]; 2588 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2589 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2590 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2591 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2592 } 2593 if (j== nz-1) { 2594 tmp0 = tmps[vi[j]]; 2595 sum1 -= v1[j] * tmp0; 2596 sum2 -= v2[j+1] * tmp0; 2597 sum3 -= v3[j+2] * tmp0; 2598 sum4 -= v4[j+3] * tmp0; 2599 } 2600 2601 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2602 sum2 -= v2[0] * tmp0; 2603 sum3 -= v3[1] * tmp0; 2604 sum4 -= v4[2] * tmp0; 2605 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2606 sum3 -= v3[0] * tmp0; 2607 sum4 -= v4[1] * tmp0; 2608 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2609 sum4 -= v4[0] * tmp0; 2610 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2611 break; 2612 case 5: 2613 sum1 = tmp[row]; 2614 sum2 = tmp[row -1]; 2615 sum3 = tmp[row -2]; 2616 sum4 = tmp[row -3]; 2617 sum5 = tmp[row -4]; 2618 v2 = aa + ad[row]+1; 2619 v3 = aa + ad[row -1]+1; 2620 v4 = aa + ad[row -2]+1; 2621 v5 = aa + ad[row -3]+1; 2622 for (j=0; j<nz-1; j+=2) { 2623 i0 = vi[j]; 2624 i1 = vi[j+1]; 2625 tmp0 = tmps[i0]; 2626 tmp1 = tmps[i1]; 2627 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2628 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2629 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2630 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2631 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1; 2632 } 2633 if (j==nz-1) { 2634 tmp0 = tmps[vi[j]]; 2635 sum1 -= v1[j] * tmp0; 2636 sum2 -= v2[j+1] * tmp0; 2637 sum3 -= v3[j+2] * tmp0; 2638 sum4 -= v4[j+3] * tmp0; 2639 sum5 -= v5[j+4] * tmp0; 2640 } 2641 2642 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2643 sum2 -= v2[0] * tmp0; 2644 sum3 -= v3[1] * tmp0; 2645 sum4 -= v4[2] * tmp0; 2646 sum5 -= v5[3] * tmp0; 2647 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2648 sum3 -= v3[0] * tmp0; 2649 sum4 -= v4[1] * tmp0; 2650 sum5 -= v5[2] * tmp0; 2651 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2652 sum4 -= v4[0] * tmp0; 2653 sum5 -= v5[1] * tmp0; 2654 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2655 sum5 -= v5[0] * tmp0; 2656 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--; 2657 break; 2658 default: 2659 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2660 } 2661 } 2662 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2663 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2664 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2665 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2666 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 2667 PetscFunctionReturn(0); 2668 } 2669 2670 2671 /* 2672 Makes a longer coloring[] array and calls the usual code with that 2673 */ 2674 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 2675 { 2676 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 2677 PetscErrorCode ierr; 2678 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 2679 PetscInt *colorused,i; 2680 ISColoringValue *newcolor; 2681 2682 PetscFunctionBegin; 2683 ierr = PetscMalloc1(n+1,&newcolor);CHKERRQ(ierr); 2684 /* loop over inodes, marking a color for each column*/ 2685 row = 0; 2686 for (i=0; i<m; i++) { 2687 for (j=0; j<ns[i]; j++) { 2688 newcolor[row++] = coloring[i] + j*ncolors; 2689 } 2690 } 2691 2692 /* eliminate unneeded colors */ 2693 ierr = PetscCalloc1(5*ncolors,&colorused);CHKERRQ(ierr); 2694 for (i=0; i<n; i++) { 2695 colorused[newcolor[i]] = 1; 2696 } 2697 2698 for (i=1; i<5*ncolors; i++) { 2699 colorused[i] += colorused[i-1]; 2700 } 2701 ncolors = colorused[5*ncolors-1]; 2702 for (i=0; i<n; i++) { 2703 newcolor[i] = colorused[newcolor[i]]-1; 2704 } 2705 ierr = PetscFree(colorused);CHKERRQ(ierr); 2706 ierr = ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);CHKERRQ(ierr); 2707 ierr = PetscFree(coloring);CHKERRQ(ierr); 2708 PetscFunctionReturn(0); 2709 } 2710 2711 #include <petsc/private/kernels/blockinvert.h> 2712 2713 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 2714 { 2715 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2716 PetscScalar sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3; 2717 MatScalar *ibdiag,*bdiag,work[25],*t; 2718 PetscScalar *x,tmp4,tmp5,x1,x2,x3,x4,x5; 2719 const MatScalar *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL; 2720 const PetscScalar *xb, *b; 2721 PetscReal zeropivot = 100.*PETSC_MACHINE_EPSILON, shift = 0.0; 2722 PetscErrorCode ierr; 2723 PetscInt n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2; 2724 PetscInt sz,k,ipvt[5]; 2725 PetscBool allowzeropivot,zeropivotdetected; 2726 const PetscInt *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i; 2727 2728 PetscFunctionBegin; 2729 allowzeropivot = PetscNot(A->erroriffailure); 2730 if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 2731 if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 2732 2733 if (!a->inode.ibdiagvalid) { 2734 if (!a->inode.ibdiag) { 2735 /* calculate space needed for diagonal blocks */ 2736 for (i=0; i<m; i++) { 2737 cnt += sizes[i]*sizes[i]; 2738 } 2739 a->inode.bdiagsize = cnt; 2740 2741 ierr = PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);CHKERRQ(ierr); 2742 } 2743 2744 /* copy over the diagonal blocks and invert them */ 2745 ibdiag = a->inode.ibdiag; 2746 bdiag = a->inode.bdiag; 2747 cnt = 0; 2748 for (i=0, row = 0; i<m; i++) { 2749 for (j=0; j<sizes[i]; j++) { 2750 for (k=0; k<sizes[i]; k++) { 2751 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 2752 } 2753 } 2754 ierr = PetscArraycpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]);CHKERRQ(ierr); 2755 2756 switch (sizes[i]) { 2757 case 1: 2758 /* Create matrix data structure */ 2759 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) { 2760 if (allowzeropivot) { 2761 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2762 A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]); 2763 A->factorerror_zeropivot_row = row; 2764 ierr = PetscInfo1(A,"Zero pivot, row %D\n",row);CHKERRQ(ierr); 2765 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 2766 } 2767 ibdiag[cnt] = 1.0/ibdiag[cnt]; 2768 break; 2769 case 2: 2770 ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 2771 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2772 break; 2773 case 3: 2774 ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 2775 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2776 break; 2777 case 4: 2778 ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 2779 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2780 break; 2781 case 5: 2782 ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 2783 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2784 break; 2785 default: 2786 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2787 } 2788 cnt += sizes[i]*sizes[i]; 2789 row += sizes[i]; 2790 } 2791 a->inode.ibdiagvalid = PETSC_TRUE; 2792 } 2793 ibdiag = a->inode.ibdiag; 2794 bdiag = a->inode.bdiag; 2795 t = a->inode.ssor_work; 2796 2797 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2798 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2799 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2800 if (flag & SOR_ZERO_INITIAL_GUESS) { 2801 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 2802 2803 for (i=0, row=0; i<m; i++) { 2804 sz = diag[row] - ii[row]; 2805 v1 = a->a + ii[row]; 2806 idx = a->j + ii[row]; 2807 2808 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2809 switch (sizes[i]) { 2810 case 1: 2811 2812 sum1 = b[row]; 2813 for (n = 0; n<sz-1; n+=2) { 2814 i1 = idx[0]; 2815 i2 = idx[1]; 2816 idx += 2; 2817 tmp0 = x[i1]; 2818 tmp1 = x[i2]; 2819 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2820 } 2821 2822 if (n == sz-1) { 2823 tmp0 = x[*idx]; 2824 sum1 -= *v1 * tmp0; 2825 } 2826 t[row] = sum1; 2827 x[row++] = sum1*(*ibdiag++); 2828 break; 2829 case 2: 2830 v2 = a->a + ii[row+1]; 2831 sum1 = b[row]; 2832 sum2 = b[row+1]; 2833 for (n = 0; n<sz-1; n+=2) { 2834 i1 = idx[0]; 2835 i2 = idx[1]; 2836 idx += 2; 2837 tmp0 = x[i1]; 2838 tmp1 = x[i2]; 2839 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2840 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2841 } 2842 2843 if (n == sz-1) { 2844 tmp0 = x[*idx]; 2845 sum1 -= v1[0] * tmp0; 2846 sum2 -= v2[0] * tmp0; 2847 } 2848 t[row] = sum1; 2849 t[row+1] = sum2; 2850 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2851 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2852 ibdiag += 4; 2853 break; 2854 case 3: 2855 v2 = a->a + ii[row+1]; 2856 v3 = a->a + ii[row+2]; 2857 sum1 = b[row]; 2858 sum2 = b[row+1]; 2859 sum3 = b[row+2]; 2860 for (n = 0; n<sz-1; n+=2) { 2861 i1 = idx[0]; 2862 i2 = idx[1]; 2863 idx += 2; 2864 tmp0 = x[i1]; 2865 tmp1 = x[i2]; 2866 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2867 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2868 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2869 } 2870 2871 if (n == sz-1) { 2872 tmp0 = x[*idx]; 2873 sum1 -= v1[0] * tmp0; 2874 sum2 -= v2[0] * tmp0; 2875 sum3 -= v3[0] * tmp0; 2876 } 2877 t[row] = sum1; 2878 t[row+1] = sum2; 2879 t[row+2] = sum3; 2880 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2881 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2882 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2883 ibdiag += 9; 2884 break; 2885 case 4: 2886 v2 = a->a + ii[row+1]; 2887 v3 = a->a + ii[row+2]; 2888 v4 = a->a + ii[row+3]; 2889 sum1 = b[row]; 2890 sum2 = b[row+1]; 2891 sum3 = b[row+2]; 2892 sum4 = b[row+3]; 2893 for (n = 0; n<sz-1; n+=2) { 2894 i1 = idx[0]; 2895 i2 = idx[1]; 2896 idx += 2; 2897 tmp0 = x[i1]; 2898 tmp1 = x[i2]; 2899 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2900 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2901 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2902 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2903 } 2904 2905 if (n == sz-1) { 2906 tmp0 = x[*idx]; 2907 sum1 -= v1[0] * tmp0; 2908 sum2 -= v2[0] * tmp0; 2909 sum3 -= v3[0] * tmp0; 2910 sum4 -= v4[0] * tmp0; 2911 } 2912 t[row] = sum1; 2913 t[row+1] = sum2; 2914 t[row+2] = sum3; 2915 t[row+3] = sum4; 2916 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2917 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2918 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2919 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2920 ibdiag += 16; 2921 break; 2922 case 5: 2923 v2 = a->a + ii[row+1]; 2924 v3 = a->a + ii[row+2]; 2925 v4 = a->a + ii[row+3]; 2926 v5 = a->a + ii[row+4]; 2927 sum1 = b[row]; 2928 sum2 = b[row+1]; 2929 sum3 = b[row+2]; 2930 sum4 = b[row+3]; 2931 sum5 = b[row+4]; 2932 for (n = 0; n<sz-1; n+=2) { 2933 i1 = idx[0]; 2934 i2 = idx[1]; 2935 idx += 2; 2936 tmp0 = x[i1]; 2937 tmp1 = x[i2]; 2938 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2939 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2940 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2941 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2942 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2943 } 2944 2945 if (n == sz-1) { 2946 tmp0 = x[*idx]; 2947 sum1 -= v1[0] * tmp0; 2948 sum2 -= v2[0] * tmp0; 2949 sum3 -= v3[0] * tmp0; 2950 sum4 -= v4[0] * tmp0; 2951 sum5 -= v5[0] * tmp0; 2952 } 2953 t[row] = sum1; 2954 t[row+1] = sum2; 2955 t[row+2] = sum3; 2956 t[row+3] = sum4; 2957 t[row+4] = sum5; 2958 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2959 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2960 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2961 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2962 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2963 ibdiag += 25; 2964 break; 2965 default: 2966 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2967 } 2968 } 2969 2970 xb = t; 2971 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2972 } else xb = b; 2973 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 2974 2975 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 2976 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 2977 ibdiag -= sizes[i]*sizes[i]; 2978 sz = ii[row+1] - diag[row] - 1; 2979 v1 = a->a + diag[row] + 1; 2980 idx = a->j + diag[row] + 1; 2981 2982 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2983 switch (sizes[i]) { 2984 case 1: 2985 2986 sum1 = xb[row]; 2987 for (n = 0; n<sz-1; n+=2) { 2988 i1 = idx[0]; 2989 i2 = idx[1]; 2990 idx += 2; 2991 tmp0 = x[i1]; 2992 tmp1 = x[i2]; 2993 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2994 } 2995 2996 if (n == sz-1) { 2997 tmp0 = x[*idx]; 2998 sum1 -= *v1*tmp0; 2999 } 3000 x[row--] = sum1*(*ibdiag); 3001 break; 3002 3003 case 2: 3004 3005 sum1 = xb[row]; 3006 sum2 = xb[row-1]; 3007 /* note that sum1 is associated with the second of the two rows */ 3008 v2 = a->a + diag[row-1] + 2; 3009 for (n = 0; n<sz-1; n+=2) { 3010 i1 = idx[0]; 3011 i2 = idx[1]; 3012 idx += 2; 3013 tmp0 = x[i1]; 3014 tmp1 = x[i2]; 3015 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3016 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3017 } 3018 3019 if (n == sz-1) { 3020 tmp0 = x[*idx]; 3021 sum1 -= *v1*tmp0; 3022 sum2 -= *v2*tmp0; 3023 } 3024 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3025 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3026 break; 3027 case 3: 3028 3029 sum1 = xb[row]; 3030 sum2 = xb[row-1]; 3031 sum3 = xb[row-2]; 3032 v2 = a->a + diag[row-1] + 2; 3033 v3 = a->a + diag[row-2] + 3; 3034 for (n = 0; n<sz-1; n+=2) { 3035 i1 = idx[0]; 3036 i2 = idx[1]; 3037 idx += 2; 3038 tmp0 = x[i1]; 3039 tmp1 = x[i2]; 3040 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3041 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3042 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3043 } 3044 3045 if (n == sz-1) { 3046 tmp0 = x[*idx]; 3047 sum1 -= *v1*tmp0; 3048 sum2 -= *v2*tmp0; 3049 sum3 -= *v3*tmp0; 3050 } 3051 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3052 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3053 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3054 break; 3055 case 4: 3056 3057 sum1 = xb[row]; 3058 sum2 = xb[row-1]; 3059 sum3 = xb[row-2]; 3060 sum4 = xb[row-3]; 3061 v2 = a->a + diag[row-1] + 2; 3062 v3 = a->a + diag[row-2] + 3; 3063 v4 = a->a + diag[row-3] + 4; 3064 for (n = 0; n<sz-1; n+=2) { 3065 i1 = idx[0]; 3066 i2 = idx[1]; 3067 idx += 2; 3068 tmp0 = x[i1]; 3069 tmp1 = x[i2]; 3070 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3071 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3072 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3073 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3074 } 3075 3076 if (n == sz-1) { 3077 tmp0 = x[*idx]; 3078 sum1 -= *v1*tmp0; 3079 sum2 -= *v2*tmp0; 3080 sum3 -= *v3*tmp0; 3081 sum4 -= *v4*tmp0; 3082 } 3083 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3084 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3085 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3086 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3087 break; 3088 case 5: 3089 3090 sum1 = xb[row]; 3091 sum2 = xb[row-1]; 3092 sum3 = xb[row-2]; 3093 sum4 = xb[row-3]; 3094 sum5 = xb[row-4]; 3095 v2 = a->a + diag[row-1] + 2; 3096 v3 = a->a + diag[row-2] + 3; 3097 v4 = a->a + diag[row-3] + 4; 3098 v5 = a->a + diag[row-4] + 5; 3099 for (n = 0; n<sz-1; n+=2) { 3100 i1 = idx[0]; 3101 i2 = idx[1]; 3102 idx += 2; 3103 tmp0 = x[i1]; 3104 tmp1 = x[i2]; 3105 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3106 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3107 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3108 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3109 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3110 } 3111 3112 if (n == sz-1) { 3113 tmp0 = x[*idx]; 3114 sum1 -= *v1*tmp0; 3115 sum2 -= *v2*tmp0; 3116 sum3 -= *v3*tmp0; 3117 sum4 -= *v4*tmp0; 3118 sum5 -= *v5*tmp0; 3119 } 3120 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3121 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3122 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3123 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3124 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3125 break; 3126 default: 3127 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3128 } 3129 } 3130 3131 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3132 } 3133 its--; 3134 } 3135 while (its--) { 3136 3137 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 3138 for (i=0, row=0, ibdiag = a->inode.ibdiag; 3139 i<m; 3140 row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) { 3141 3142 sz = diag[row] - ii[row]; 3143 v1 = a->a + ii[row]; 3144 idx = a->j + ii[row]; 3145 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3146 switch (sizes[i]) { 3147 case 1: 3148 sum1 = b[row]; 3149 for (n = 0; n<sz-1; n+=2) { 3150 i1 = idx[0]; 3151 i2 = idx[1]; 3152 idx += 2; 3153 tmp0 = x[i1]; 3154 tmp1 = x[i2]; 3155 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3156 } 3157 if (n == sz-1) { 3158 tmp0 = x[*idx++]; 3159 sum1 -= *v1 * tmp0; 3160 v1++; 3161 } 3162 t[row] = sum1; 3163 sz = ii[row+1] - diag[row] - 1; 3164 idx = a->j + diag[row] + 1; 3165 v1 += 1; 3166 for (n = 0; n<sz-1; n+=2) { 3167 i1 = idx[0]; 3168 i2 = idx[1]; 3169 idx += 2; 3170 tmp0 = x[i1]; 3171 tmp1 = x[i2]; 3172 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3173 } 3174 if (n == sz-1) { 3175 tmp0 = x[*idx++]; 3176 sum1 -= *v1 * tmp0; 3177 } 3178 /* in MatSOR_SeqAIJ this line would be 3179 * 3180 * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++); 3181 * 3182 * but omega == 1, so this becomes 3183 * 3184 * x[row] = sum1*(*ibdiag++); 3185 * 3186 */ 3187 x[row] = sum1*(*ibdiag); 3188 break; 3189 case 2: 3190 v2 = a->a + ii[row+1]; 3191 sum1 = b[row]; 3192 sum2 = b[row+1]; 3193 for (n = 0; n<sz-1; n+=2) { 3194 i1 = idx[0]; 3195 i2 = idx[1]; 3196 idx += 2; 3197 tmp0 = x[i1]; 3198 tmp1 = x[i2]; 3199 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3200 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3201 } 3202 if (n == sz-1) { 3203 tmp0 = x[*idx++]; 3204 sum1 -= v1[0] * tmp0; 3205 sum2 -= v2[0] * tmp0; 3206 v1++; v2++; 3207 } 3208 t[row] = sum1; 3209 t[row+1] = sum2; 3210 sz = ii[row+1] - diag[row] - 2; 3211 idx = a->j + diag[row] + 2; 3212 v1 += 2; 3213 v2 += 2; 3214 for (n = 0; n<sz-1; n+=2) { 3215 i1 = idx[0]; 3216 i2 = idx[1]; 3217 idx += 2; 3218 tmp0 = x[i1]; 3219 tmp1 = x[i2]; 3220 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3221 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3222 } 3223 if (n == sz-1) { 3224 tmp0 = x[*idx]; 3225 sum1 -= v1[0] * tmp0; 3226 sum2 -= v2[0] * tmp0; 3227 } 3228 x[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3229 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3230 break; 3231 case 3: 3232 v2 = a->a + ii[row+1]; 3233 v3 = a->a + ii[row+2]; 3234 sum1 = b[row]; 3235 sum2 = b[row+1]; 3236 sum3 = b[row+2]; 3237 for (n = 0; n<sz-1; n+=2) { 3238 i1 = idx[0]; 3239 i2 = idx[1]; 3240 idx += 2; 3241 tmp0 = x[i1]; 3242 tmp1 = x[i2]; 3243 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3244 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3245 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3246 } 3247 if (n == sz-1) { 3248 tmp0 = x[*idx++]; 3249 sum1 -= v1[0] * tmp0; 3250 sum2 -= v2[0] * tmp0; 3251 sum3 -= v3[0] * tmp0; 3252 v1++; v2++; v3++; 3253 } 3254 t[row] = sum1; 3255 t[row+1] = sum2; 3256 t[row+2] = sum3; 3257 sz = ii[row+1] - diag[row] - 3; 3258 idx = a->j + diag[row] + 3; 3259 v1 += 3; 3260 v2 += 3; 3261 v3 += 3; 3262 for (n = 0; n<sz-1; n+=2) { 3263 i1 = idx[0]; 3264 i2 = idx[1]; 3265 idx += 2; 3266 tmp0 = x[i1]; 3267 tmp1 = x[i2]; 3268 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3269 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3270 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3271 } 3272 if (n == sz-1) { 3273 tmp0 = x[*idx]; 3274 sum1 -= v1[0] * tmp0; 3275 sum2 -= v2[0] * tmp0; 3276 sum3 -= v3[0] * tmp0; 3277 } 3278 x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3279 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3280 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3281 break; 3282 case 4: 3283 v2 = a->a + ii[row+1]; 3284 v3 = a->a + ii[row+2]; 3285 v4 = a->a + ii[row+3]; 3286 sum1 = b[row]; 3287 sum2 = b[row+1]; 3288 sum3 = b[row+2]; 3289 sum4 = b[row+3]; 3290 for (n = 0; n<sz-1; n+=2) { 3291 i1 = idx[0]; 3292 i2 = idx[1]; 3293 idx += 2; 3294 tmp0 = x[i1]; 3295 tmp1 = x[i2]; 3296 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3297 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3298 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3299 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3300 } 3301 if (n == sz-1) { 3302 tmp0 = x[*idx++]; 3303 sum1 -= v1[0] * tmp0; 3304 sum2 -= v2[0] * tmp0; 3305 sum3 -= v3[0] * tmp0; 3306 sum4 -= v4[0] * tmp0; 3307 v1++; v2++; v3++; v4++; 3308 } 3309 t[row] = sum1; 3310 t[row+1] = sum2; 3311 t[row+2] = sum3; 3312 t[row+3] = sum4; 3313 sz = ii[row+1] - diag[row] - 4; 3314 idx = a->j + diag[row] + 4; 3315 v1 += 4; 3316 v2 += 4; 3317 v3 += 4; 3318 v4 += 4; 3319 for (n = 0; n<sz-1; n+=2) { 3320 i1 = idx[0]; 3321 i2 = idx[1]; 3322 idx += 2; 3323 tmp0 = x[i1]; 3324 tmp1 = x[i2]; 3325 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3326 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3327 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3328 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3329 } 3330 if (n == sz-1) { 3331 tmp0 = x[*idx]; 3332 sum1 -= v1[0] * tmp0; 3333 sum2 -= v2[0] * tmp0; 3334 sum3 -= v3[0] * tmp0; 3335 sum4 -= v4[0] * tmp0; 3336 } 3337 x[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3338 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3339 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3340 x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3341 break; 3342 case 5: 3343 v2 = a->a + ii[row+1]; 3344 v3 = a->a + ii[row+2]; 3345 v4 = a->a + ii[row+3]; 3346 v5 = a->a + ii[row+4]; 3347 sum1 = b[row]; 3348 sum2 = b[row+1]; 3349 sum3 = b[row+2]; 3350 sum4 = b[row+3]; 3351 sum5 = b[row+4]; 3352 for (n = 0; n<sz-1; n+=2) { 3353 i1 = idx[0]; 3354 i2 = idx[1]; 3355 idx += 2; 3356 tmp0 = x[i1]; 3357 tmp1 = x[i2]; 3358 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3359 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3360 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3361 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3362 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3363 } 3364 if (n == sz-1) { 3365 tmp0 = x[*idx++]; 3366 sum1 -= v1[0] * tmp0; 3367 sum2 -= v2[0] * tmp0; 3368 sum3 -= v3[0] * tmp0; 3369 sum4 -= v4[0] * tmp0; 3370 sum5 -= v5[0] * tmp0; 3371 v1++; v2++; v3++; v4++; v5++; 3372 } 3373 t[row] = sum1; 3374 t[row+1] = sum2; 3375 t[row+2] = sum3; 3376 t[row+3] = sum4; 3377 t[row+4] = sum5; 3378 sz = ii[row+1] - diag[row] - 5; 3379 idx = a->j + diag[row] + 5; 3380 v1 += 5; 3381 v2 += 5; 3382 v3 += 5; 3383 v4 += 5; 3384 v5 += 5; 3385 for (n = 0; n<sz-1; n+=2) { 3386 i1 = idx[0]; 3387 i2 = idx[1]; 3388 idx += 2; 3389 tmp0 = x[i1]; 3390 tmp1 = x[i2]; 3391 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3392 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3393 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3394 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3395 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3396 } 3397 if (n == sz-1) { 3398 tmp0 = x[*idx]; 3399 sum1 -= v1[0] * tmp0; 3400 sum2 -= v2[0] * tmp0; 3401 sum3 -= v3[0] * tmp0; 3402 sum4 -= v4[0] * tmp0; 3403 sum5 -= v5[0] * tmp0; 3404 } 3405 x[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3406 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3407 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3408 x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3409 x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3410 break; 3411 default: 3412 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3413 } 3414 } 3415 xb = t; 3416 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); /* undercounts diag inverse */ 3417 } else xb = b; 3418 3419 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3420 3421 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3422 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3423 ibdiag -= sizes[i]*sizes[i]; 3424 3425 /* set RHS */ 3426 if (xb == b) { 3427 /* whole (old way) */ 3428 sz = ii[row+1] - ii[row]; 3429 idx = a->j + ii[row]; 3430 switch (sizes[i]) { 3431 case 5: 3432 v5 = a->a + ii[row-4]; 3433 case 4: /* fall through */ 3434 v4 = a->a + ii[row-3]; 3435 case 3: 3436 v3 = a->a + ii[row-2]; 3437 case 2: 3438 v2 = a->a + ii[row-1]; 3439 case 1: 3440 v1 = a->a + ii[row]; 3441 break; 3442 default: 3443 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3444 } 3445 } else { 3446 /* upper, no diag */ 3447 sz = ii[row+1] - diag[row] - 1; 3448 idx = a->j + diag[row] + 1; 3449 switch (sizes[i]) { 3450 case 5: 3451 v5 = a->a + diag[row-4] + 5; 3452 case 4: /* fall through */ 3453 v4 = a->a + diag[row-3] + 4; 3454 case 3: 3455 v3 = a->a + diag[row-2] + 3; 3456 case 2: 3457 v2 = a->a + diag[row-1] + 2; 3458 case 1: 3459 v1 = a->a + diag[row] + 1; 3460 } 3461 } 3462 /* set sum */ 3463 switch (sizes[i]) { 3464 case 5: 3465 sum5 = xb[row-4]; 3466 case 4: /* fall through */ 3467 sum4 = xb[row-3]; 3468 case 3: 3469 sum3 = xb[row-2]; 3470 case 2: 3471 sum2 = xb[row-1]; 3472 case 1: 3473 /* note that sum1 is associated with the last row */ 3474 sum1 = xb[row]; 3475 } 3476 /* do sums */ 3477 for (n = 0; n<sz-1; n+=2) { 3478 i1 = idx[0]; 3479 i2 = idx[1]; 3480 idx += 2; 3481 tmp0 = x[i1]; 3482 tmp1 = x[i2]; 3483 switch (sizes[i]) { 3484 case 5: 3485 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3486 case 4: /* fall through */ 3487 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3488 case 3: 3489 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3490 case 2: 3491 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3492 case 1: 3493 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3494 } 3495 } 3496 /* ragged edge */ 3497 if (n == sz-1) { 3498 tmp0 = x[*idx]; 3499 switch (sizes[i]) { 3500 case 5: 3501 sum5 -= *v5*tmp0; 3502 case 4: /* fall through */ 3503 sum4 -= *v4*tmp0; 3504 case 3: 3505 sum3 -= *v3*tmp0; 3506 case 2: 3507 sum2 -= *v2*tmp0; 3508 case 1: 3509 sum1 -= *v1*tmp0; 3510 } 3511 } 3512 /* update */ 3513 if (xb == b) { 3514 /* whole (old way) w/ diag */ 3515 switch (sizes[i]) { 3516 case 5: 3517 x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3518 x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3519 x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3520 x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3521 x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3522 break; 3523 case 4: 3524 x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3525 x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3526 x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3527 x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3528 break; 3529 case 3: 3530 x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3531 x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3532 x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3533 break; 3534 case 2: 3535 x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3]; 3536 x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2]; 3537 break; 3538 case 1: 3539 x[row--] += sum1*(*ibdiag); 3540 break; 3541 } 3542 } else { 3543 /* no diag so set = */ 3544 switch (sizes[i]) { 3545 case 5: 3546 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3547 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3548 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3549 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3550 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3551 break; 3552 case 4: 3553 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3554 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3555 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3556 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3557 break; 3558 case 3: 3559 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3560 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3561 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3562 break; 3563 case 2: 3564 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3565 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3566 break; 3567 case 1: 3568 x[row--] = sum1*(*ibdiag); 3569 break; 3570 } 3571 } 3572 } 3573 if (xb == b) { 3574 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 3575 } else { 3576 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper, undercounts diag inverse */ 3577 } 3578 } 3579 } 3580 if (flag & SOR_EISENSTAT) { 3581 /* 3582 Apply (U + D)^-1 where D is now the block diagonal 3583 */ 3584 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3585 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3586 ibdiag -= sizes[i]*sizes[i]; 3587 sz = ii[row+1] - diag[row] - 1; 3588 v1 = a->a + diag[row] + 1; 3589 idx = a->j + diag[row] + 1; 3590 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3591 switch (sizes[i]) { 3592 case 1: 3593 3594 sum1 = b[row]; 3595 for (n = 0; n<sz-1; n+=2) { 3596 i1 = idx[0]; 3597 i2 = idx[1]; 3598 idx += 2; 3599 tmp0 = x[i1]; 3600 tmp1 = x[i2]; 3601 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3602 } 3603 3604 if (n == sz-1) { 3605 tmp0 = x[*idx]; 3606 sum1 -= *v1*tmp0; 3607 } 3608 x[row] = sum1*(*ibdiag);row--; 3609 break; 3610 3611 case 2: 3612 3613 sum1 = b[row]; 3614 sum2 = b[row-1]; 3615 /* note that sum1 is associated with the second of the two rows */ 3616 v2 = a->a + diag[row-1] + 2; 3617 for (n = 0; n<sz-1; n+=2) { 3618 i1 = idx[0]; 3619 i2 = idx[1]; 3620 idx += 2; 3621 tmp0 = x[i1]; 3622 tmp1 = x[i2]; 3623 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3624 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3625 } 3626 3627 if (n == sz-1) { 3628 tmp0 = x[*idx]; 3629 sum1 -= *v1*tmp0; 3630 sum2 -= *v2*tmp0; 3631 } 3632 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3633 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3634 row -= 2; 3635 break; 3636 case 3: 3637 3638 sum1 = b[row]; 3639 sum2 = b[row-1]; 3640 sum3 = b[row-2]; 3641 v2 = a->a + diag[row-1] + 2; 3642 v3 = a->a + diag[row-2] + 3; 3643 for (n = 0; n<sz-1; n+=2) { 3644 i1 = idx[0]; 3645 i2 = idx[1]; 3646 idx += 2; 3647 tmp0 = x[i1]; 3648 tmp1 = x[i2]; 3649 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3650 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3651 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3652 } 3653 3654 if (n == sz-1) { 3655 tmp0 = x[*idx]; 3656 sum1 -= *v1*tmp0; 3657 sum2 -= *v2*tmp0; 3658 sum3 -= *v3*tmp0; 3659 } 3660 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3661 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3662 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3663 row -= 3; 3664 break; 3665 case 4: 3666 3667 sum1 = b[row]; 3668 sum2 = b[row-1]; 3669 sum3 = b[row-2]; 3670 sum4 = b[row-3]; 3671 v2 = a->a + diag[row-1] + 2; 3672 v3 = a->a + diag[row-2] + 3; 3673 v4 = a->a + diag[row-3] + 4; 3674 for (n = 0; n<sz-1; n+=2) { 3675 i1 = idx[0]; 3676 i2 = idx[1]; 3677 idx += 2; 3678 tmp0 = x[i1]; 3679 tmp1 = x[i2]; 3680 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3681 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3682 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3683 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3684 } 3685 3686 if (n == sz-1) { 3687 tmp0 = x[*idx]; 3688 sum1 -= *v1*tmp0; 3689 sum2 -= *v2*tmp0; 3690 sum3 -= *v3*tmp0; 3691 sum4 -= *v4*tmp0; 3692 } 3693 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3694 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3695 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3696 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3697 row -= 4; 3698 break; 3699 case 5: 3700 3701 sum1 = b[row]; 3702 sum2 = b[row-1]; 3703 sum3 = b[row-2]; 3704 sum4 = b[row-3]; 3705 sum5 = b[row-4]; 3706 v2 = a->a + diag[row-1] + 2; 3707 v3 = a->a + diag[row-2] + 3; 3708 v4 = a->a + diag[row-3] + 4; 3709 v5 = a->a + diag[row-4] + 5; 3710 for (n = 0; n<sz-1; n+=2) { 3711 i1 = idx[0]; 3712 i2 = idx[1]; 3713 idx += 2; 3714 tmp0 = x[i1]; 3715 tmp1 = x[i2]; 3716 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3717 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3718 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3719 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3720 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3721 } 3722 3723 if (n == sz-1) { 3724 tmp0 = x[*idx]; 3725 sum1 -= *v1*tmp0; 3726 sum2 -= *v2*tmp0; 3727 sum3 -= *v3*tmp0; 3728 sum4 -= *v4*tmp0; 3729 sum5 -= *v5*tmp0; 3730 } 3731 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3732 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3733 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3734 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3735 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3736 row -= 5; 3737 break; 3738 default: 3739 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3740 } 3741 } 3742 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3743 3744 /* 3745 t = b - D x where D is the block diagonal 3746 */ 3747 cnt = 0; 3748 for (i=0, row=0; i<m; i++) { 3749 switch (sizes[i]) { 3750 case 1: 3751 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 3752 break; 3753 case 2: 3754 x1 = x[row]; x2 = x[row+1]; 3755 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3756 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3757 t[row] = b[row] - tmp1; 3758 t[row+1] = b[row+1] - tmp2; row += 2; 3759 cnt += 4; 3760 break; 3761 case 3: 3762 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3763 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3764 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3765 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3766 t[row] = b[row] - tmp1; 3767 t[row+1] = b[row+1] - tmp2; 3768 t[row+2] = b[row+2] - tmp3; row += 3; 3769 cnt += 9; 3770 break; 3771 case 4: 3772 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3773 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3774 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3775 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3776 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3777 t[row] = b[row] - tmp1; 3778 t[row+1] = b[row+1] - tmp2; 3779 t[row+2] = b[row+2] - tmp3; 3780 t[row+3] = b[row+3] - tmp4; row += 4; 3781 cnt += 16; 3782 break; 3783 case 5: 3784 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3785 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3786 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3787 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3788 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3789 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3790 t[row] = b[row] - tmp1; 3791 t[row+1] = b[row+1] - tmp2; 3792 t[row+2] = b[row+2] - tmp3; 3793 t[row+3] = b[row+3] - tmp4; 3794 t[row+4] = b[row+4] - tmp5;row += 5; 3795 cnt += 25; 3796 break; 3797 default: 3798 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3799 } 3800 } 3801 ierr = PetscLogFlops(m);CHKERRQ(ierr); 3802 3803 3804 3805 /* 3806 Apply (L + D)^-1 where D is the block diagonal 3807 */ 3808 for (i=0, row=0; i<m; i++) { 3809 sz = diag[row] - ii[row]; 3810 v1 = a->a + ii[row]; 3811 idx = a->j + ii[row]; 3812 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3813 switch (sizes[i]) { 3814 case 1: 3815 3816 sum1 = t[row]; 3817 for (n = 0; n<sz-1; n+=2) { 3818 i1 = idx[0]; 3819 i2 = idx[1]; 3820 idx += 2; 3821 tmp0 = t[i1]; 3822 tmp1 = t[i2]; 3823 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3824 } 3825 3826 if (n == sz-1) { 3827 tmp0 = t[*idx]; 3828 sum1 -= *v1 * tmp0; 3829 } 3830 x[row] += t[row] = sum1*(*ibdiag++); row++; 3831 break; 3832 case 2: 3833 v2 = a->a + ii[row+1]; 3834 sum1 = t[row]; 3835 sum2 = t[row+1]; 3836 for (n = 0; n<sz-1; n+=2) { 3837 i1 = idx[0]; 3838 i2 = idx[1]; 3839 idx += 2; 3840 tmp0 = t[i1]; 3841 tmp1 = t[i2]; 3842 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3843 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3844 } 3845 3846 if (n == sz-1) { 3847 tmp0 = t[*idx]; 3848 sum1 -= v1[0] * tmp0; 3849 sum2 -= v2[0] * tmp0; 3850 } 3851 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3852 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3853 ibdiag += 4; row += 2; 3854 break; 3855 case 3: 3856 v2 = a->a + ii[row+1]; 3857 v3 = a->a + ii[row+2]; 3858 sum1 = t[row]; 3859 sum2 = t[row+1]; 3860 sum3 = t[row+2]; 3861 for (n = 0; n<sz-1; n+=2) { 3862 i1 = idx[0]; 3863 i2 = idx[1]; 3864 idx += 2; 3865 tmp0 = t[i1]; 3866 tmp1 = t[i2]; 3867 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3868 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3869 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3870 } 3871 3872 if (n == sz-1) { 3873 tmp0 = t[*idx]; 3874 sum1 -= v1[0] * tmp0; 3875 sum2 -= v2[0] * tmp0; 3876 sum3 -= v3[0] * tmp0; 3877 } 3878 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3879 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3880 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3881 ibdiag += 9; row += 3; 3882 break; 3883 case 4: 3884 v2 = a->a + ii[row+1]; 3885 v3 = a->a + ii[row+2]; 3886 v4 = a->a + ii[row+3]; 3887 sum1 = t[row]; 3888 sum2 = t[row+1]; 3889 sum3 = t[row+2]; 3890 sum4 = t[row+3]; 3891 for (n = 0; n<sz-1; n+=2) { 3892 i1 = idx[0]; 3893 i2 = idx[1]; 3894 idx += 2; 3895 tmp0 = t[i1]; 3896 tmp1 = t[i2]; 3897 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3898 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3899 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3900 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3901 } 3902 3903 if (n == sz-1) { 3904 tmp0 = t[*idx]; 3905 sum1 -= v1[0] * tmp0; 3906 sum2 -= v2[0] * tmp0; 3907 sum3 -= v3[0] * tmp0; 3908 sum4 -= v4[0] * tmp0; 3909 } 3910 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3911 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3912 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3913 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3914 ibdiag += 16; row += 4; 3915 break; 3916 case 5: 3917 v2 = a->a + ii[row+1]; 3918 v3 = a->a + ii[row+2]; 3919 v4 = a->a + ii[row+3]; 3920 v5 = a->a + ii[row+4]; 3921 sum1 = t[row]; 3922 sum2 = t[row+1]; 3923 sum3 = t[row+2]; 3924 sum4 = t[row+3]; 3925 sum5 = t[row+4]; 3926 for (n = 0; n<sz-1; n+=2) { 3927 i1 = idx[0]; 3928 i2 = idx[1]; 3929 idx += 2; 3930 tmp0 = t[i1]; 3931 tmp1 = t[i2]; 3932 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3933 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3934 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3935 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3936 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3937 } 3938 3939 if (n == sz-1) { 3940 tmp0 = t[*idx]; 3941 sum1 -= v1[0] * tmp0; 3942 sum2 -= v2[0] * tmp0; 3943 sum3 -= v3[0] * tmp0; 3944 sum4 -= v4[0] * tmp0; 3945 sum5 -= v5[0] * tmp0; 3946 } 3947 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3948 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3949 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3950 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3951 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3952 ibdiag += 25; row += 5; 3953 break; 3954 default: 3955 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3956 } 3957 } 3958 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3959 } 3960 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3961 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3962 PetscFunctionReturn(0); 3963 } 3964 3965 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 3966 { 3967 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3968 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 3969 const MatScalar *bdiag = a->inode.bdiag; 3970 const PetscScalar *b; 3971 PetscErrorCode ierr; 3972 PetscInt m = a->inode.node_count,cnt = 0,i,row; 3973 const PetscInt *sizes = a->inode.size; 3974 3975 PetscFunctionBegin; 3976 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3977 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3978 cnt = 0; 3979 for (i=0, row=0; i<m; i++) { 3980 switch (sizes[i]) { 3981 case 1: 3982 x[row] = b[row]*bdiag[cnt++];row++; 3983 break; 3984 case 2: 3985 x1 = b[row]; x2 = b[row+1]; 3986 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3987 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3988 x[row++] = tmp1; 3989 x[row++] = tmp2; 3990 cnt += 4; 3991 break; 3992 case 3: 3993 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 3994 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3995 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3996 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3997 x[row++] = tmp1; 3998 x[row++] = tmp2; 3999 x[row++] = tmp3; 4000 cnt += 9; 4001 break; 4002 case 4: 4003 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 4004 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 4005 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 4006 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 4007 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 4008 x[row++] = tmp1; 4009 x[row++] = tmp2; 4010 x[row++] = tmp3; 4011 x[row++] = tmp4; 4012 cnt += 16; 4013 break; 4014 case 5: 4015 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 4016 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 4017 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 4018 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 4019 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 4020 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 4021 x[row++] = tmp1; 4022 x[row++] = tmp2; 4023 x[row++] = tmp3; 4024 x[row++] = tmp4; 4025 x[row++] = tmp5; 4026 cnt += 25; 4027 break; 4028 default: 4029 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 4030 } 4031 } 4032 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 4033 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4034 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4035 PetscFunctionReturn(0); 4036 } 4037 4038 /* 4039 samestructure indicates that the matrix has not changed its nonzero structure so we 4040 do not need to recompute the inodes 4041 */ 4042 PetscErrorCode MatSeqAIJCheckInode(Mat A) 4043 { 4044 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4045 PetscErrorCode ierr; 4046 PetscInt i,j,m,nzx,nzy,*ns,node_count,blk_size; 4047 PetscBool flag; 4048 const PetscInt *idx,*idy,*ii; 4049 4050 PetscFunctionBegin; 4051 if (!a->inode.use) PetscFunctionReturn(0); 4052 if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(0); 4053 4054 m = A->rmap->n; 4055 if (a->inode.size) ns = a->inode.size; 4056 else { 4057 ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr); 4058 } 4059 4060 i = 0; 4061 node_count = 0; 4062 idx = a->j; 4063 ii = a->i; 4064 while (i < m) { /* For each row */ 4065 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 4066 /* Limits the number of elements in a node to 'a->inode.limit' */ 4067 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 4068 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 4069 if (nzy != nzx) break; 4070 idy += nzx; /* Same nonzero pattern */ 4071 ierr = PetscArraycmp(idx,idy,nzx,&flag);CHKERRQ(ierr); 4072 if (!flag) break; 4073 } 4074 ns[node_count++] = blk_size; 4075 idx += blk_size*nzx; 4076 i = j; 4077 } 4078 /* If not enough inodes found,, do not use inode version of the routines */ 4079 if (!m || node_count > .8*m) { 4080 ierr = PetscFree(ns);CHKERRQ(ierr); 4081 4082 a->inode.node_count = 0; 4083 a->inode.size = NULL; 4084 a->inode.use = PETSC_FALSE; 4085 A->ops->mult = MatMult_SeqAIJ; 4086 A->ops->sor = MatSOR_SeqAIJ; 4087 A->ops->multadd = MatMultAdd_SeqAIJ; 4088 A->ops->getrowij = MatGetRowIJ_SeqAIJ; 4089 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 4090 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 4091 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 4092 A->ops->coloringpatch = 0; 4093 A->ops->multdiagonalblock = 0; 4094 4095 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4096 } else { 4097 if (!A->factortype) { 4098 A->ops->mult = MatMult_SeqAIJ_Inode; 4099 A->ops->sor = MatSOR_SeqAIJ_Inode; 4100 A->ops->multadd = MatMultAdd_SeqAIJ_Inode; 4101 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4102 if (A->rmap->n == A->cmap->n) { 4103 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4104 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4105 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4106 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4107 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4108 } 4109 } else { 4110 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4111 } 4112 a->inode.node_count = node_count; 4113 a->inode.size = ns; 4114 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4115 } 4116 a->inode.checked = PETSC_TRUE; 4117 a->inode.mat_nonzerostate = A->nonzerostate; 4118 PetscFunctionReturn(0); 4119 } 4120 4121 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C) 4122 { 4123 Mat B =*C; 4124 Mat_SeqAIJ *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data; 4125 PetscErrorCode ierr; 4126 PetscInt m=A->rmap->n; 4127 4128 PetscFunctionBegin; 4129 c->inode.use = a->inode.use; 4130 c->inode.limit = a->inode.limit; 4131 c->inode.max_limit = a->inode.max_limit; 4132 if (a->inode.size) { 4133 ierr = PetscMalloc1(m+1,&c->inode.size);CHKERRQ(ierr); 4134 c->inode.node_count = a->inode.node_count; 4135 ierr = PetscArraycpy(c->inode.size,a->inode.size,m+1);CHKERRQ(ierr); 4136 /* note the table of functions below should match that in MatSeqAIJCheckInode() */ 4137 if (!B->factortype) { 4138 B->ops->mult = MatMult_SeqAIJ_Inode; 4139 B->ops->sor = MatSOR_SeqAIJ_Inode; 4140 B->ops->multadd = MatMultAdd_SeqAIJ_Inode; 4141 B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4142 B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4143 B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4144 B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4145 B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4146 B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4147 } else { 4148 B->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4149 } 4150 } else { 4151 c->inode.size = 0; 4152 c->inode.node_count = 0; 4153 } 4154 c->inode.ibdiagvalid = PETSC_FALSE; 4155 c->inode.ibdiag = 0; 4156 c->inode.bdiag = 0; 4157 PetscFunctionReturn(0); 4158 } 4159 4160 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row) 4161 { 4162 PetscInt k; 4163 const PetscInt *vi; 4164 4165 PetscFunctionBegin; 4166 vi = aj + ai[row]; 4167 for (k=0; k<nzl; k++) cols[k] = vi[k]; 4168 vi = aj + adiag[row]; 4169 cols[nzl] = vi[0]; 4170 vi = aj + adiag[row+1]+1; 4171 for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k]; 4172 PetscFunctionReturn(0); 4173 } 4174 /* 4175 MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix. 4176 Modified from MatSeqAIJCheckInode(). 4177 4178 Input Parameters: 4179 . Mat A - ILU or LU matrix factor 4180 4181 */ 4182 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A) 4183 { 4184 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4185 PetscErrorCode ierr; 4186 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; 4187 PetscInt *cols1,*cols2,*ns; 4188 const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag; 4189 PetscBool flag; 4190 4191 PetscFunctionBegin; 4192 if (!a->inode.use) PetscFunctionReturn(0); 4193 if (a->inode.checked) PetscFunctionReturn(0); 4194 4195 m = A->rmap->n; 4196 if (a->inode.size) ns = a->inode.size; 4197 else { 4198 ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr); 4199 } 4200 4201 i = 0; 4202 node_count = 0; 4203 ierr = PetscMalloc2(m,&cols1,m,&cols2);CHKERRQ(ierr); 4204 while (i < m) { /* For each row */ 4205 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ 4206 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ 4207 nzx = nzl1 + nzu1 + 1; 4208 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); 4209 4210 /* Limits the number of elements in a node to 'a->inode.limit' */ 4211 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 4212 nzl2 = ai[j+1] - ai[j]; 4213 nzu2 = adiag[j] - adiag[j+1] - 1; 4214 nzy = nzl2 + nzu2 + 1; 4215 if (nzy != nzx) break; 4216 ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr); 4217 ierr = PetscArraycmp(cols1,cols2,nzx,&flag);CHKERRQ(ierr); 4218 if (!flag) break; 4219 } 4220 ns[node_count++] = blk_size; 4221 i = j; 4222 } 4223 ierr = PetscFree2(cols1,cols2);CHKERRQ(ierr); 4224 /* If not enough inodes found,, do not use inode version of the routines */ 4225 if (!m || node_count > .8*m) { 4226 ierr = PetscFree(ns);CHKERRQ(ierr); 4227 4228 a->inode.node_count = 0; 4229 a->inode.size = NULL; 4230 a->inode.use = PETSC_FALSE; 4231 4232 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4233 } else { 4234 A->ops->mult = 0; 4235 A->ops->sor = 0; 4236 A->ops->multadd = 0; 4237 A->ops->getrowij = 0; 4238 A->ops->restorerowij = 0; 4239 A->ops->getcolumnij = 0; 4240 A->ops->restorecolumnij = 0; 4241 A->ops->coloringpatch = 0; 4242 A->ops->multdiagonalblock = 0; 4243 a->inode.node_count = node_count; 4244 a->inode.size = ns; 4245 4246 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4247 } 4248 a->inode.checked = PETSC_TRUE; 4249 PetscFunctionReturn(0); 4250 } 4251 4252 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A) 4253 { 4254 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4255 4256 PetscFunctionBegin; 4257 a->inode.ibdiagvalid = PETSC_FALSE; 4258 PetscFunctionReturn(0); 4259 } 4260 4261 /* 4262 This is really ugly. if inodes are used this replaces the 4263 permutations with ones that correspond to rows/cols of the matrix 4264 rather then inode blocks 4265 */ 4266 PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 4267 { 4268 PetscErrorCode ierr; 4269 4270 PetscFunctionBegin; 4271 ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr); 4272 PetscFunctionReturn(0); 4273 } 4274 4275 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 4276 { 4277 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4278 PetscErrorCode ierr; 4279 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 4280 const PetscInt *ridx,*cidx; 4281 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 4282 PetscInt nslim_col,*ns_col; 4283 IS ris = *rperm,cis = *cperm; 4284 4285 PetscFunctionBegin; 4286 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 4287 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 4288 4289 ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr); 4290 ierr = PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);CHKERRQ(ierr); 4291 ierr = PetscMalloc2(m,&permr,n,&permc);CHKERRQ(ierr); 4292 4293 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 4294 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 4295 4296 /* Form the inode structure for the rows of permuted matric using inv perm*/ 4297 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 4298 4299 /* Construct the permutations for rows*/ 4300 for (i=0,row = 0; i<nslim_row; ++i) { 4301 indx = ridx[i]; 4302 start_val = tns[indx]; 4303 end_val = tns[indx + 1]; 4304 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 4305 } 4306 4307 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 4308 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 4309 4310 /* Construct permutations for columns */ 4311 for (i=0,col=0; i<nslim_col; ++i) { 4312 indx = cidx[i]; 4313 start_val = tns[indx]; 4314 end_val = tns[indx + 1]; 4315 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 4316 } 4317 4318 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr); 4319 ierr = ISSetPermutation(*rperm);CHKERRQ(ierr); 4320 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr); 4321 ierr = ISSetPermutation(*cperm);CHKERRQ(ierr); 4322 4323 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 4324 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 4325 4326 ierr = PetscFree(ns_col);CHKERRQ(ierr); 4327 ierr = PetscFree2(permr,permc);CHKERRQ(ierr); 4328 ierr = ISDestroy(&cis);CHKERRQ(ierr); 4329 ierr = ISDestroy(&ris);CHKERRQ(ierr); 4330 ierr = PetscFree(tns);CHKERRQ(ierr); 4331 PetscFunctionReturn(0); 4332 } 4333 4334 /*@C 4335 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 4336 4337 Not Collective 4338 4339 Input Parameter: 4340 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 4341 4342 Output Parameter: 4343 + node_count - no of inodes present in the matrix. 4344 . sizes - an array of size node_count,with sizes of each inode. 4345 - limit - the max size used to generate the inodes. 4346 4347 Level: advanced 4348 4349 Notes: 4350 This routine returns some internal storage information 4351 of the matrix, it is intended to be used by advanced users. 4352 It should be called after the matrix is assembled. 4353 The contents of the sizes[] array should not be changed. 4354 NULL may be passed for information not requested. 4355 4356 .seealso: MatGetInfo() 4357 @*/ 4358 PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4359 { 4360 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 4361 4362 PetscFunctionBegin; 4363 if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 4364 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);CHKERRQ(ierr); 4365 if (f) { 4366 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 4367 } 4368 PetscFunctionReturn(0); 4369 } 4370 4371 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4372 { 4373 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4374 4375 PetscFunctionBegin; 4376 if (node_count) *node_count = a->inode.node_count; 4377 if (sizes) *sizes = a->inode.size; 4378 if (limit) *limit = a->inode.limit; 4379 PetscFunctionReturn(0); 4380 } 4381