1 2 /* 3 This file provides high performance routines for the Inode format (compressed sparse row) 4 by taking advantage of rows with identical nonzero structure (I-nodes). 5 */ 6 #include <../src/mat/impls/aij/seq/aij.h> 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "Mat_CreateColInode" 10 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt *size,PetscInt **ns) 11 { 12 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 13 PetscErrorCode ierr; 14 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 15 16 PetscFunctionBegin; 17 n = A->cmap->n; 18 m = A->rmap->n; 19 ns_row = a->inode.size; 20 21 min_mn = (m < n) ? m : n; 22 if (!ns) { 23 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ; 24 for (; count+1 < n; count++,i++) ; 25 if (count < n) { 26 i++; 27 } 28 *size = i; 29 PetscFunctionReturn(0); 30 } 31 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr); 32 33 /* Use the same row structure wherever feasible. */ 34 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 35 ns_col[i] = ns_row[i]; 36 } 37 38 /* if m < n; pad up the remainder with inode_limit */ 39 for (; count+1 < n; count++,i++) { 40 ns_col[i] = 1; 41 } 42 /* The last node is the odd ball. padd it up with the remaining rows; */ 43 if (count < n) { 44 ns_col[i] = n - count; 45 i++; 46 } else if (count > n) { 47 /* Adjust for the over estimation */ 48 ns_col[i-1] += n - count; 49 } 50 *size = i; 51 *ns = ns_col; 52 PetscFunctionReturn(0); 53 } 54 55 56 /* 57 This builds symmetric version of nonzero structure, 58 */ 59 #undef __FUNCT__ 60 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric" 61 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 62 { 63 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 64 PetscErrorCode ierr; 65 PetscInt *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n; 66 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2; 67 const PetscInt *j,*jmax,*ai= a->i,*aj = a->j; 68 69 PetscFunctionBegin; 70 nslim_row = a->inode.node_count; 71 m = A->rmap->n; 72 n = A->cmap->n; 73 if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 74 75 /* Use the row_inode as column_inode */ 76 nslim_col = nslim_row; 77 ns_col = ns_row; 78 79 /* allocate space for reformated inode structure */ 80 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 81 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 82 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 83 84 for (i1=0,col=0; i1<nslim_col; ++i1) { 85 nsz = ns_col[i1]; 86 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 87 } 88 /* allocate space for row pointers */ 89 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 90 *iia = ia; 91 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 92 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 93 94 /* determine the number of columns in each row */ 95 ia[0] = oshift; 96 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 97 98 j = aj + ai[row] + ishift; 99 jmax = aj + ai[row+1] + ishift; 100 col = *j++ + ishift; 101 i2 = tvc[col]; 102 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 103 ia[i1+1]++; 104 ia[i2+1]++; 105 i2++; /* Start col of next node */ 106 while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j; 107 i2 = tvc[col]; 108 } 109 if (i2 == i1) ia[i2+1]++; /* now the diagonal element */ 110 } 111 112 /* shift ia[i] to point to next row */ 113 for (i1=1; i1<nslim_row+1; i1++) { 114 row = ia[i1-1]; 115 ia[i1] += row; 116 work[i1-1] = row - oshift; 117 } 118 119 /* allocate space for column pointers */ 120 nz = ia[nslim_row] + (!ishift); 121 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 122 *jja = ja; 123 124 /* loop over lower triangular part putting into ja */ 125 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 126 j = aj + ai[row] + ishift; 127 jmax = aj + ai[row+1] + ishift; 128 col = *j++ + ishift; 129 i2 = tvc[col]; 130 while (i2<i1 && j<jmax) { 131 ja[work[i2]++] = i1 + oshift; 132 ja[work[i1]++] = i2 + oshift; 133 ++i2; 134 while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */ 135 i2 = tvc[col]; 136 } 137 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 138 139 } 140 ierr = PetscFree(work);CHKERRQ(ierr); 141 ierr = PetscFree(tns);CHKERRQ(ierr); 142 ierr = PetscFree(tvc);CHKERRQ(ierr); 143 PetscFunctionReturn(0); 144 } 145 146 /* 147 This builds nonsymmetric version of nonzero structure, 148 */ 149 #undef __FUNCT__ 150 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric" 151 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 152 { 153 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 154 PetscErrorCode ierr; 155 PetscInt *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col; 156 PetscInt *tns,*tvc,nsz,i1,i2; 157 const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size; 158 159 PetscFunctionBegin; 160 nslim_row = a->inode.node_count; 161 n = A->cmap->n; 162 163 /* Create The column_inode for this matrix */ 164 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 165 166 /* allocate space for reformated column_inode structure */ 167 ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 168 ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 169 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 170 171 for (i1=0,col=0; i1<nslim_col; ++i1) { 172 nsz = ns_col[i1]; 173 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 174 } 175 /* allocate space for row pointers */ 176 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 177 *iia = ia; 178 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 179 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 180 181 /* determine the number of columns in each row */ 182 ia[0] = oshift; 183 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 184 j = aj + ai[row] + ishift; 185 col = *j++ + ishift; 186 i2 = tvc[col]; 187 nz = ai[row+1] - ai[row]; 188 while (nz-- > 0) { /* off-diagonal elemets */ 189 ia[i1+1]++; 190 i2++; /* Start col of next node */ 191 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 192 if (nz > 0) i2 = tvc[col]; 193 } 194 } 195 196 /* shift ia[i] to point to next row */ 197 for (i1=1; i1<nslim_row+1; i1++) { 198 row = ia[i1-1]; 199 ia[i1] += row; 200 work[i1-1] = row - oshift; 201 } 202 203 /* allocate space for column pointers */ 204 nz = ia[nslim_row] + (!ishift); 205 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 206 *jja = ja; 207 208 /* loop over matrix putting into ja */ 209 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 210 j = aj + ai[row] + ishift; 211 col = *j++ + ishift; 212 i2 = tvc[col]; 213 nz = ai[row+1] - ai[row]; 214 while (nz-- > 0) { 215 ja[work[i1]++] = i2 + oshift; 216 ++i2; 217 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 218 if (nz > 0) i2 = tvc[col]; 219 } 220 } 221 ierr = PetscFree(ns_col);CHKERRQ(ierr); 222 ierr = PetscFree(work);CHKERRQ(ierr); 223 ierr = PetscFree(tns);CHKERRQ(ierr); 224 ierr = PetscFree(tvc);CHKERRQ(ierr); 225 PetscFunctionReturn(0); 226 } 227 228 #undef __FUNCT__ 229 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode" 230 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 231 { 232 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 233 PetscErrorCode ierr; 234 235 PetscFunctionBegin; 236 *n = a->inode.node_count; 237 if (!ia) PetscFunctionReturn(0); 238 if (!blockcompressed) { 239 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 240 } else if (symmetric) { 241 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 242 } else { 243 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 244 } 245 PetscFunctionReturn(0); 246 } 247 248 #undef __FUNCT__ 249 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode" 250 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 251 { 252 PetscErrorCode ierr; 253 254 PetscFunctionBegin; 255 if (!ia) PetscFunctionReturn(0); 256 257 if (!blockcompressed) { 258 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 259 } else { 260 ierr = PetscFree(*ia);CHKERRQ(ierr); 261 ierr = PetscFree(*ja);CHKERRQ(ierr); 262 } 263 PetscFunctionReturn(0); 264 } 265 266 /* ----------------------------------------------------------- */ 267 268 #undef __FUNCT__ 269 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric" 270 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 271 { 272 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 273 PetscErrorCode ierr; 274 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 275 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 276 277 PetscFunctionBegin; 278 nslim_row = a->inode.node_count; 279 n = A->cmap->n; 280 281 /* Create The column_inode for this matrix */ 282 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 283 284 /* allocate space for reformated column_inode structure */ 285 ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 286 ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 287 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 288 289 for (i1=0,col=0; i1<nslim_col; ++i1) { 290 nsz = ns_col[i1]; 291 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 292 } 293 /* allocate space for column pointers */ 294 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 295 *iia = ia; 296 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 297 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 298 299 /* determine the number of columns in each row */ 300 ia[0] = oshift; 301 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 302 j = aj + ai[row] + ishift; 303 col = *j++ + ishift; 304 i2 = tvc[col]; 305 nz = ai[row+1] - ai[row]; 306 while (nz-- > 0) { /* off-diagonal elemets */ 307 /* ia[i1+1]++; */ 308 ia[i2+1]++; 309 i2++; 310 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 311 if (nz > 0) i2 = tvc[col]; 312 } 313 } 314 315 /* shift ia[i] to point to next col */ 316 for (i1=1; i1<nslim_col+1; i1++) { 317 col = ia[i1-1]; 318 ia[i1] += col; 319 work[i1-1] = col - oshift; 320 } 321 322 /* allocate space for column pointers */ 323 nz = ia[nslim_col] + (!ishift); 324 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 325 *jja = ja; 326 327 /* loop over matrix putting into ja */ 328 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 329 j = aj + ai[row] + ishift; 330 col = *j++ + ishift; 331 i2 = tvc[col]; 332 nz = ai[row+1] - ai[row]; 333 while (nz-- > 0) { 334 /* ja[work[i1]++] = i2 + oshift; */ 335 ja[work[i2]++] = i1 + oshift; 336 i2++; 337 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 338 if (nz > 0) i2 = tvc[col]; 339 } 340 } 341 ierr = PetscFree(ns_col);CHKERRQ(ierr); 342 ierr = PetscFree(work);CHKERRQ(ierr); 343 ierr = PetscFree(tns);CHKERRQ(ierr); 344 ierr = PetscFree(tvc);CHKERRQ(ierr); 345 PetscFunctionReturn(0); 346 } 347 348 #undef __FUNCT__ 349 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode" 350 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 351 { 352 PetscErrorCode ierr; 353 354 PetscFunctionBegin; 355 ierr = Mat_CreateColInode(A,n,NULL);CHKERRQ(ierr); 356 if (!ia) PetscFunctionReturn(0); 357 358 if (!blockcompressed) { 359 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 360 } else if (symmetric) { 361 /* Since the indices are symmetric it does'nt matter */ 362 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 363 } else { 364 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 365 } 366 PetscFunctionReturn(0); 367 } 368 369 #undef __FUNCT__ 370 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode" 371 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 372 { 373 PetscErrorCode ierr; 374 375 PetscFunctionBegin; 376 if (!ia) PetscFunctionReturn(0); 377 if (!blockcompressed) { 378 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 379 } else { 380 ierr = PetscFree(*ia);CHKERRQ(ierr); 381 ierr = PetscFree(*ja);CHKERRQ(ierr); 382 } 383 PetscFunctionReturn(0); 384 } 385 386 /* ----------------------------------------------------------- */ 387 388 #undef __FUNCT__ 389 #define __FUNCT__ "MatMult_SeqAIJ_Inode" 390 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 391 { 392 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 393 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 394 PetscScalar *y; 395 const PetscScalar *x; 396 const MatScalar *v1,*v2,*v3,*v4,*v5; 397 PetscErrorCode ierr; 398 PetscInt i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0; 399 const PetscInt *idx,*ns,*ii; 400 401 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 402 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 403 #endif 404 405 PetscFunctionBegin; 406 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 407 node_max = a->inode.node_count; 408 ns = a->inode.size; /* Node Size array */ 409 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 410 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 411 idx = a->j; 412 v1 = a->a; 413 ii = a->i; 414 415 for (i = 0,row = 0; i< node_max; ++i) { 416 nsz = ns[i]; 417 n = ii[1] - ii[0]; 418 nonzerorow += (n>0)*nsz; 419 ii += nsz; 420 PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ 421 PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ 422 sz = n; /* No of non zeros in this row */ 423 /* Switch on the size of Node */ 424 switch (nsz) { /* Each loop in 'case' is unrolled */ 425 case 1: 426 sum1 = 0.; 427 428 for (n = 0; n< sz-1; n+=2) { 429 i1 = idx[0]; /* The instructions are ordered to */ 430 i2 = idx[1]; /* make the compiler's job easy */ 431 idx += 2; 432 tmp0 = x[i1]; 433 tmp1 = x[i2]; 434 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 435 } 436 437 if (n == sz-1) { /* Take care of the last nonzero */ 438 tmp0 = x[*idx++]; 439 sum1 += *v1++ *tmp0; 440 } 441 y[row++]=sum1; 442 break; 443 case 2: 444 sum1 = 0.; 445 sum2 = 0.; 446 v2 = v1 + n; 447 448 for (n = 0; n< sz-1; n+=2) { 449 i1 = idx[0]; 450 i2 = idx[1]; 451 idx += 2; 452 tmp0 = x[i1]; 453 tmp1 = x[i2]; 454 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 455 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 456 } 457 if (n == sz-1) { 458 tmp0 = x[*idx++]; 459 sum1 += *v1++ * tmp0; 460 sum2 += *v2++ * tmp0; 461 } 462 y[row++]=sum1; 463 y[row++]=sum2; 464 v1 =v2; /* Since the next block to be processed starts there*/ 465 idx +=sz; 466 break; 467 case 3: 468 sum1 = 0.; 469 sum2 = 0.; 470 sum3 = 0.; 471 v2 = v1 + n; 472 v3 = v2 + n; 473 474 for (n = 0; n< sz-1; n+=2) { 475 i1 = idx[0]; 476 i2 = idx[1]; 477 idx += 2; 478 tmp0 = x[i1]; 479 tmp1 = x[i2]; 480 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 481 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 482 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 483 } 484 if (n == sz-1) { 485 tmp0 = x[*idx++]; 486 sum1 += *v1++ * tmp0; 487 sum2 += *v2++ * tmp0; 488 sum3 += *v3++ * tmp0; 489 } 490 y[row++]=sum1; 491 y[row++]=sum2; 492 y[row++]=sum3; 493 v1 =v3; /* Since the next block to be processed starts there*/ 494 idx +=2*sz; 495 break; 496 case 4: 497 sum1 = 0.; 498 sum2 = 0.; 499 sum3 = 0.; 500 sum4 = 0.; 501 v2 = v1 + n; 502 v3 = v2 + n; 503 v4 = v3 + n; 504 505 for (n = 0; n< sz-1; n+=2) { 506 i1 = idx[0]; 507 i2 = idx[1]; 508 idx += 2; 509 tmp0 = x[i1]; 510 tmp1 = x[i2]; 511 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 512 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 513 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 514 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 515 } 516 if (n == sz-1) { 517 tmp0 = x[*idx++]; 518 sum1 += *v1++ * tmp0; 519 sum2 += *v2++ * tmp0; 520 sum3 += *v3++ * tmp0; 521 sum4 += *v4++ * tmp0; 522 } 523 y[row++]=sum1; 524 y[row++]=sum2; 525 y[row++]=sum3; 526 y[row++]=sum4; 527 v1 =v4; /* Since the next block to be processed starts there*/ 528 idx +=3*sz; 529 break; 530 case 5: 531 sum1 = 0.; 532 sum2 = 0.; 533 sum3 = 0.; 534 sum4 = 0.; 535 sum5 = 0.; 536 v2 = v1 + n; 537 v3 = v2 + n; 538 v4 = v3 + n; 539 v5 = v4 + n; 540 541 for (n = 0; n<sz-1; n+=2) { 542 i1 = idx[0]; 543 i2 = idx[1]; 544 idx += 2; 545 tmp0 = x[i1]; 546 tmp1 = x[i2]; 547 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 548 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 549 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 550 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 551 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 552 } 553 if (n == sz-1) { 554 tmp0 = x[*idx++]; 555 sum1 += *v1++ * tmp0; 556 sum2 += *v2++ * tmp0; 557 sum3 += *v3++ * tmp0; 558 sum4 += *v4++ * tmp0; 559 sum5 += *v5++ * tmp0; 560 } 561 y[row++]=sum1; 562 y[row++]=sum2; 563 y[row++]=sum3; 564 y[row++]=sum4; 565 y[row++]=sum5; 566 v1 =v5; /* Since the next block to be processed starts there */ 567 idx +=4*sz; 568 break; 569 default: 570 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 571 } 572 } 573 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 574 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 575 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 576 PetscFunctionReturn(0); 577 } 578 /* ----------------------------------------------------------- */ 579 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 580 #undef __FUNCT__ 581 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode" 582 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 583 { 584 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 585 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 586 const MatScalar *v1,*v2,*v3,*v4,*v5; 587 const PetscScalar *x; 588 PetscScalar *y,*z,*zt; 589 PetscErrorCode ierr; 590 PetscInt i1,i2,n,i,row,node_max,nsz,sz; 591 const PetscInt *idx,*ns,*ii; 592 593 PetscFunctionBegin; 594 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 595 node_max = a->inode.node_count; 596 ns = a->inode.size; /* Node Size array */ 597 598 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 599 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 600 if (zz != yy) { 601 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 602 } else { 603 z = y; 604 } 605 zt = z; 606 607 idx = a->j; 608 v1 = a->a; 609 ii = a->i; 610 611 for (i = 0,row = 0; i< node_max; ++i) { 612 nsz = ns[i]; 613 n = ii[1] - ii[0]; 614 ii += nsz; 615 sz = n; /* No of non zeros in this row */ 616 /* Switch on the size of Node */ 617 switch (nsz) { /* Each loop in 'case' is unrolled */ 618 case 1: 619 sum1 = *zt++; 620 621 for (n = 0; n< sz-1; n+=2) { 622 i1 = idx[0]; /* The instructions are ordered to */ 623 i2 = idx[1]; /* make the compiler's job easy */ 624 idx += 2; 625 tmp0 = x[i1]; 626 tmp1 = x[i2]; 627 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 628 } 629 630 if (n == sz-1) { /* Take care of the last nonzero */ 631 tmp0 = x[*idx++]; 632 sum1 += *v1++ * tmp0; 633 } 634 y[row++]=sum1; 635 break; 636 case 2: 637 sum1 = *zt++; 638 sum2 = *zt++; 639 v2 = v1 + n; 640 641 for (n = 0; n< sz-1; n+=2) { 642 i1 = idx[0]; 643 i2 = idx[1]; 644 idx += 2; 645 tmp0 = x[i1]; 646 tmp1 = x[i2]; 647 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 648 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 649 } 650 if (n == sz-1) { 651 tmp0 = x[*idx++]; 652 sum1 += *v1++ * tmp0; 653 sum2 += *v2++ * tmp0; 654 } 655 y[row++]=sum1; 656 y[row++]=sum2; 657 v1 =v2; /* Since the next block to be processed starts there*/ 658 idx +=sz; 659 break; 660 case 3: 661 sum1 = *zt++; 662 sum2 = *zt++; 663 sum3 = *zt++; 664 v2 = v1 + n; 665 v3 = v2 + n; 666 667 for (n = 0; n< sz-1; n+=2) { 668 i1 = idx[0]; 669 i2 = idx[1]; 670 idx += 2; 671 tmp0 = x[i1]; 672 tmp1 = x[i2]; 673 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 674 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 675 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 676 } 677 if (n == sz-1) { 678 tmp0 = x[*idx++]; 679 sum1 += *v1++ * tmp0; 680 sum2 += *v2++ * tmp0; 681 sum3 += *v3++ * tmp0; 682 } 683 y[row++]=sum1; 684 y[row++]=sum2; 685 y[row++]=sum3; 686 v1 =v3; /* Since the next block to be processed starts there*/ 687 idx +=2*sz; 688 break; 689 case 4: 690 sum1 = *zt++; 691 sum2 = *zt++; 692 sum3 = *zt++; 693 sum4 = *zt++; 694 v2 = v1 + n; 695 v3 = v2 + n; 696 v4 = v3 + n; 697 698 for (n = 0; n< sz-1; n+=2) { 699 i1 = idx[0]; 700 i2 = idx[1]; 701 idx += 2; 702 tmp0 = x[i1]; 703 tmp1 = x[i2]; 704 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 705 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 706 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 707 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 708 } 709 if (n == sz-1) { 710 tmp0 = x[*idx++]; 711 sum1 += *v1++ * tmp0; 712 sum2 += *v2++ * tmp0; 713 sum3 += *v3++ * tmp0; 714 sum4 += *v4++ * tmp0; 715 } 716 y[row++]=sum1; 717 y[row++]=sum2; 718 y[row++]=sum3; 719 y[row++]=sum4; 720 v1 =v4; /* Since the next block to be processed starts there*/ 721 idx +=3*sz; 722 break; 723 case 5: 724 sum1 = *zt++; 725 sum2 = *zt++; 726 sum3 = *zt++; 727 sum4 = *zt++; 728 sum5 = *zt++; 729 v2 = v1 + n; 730 v3 = v2 + n; 731 v4 = v3 + n; 732 v5 = v4 + n; 733 734 for (n = 0; n<sz-1; n+=2) { 735 i1 = idx[0]; 736 i2 = idx[1]; 737 idx += 2; 738 tmp0 = x[i1]; 739 tmp1 = x[i2]; 740 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 741 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 742 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 743 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 744 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 745 } 746 if (n == sz-1) { 747 tmp0 = x[*idx++]; 748 sum1 += *v1++ * tmp0; 749 sum2 += *v2++ * tmp0; 750 sum3 += *v3++ * tmp0; 751 sum4 += *v4++ * tmp0; 752 sum5 += *v5++ * tmp0; 753 } 754 y[row++]=sum1; 755 y[row++]=sum2; 756 y[row++]=sum3; 757 y[row++]=sum4; 758 y[row++]=sum5; 759 v1 =v5; /* Since the next block to be processed starts there */ 760 idx +=4*sz; 761 break; 762 default: 763 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 764 } 765 } 766 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 767 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 768 if (zz != yy) { 769 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 770 } 771 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 772 PetscFunctionReturn(0); 773 } 774 775 /* ----------------------------------------------------------- */ 776 #undef __FUNCT__ 777 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace" 778 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx) 779 { 780 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 781 IS iscol = a->col,isrow = a->row; 782 PetscErrorCode ierr; 783 const PetscInt *r,*c,*rout,*cout; 784 PetscInt i,j,n = A->rmap->n,nz; 785 PetscInt node_max,*ns,row,nsz,aii,i0,i1; 786 const PetscInt *ai = a->i,*a_j = a->j,*vi,*ad,*aj; 787 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 788 PetscScalar sum1,sum2,sum3,sum4,sum5; 789 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 790 const PetscScalar *b; 791 792 PetscFunctionBegin; 793 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 794 node_max = a->inode.node_count; 795 ns = a->inode.size; /* Node Size array */ 796 797 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 798 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 799 tmp = a->solve_work; 800 801 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 802 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 803 804 /* forward solve the lower triangular */ 805 tmps = tmp; 806 aa = a_a; 807 aj = a_j; 808 ad = a->diag; 809 810 for (i = 0,row = 0; i< node_max; ++i) { 811 nsz = ns[i]; 812 aii = ai[row]; 813 v1 = aa + aii; 814 vi = aj + aii; 815 nz = ad[row]- aii; 816 if (i < node_max-1) { 817 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 818 * but our indexing to determine it's size could. */ 819 PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 820 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 821 PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 822 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 823 } 824 825 switch (nsz) { /* Each loop in 'case' is unrolled */ 826 case 1: 827 sum1 = b[*r++]; 828 for (j=0; j<nz-1; j+=2) { 829 i0 = vi[0]; 830 i1 = vi[1]; 831 vi +=2; 832 tmp0 = tmps[i0]; 833 tmp1 = tmps[i1]; 834 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 835 } 836 if (j == nz-1) { 837 tmp0 = tmps[*vi++]; 838 sum1 -= *v1++ *tmp0; 839 } 840 tmp[row++]=sum1; 841 break; 842 case 2: 843 sum1 = b[*r++]; 844 sum2 = b[*r++]; 845 v2 = aa + ai[row+1]; 846 847 for (j=0; j<nz-1; j+=2) { 848 i0 = vi[0]; 849 i1 = vi[1]; 850 vi +=2; 851 tmp0 = tmps[i0]; 852 tmp1 = tmps[i1]; 853 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 854 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 855 } 856 if (j == nz-1) { 857 tmp0 = tmps[*vi++]; 858 sum1 -= *v1++ *tmp0; 859 sum2 -= *v2++ *tmp0; 860 } 861 sum2 -= *v2++ *sum1; 862 tmp[row++]=sum1; 863 tmp[row++]=sum2; 864 break; 865 case 3: 866 sum1 = b[*r++]; 867 sum2 = b[*r++]; 868 sum3 = b[*r++]; 869 v2 = aa + ai[row+1]; 870 v3 = aa + ai[row+2]; 871 872 for (j=0; j<nz-1; j+=2) { 873 i0 = vi[0]; 874 i1 = vi[1]; 875 vi +=2; 876 tmp0 = tmps[i0]; 877 tmp1 = tmps[i1]; 878 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 879 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 880 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 881 } 882 if (j == nz-1) { 883 tmp0 = tmps[*vi++]; 884 sum1 -= *v1++ *tmp0; 885 sum2 -= *v2++ *tmp0; 886 sum3 -= *v3++ *tmp0; 887 } 888 sum2 -= *v2++ * sum1; 889 sum3 -= *v3++ * sum1; 890 sum3 -= *v3++ * sum2; 891 892 tmp[row++]=sum1; 893 tmp[row++]=sum2; 894 tmp[row++]=sum3; 895 break; 896 897 case 4: 898 sum1 = b[*r++]; 899 sum2 = b[*r++]; 900 sum3 = b[*r++]; 901 sum4 = b[*r++]; 902 v2 = aa + ai[row+1]; 903 v3 = aa + ai[row+2]; 904 v4 = aa + ai[row+3]; 905 906 for (j=0; j<nz-1; j+=2) { 907 i0 = vi[0]; 908 i1 = vi[1]; 909 vi +=2; 910 tmp0 = tmps[i0]; 911 tmp1 = tmps[i1]; 912 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 913 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 914 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 915 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 916 } 917 if (j == nz-1) { 918 tmp0 = tmps[*vi++]; 919 sum1 -= *v1++ *tmp0; 920 sum2 -= *v2++ *tmp0; 921 sum3 -= *v3++ *tmp0; 922 sum4 -= *v4++ *tmp0; 923 } 924 sum2 -= *v2++ * sum1; 925 sum3 -= *v3++ * sum1; 926 sum4 -= *v4++ * sum1; 927 sum3 -= *v3++ * sum2; 928 sum4 -= *v4++ * sum2; 929 sum4 -= *v4++ * sum3; 930 931 tmp[row++]=sum1; 932 tmp[row++]=sum2; 933 tmp[row++]=sum3; 934 tmp[row++]=sum4; 935 break; 936 case 5: 937 sum1 = b[*r++]; 938 sum2 = b[*r++]; 939 sum3 = b[*r++]; 940 sum4 = b[*r++]; 941 sum5 = b[*r++]; 942 v2 = aa + ai[row+1]; 943 v3 = aa + ai[row+2]; 944 v4 = aa + ai[row+3]; 945 v5 = aa + ai[row+4]; 946 947 for (j=0; j<nz-1; j+=2) { 948 i0 = vi[0]; 949 i1 = vi[1]; 950 vi +=2; 951 tmp0 = tmps[i0]; 952 tmp1 = tmps[i1]; 953 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 954 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 955 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 956 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 957 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 958 } 959 if (j == nz-1) { 960 tmp0 = tmps[*vi++]; 961 sum1 -= *v1++ *tmp0; 962 sum2 -= *v2++ *tmp0; 963 sum3 -= *v3++ *tmp0; 964 sum4 -= *v4++ *tmp0; 965 sum5 -= *v5++ *tmp0; 966 } 967 968 sum2 -= *v2++ * sum1; 969 sum3 -= *v3++ * sum1; 970 sum4 -= *v4++ * sum1; 971 sum5 -= *v5++ * sum1; 972 sum3 -= *v3++ * sum2; 973 sum4 -= *v4++ * sum2; 974 sum5 -= *v5++ * sum2; 975 sum4 -= *v4++ * sum3; 976 sum5 -= *v5++ * sum3; 977 sum5 -= *v5++ * sum4; 978 979 tmp[row++]=sum1; 980 tmp[row++]=sum2; 981 tmp[row++]=sum3; 982 tmp[row++]=sum4; 983 tmp[row++]=sum5; 984 break; 985 default: 986 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 987 } 988 } 989 /* backward solve the upper triangular */ 990 for (i=node_max -1,row = n-1; i>=0; i--) { 991 nsz = ns[i]; 992 aii = ai[row+1] -1; 993 v1 = aa + aii; 994 vi = aj + aii; 995 nz = aii- ad[row]; 996 switch (nsz) { /* Each loop in 'case' is unrolled */ 997 case 1: 998 sum1 = tmp[row]; 999 1000 for (j=nz; j>1; j-=2) { 1001 vi -=2; 1002 i0 = vi[2]; 1003 i1 = vi[1]; 1004 tmp0 = tmps[i0]; 1005 tmp1 = tmps[i1]; 1006 v1 -= 2; 1007 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1008 } 1009 if (j==1) { 1010 tmp0 = tmps[*vi--]; 1011 sum1 -= *v1-- * tmp0; 1012 } 1013 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1014 break; 1015 case 2: 1016 sum1 = tmp[row]; 1017 sum2 = tmp[row -1]; 1018 v2 = aa + ai[row]-1; 1019 for (j=nz; j>1; j-=2) { 1020 vi -=2; 1021 i0 = vi[2]; 1022 i1 = vi[1]; 1023 tmp0 = tmps[i0]; 1024 tmp1 = tmps[i1]; 1025 v1 -= 2; 1026 v2 -= 2; 1027 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1028 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1029 } 1030 if (j==1) { 1031 tmp0 = tmps[*vi--]; 1032 sum1 -= *v1-- * tmp0; 1033 sum2 -= *v2-- * tmp0; 1034 } 1035 1036 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1037 sum2 -= *v2-- * tmp0; 1038 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1039 break; 1040 case 3: 1041 sum1 = tmp[row]; 1042 sum2 = tmp[row -1]; 1043 sum3 = tmp[row -2]; 1044 v2 = aa + ai[row]-1; 1045 v3 = aa + ai[row -1]-1; 1046 for (j=nz; j>1; j-=2) { 1047 vi -=2; 1048 i0 = vi[2]; 1049 i1 = vi[1]; 1050 tmp0 = tmps[i0]; 1051 tmp1 = tmps[i1]; 1052 v1 -= 2; 1053 v2 -= 2; 1054 v3 -= 2; 1055 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1056 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1057 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1058 } 1059 if (j==1) { 1060 tmp0 = tmps[*vi--]; 1061 sum1 -= *v1-- * tmp0; 1062 sum2 -= *v2-- * tmp0; 1063 sum3 -= *v3-- * tmp0; 1064 } 1065 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1066 sum2 -= *v2-- * tmp0; 1067 sum3 -= *v3-- * tmp0; 1068 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1069 sum3 -= *v3-- * tmp0; 1070 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1071 1072 break; 1073 case 4: 1074 sum1 = tmp[row]; 1075 sum2 = tmp[row -1]; 1076 sum3 = tmp[row -2]; 1077 sum4 = tmp[row -3]; 1078 v2 = aa + ai[row]-1; 1079 v3 = aa + ai[row -1]-1; 1080 v4 = aa + ai[row -2]-1; 1081 1082 for (j=nz; j>1; j-=2) { 1083 vi -=2; 1084 i0 = vi[2]; 1085 i1 = vi[1]; 1086 tmp0 = tmps[i0]; 1087 tmp1 = tmps[i1]; 1088 v1 -= 2; 1089 v2 -= 2; 1090 v3 -= 2; 1091 v4 -= 2; 1092 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1093 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1094 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1095 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1096 } 1097 if (j==1) { 1098 tmp0 = tmps[*vi--]; 1099 sum1 -= *v1-- * tmp0; 1100 sum2 -= *v2-- * tmp0; 1101 sum3 -= *v3-- * tmp0; 1102 sum4 -= *v4-- * tmp0; 1103 } 1104 1105 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1106 sum2 -= *v2-- * tmp0; 1107 sum3 -= *v3-- * tmp0; 1108 sum4 -= *v4-- * tmp0; 1109 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1110 sum3 -= *v3-- * tmp0; 1111 sum4 -= *v4-- * tmp0; 1112 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1113 sum4 -= *v4-- * tmp0; 1114 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1115 break; 1116 case 5: 1117 sum1 = tmp[row]; 1118 sum2 = tmp[row -1]; 1119 sum3 = tmp[row -2]; 1120 sum4 = tmp[row -3]; 1121 sum5 = tmp[row -4]; 1122 v2 = aa + ai[row]-1; 1123 v3 = aa + ai[row -1]-1; 1124 v4 = aa + ai[row -2]-1; 1125 v5 = aa + ai[row -3]-1; 1126 for (j=nz; j>1; j-=2) { 1127 vi -= 2; 1128 i0 = vi[2]; 1129 i1 = vi[1]; 1130 tmp0 = tmps[i0]; 1131 tmp1 = tmps[i1]; 1132 v1 -= 2; 1133 v2 -= 2; 1134 v3 -= 2; 1135 v4 -= 2; 1136 v5 -= 2; 1137 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1138 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1139 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1140 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1141 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1142 } 1143 if (j==1) { 1144 tmp0 = tmps[*vi--]; 1145 sum1 -= *v1-- * tmp0; 1146 sum2 -= *v2-- * tmp0; 1147 sum3 -= *v3-- * tmp0; 1148 sum4 -= *v4-- * tmp0; 1149 sum5 -= *v5-- * tmp0; 1150 } 1151 1152 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1153 sum2 -= *v2-- * tmp0; 1154 sum3 -= *v3-- * tmp0; 1155 sum4 -= *v4-- * tmp0; 1156 sum5 -= *v5-- * tmp0; 1157 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1158 sum3 -= *v3-- * tmp0; 1159 sum4 -= *v4-- * tmp0; 1160 sum5 -= *v5-- * tmp0; 1161 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1162 sum4 -= *v4-- * tmp0; 1163 sum5 -= *v5-- * tmp0; 1164 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1165 sum5 -= *v5-- * tmp0; 1166 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1167 break; 1168 default: 1169 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 1170 } 1171 } 1172 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1173 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1174 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1175 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1176 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1177 PetscFunctionReturn(0); 1178 } 1179 1180 #undef __FUNCT__ 1181 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode" 1182 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info) 1183 { 1184 Mat C =B; 1185 Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data; 1186 IS isrow = b->row,isicol = b->icol; 1187 PetscErrorCode ierr; 1188 const PetscInt *r,*ic,*ics; 1189 const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag; 1190 PetscInt i,j,k,nz,nzL,row,*pj; 1191 const PetscInt *ajtmp,*bjtmp; 1192 MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4; 1193 const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4; 1194 FactorShiftCtx sctx; 1195 const PetscInt *ddiag; 1196 PetscReal rs; 1197 MatScalar d; 1198 PetscInt inod,nodesz,node_max,col; 1199 const PetscInt *ns; 1200 PetscInt *tmp_vec1,*tmp_vec2,*nsmap; 1201 1202 PetscFunctionBegin; 1203 /* MatPivotSetUp(): initialize shift context sctx */ 1204 ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr); 1205 1206 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1207 ddiag = a->diag; 1208 sctx.shift_top = info->zeropivot; 1209 for (i=0; i<n; i++) { 1210 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1211 d = (aa)[ddiag[i]]; 1212 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1213 v = aa+ai[i]; 1214 nz = ai[i+1] - ai[i]; 1215 for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]); 1216 if (rs>sctx.shift_top) sctx.shift_top = rs; 1217 } 1218 sctx.shift_top *= 1.1; 1219 sctx.nshift_max = 5; 1220 sctx.shift_lo = 0.; 1221 sctx.shift_hi = 1.; 1222 } 1223 1224 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1225 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1226 1227 ierr = PetscMalloc((4*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr); 1228 ierr = PetscMemzero(rtmp1,(4*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1229 rtmp2 = rtmp1 + n; 1230 rtmp3 = rtmp2 + n; 1231 rtmp4 = rtmp3 + n; 1232 ics = ic; 1233 1234 node_max = a->inode.node_count; 1235 ns = a->inode.size; 1236 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1237 1238 /* If max inode size > 4, split it into two inodes.*/ 1239 /* also map the inode sizes according to the ordering */ 1240 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1241 for (i=0,j=0; i<node_max; ++i,++j) { 1242 if (ns[i] > 4) { 1243 tmp_vec1[j] = 4; 1244 ++j; 1245 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1246 } else { 1247 tmp_vec1[j] = ns[i]; 1248 } 1249 } 1250 /* Use the correct node_max */ 1251 node_max = j; 1252 1253 /* Now reorder the inode info based on mat re-ordering info */ 1254 /* First create a row -> inode_size_array_index map */ 1255 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1256 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1257 for (i=0,row=0; i<node_max; i++) { 1258 nodesz = tmp_vec1[i]; 1259 for (j=0; j<nodesz; j++,row++) { 1260 nsmap[row] = i; 1261 } 1262 } 1263 /* Using nsmap, create a reordered ns structure */ 1264 for (i=0,j=0; i< node_max; i++) { 1265 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1266 tmp_vec2[i] = nodesz; 1267 j += nodesz; 1268 } 1269 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1270 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1271 1272 /* Now use the correct ns */ 1273 ns = tmp_vec2; 1274 1275 do { 1276 sctx.newshift = PETSC_FALSE; 1277 /* Now loop over each block-row, and do the factorization */ 1278 for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */ 1279 nodesz = ns[inod]; 1280 1281 switch (nodesz) { 1282 case 1: 1283 /*----------*/ 1284 /* zero rtmp1 */ 1285 /* L part */ 1286 nz = bi[i+1] - bi[i]; 1287 bjtmp = bj + bi[i]; 1288 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1289 1290 /* U part */ 1291 nz = bdiag[i]-bdiag[i+1]; 1292 bjtmp = bj + bdiag[i+1]+1; 1293 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1294 1295 /* load in initial (unfactored row) */ 1296 nz = ai[r[i]+1] - ai[r[i]]; 1297 ajtmp = aj + ai[r[i]]; 1298 v = aa + ai[r[i]]; 1299 for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j]; 1300 1301 /* ZeropivotApply() */ 1302 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1303 1304 /* elimination */ 1305 bjtmp = bj + bi[i]; 1306 row = *bjtmp++; 1307 nzL = bi[i+1] - bi[i]; 1308 for (k=0; k < nzL; k++) { 1309 pc = rtmp1 + row; 1310 if (*pc != 0.0) { 1311 pv = b->a + bdiag[row]; 1312 mul1 = *pc * (*pv); 1313 *pc = mul1; 1314 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1315 pv = b->a + bdiag[row+1]+1; 1316 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1317 for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1318 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1319 } 1320 row = *bjtmp++; 1321 } 1322 1323 /* finished row so stick it into b->a */ 1324 rs = 0.0; 1325 /* L part */ 1326 pv = b->a + bi[i]; 1327 pj = b->j + bi[i]; 1328 nz = bi[i+1] - bi[i]; 1329 for (j=0; j<nz; j++) { 1330 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1331 } 1332 1333 /* U part */ 1334 pv = b->a + bdiag[i+1]+1; 1335 pj = b->j + bdiag[i+1]+1; 1336 nz = bdiag[i] - bdiag[i+1]-1; 1337 for (j=0; j<nz; j++) { 1338 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1339 } 1340 1341 /* Check zero pivot */ 1342 sctx.rs = rs; 1343 sctx.pv = rtmp1[i]; 1344 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1345 if (sctx.newshift) break; 1346 1347 /* Mark diagonal and invert diagonal for simplier triangular solves */ 1348 pv = b->a + bdiag[i]; 1349 *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1350 break; 1351 1352 case 2: 1353 /*----------*/ 1354 /* zero rtmp1 and rtmp2 */ 1355 /* L part */ 1356 nz = bi[i+1] - bi[i]; 1357 bjtmp = bj + bi[i]; 1358 for (j=0; j<nz; j++) { 1359 col = bjtmp[j]; 1360 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1361 } 1362 1363 /* U part */ 1364 nz = bdiag[i]-bdiag[i+1]; 1365 bjtmp = bj + bdiag[i+1]+1; 1366 for (j=0; j<nz; j++) { 1367 col = bjtmp[j]; 1368 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1369 } 1370 1371 /* load in initial (unfactored row) */ 1372 nz = ai[r[i]+1] - ai[r[i]]; 1373 ajtmp = aj + ai[r[i]]; 1374 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; 1375 for (j=0; j<nz; j++) { 1376 col = ics[ajtmp[j]]; 1377 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; 1378 } 1379 /* ZeropivotApply(): shift the diagonal of the matrix */ 1380 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; 1381 1382 /* elimination */ 1383 bjtmp = bj + bi[i]; 1384 row = *bjtmp++; /* pivot row */ 1385 nzL = bi[i+1] - bi[i]; 1386 for (k=0; k < nzL; k++) { 1387 pc1 = rtmp1 + row; 1388 pc2 = rtmp2 + row; 1389 if (*pc1 != 0.0 || *pc2 != 0.0) { 1390 pv = b->a + bdiag[row]; 1391 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); 1392 *pc1 = mul1; *pc2 = mul2; 1393 1394 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1395 pv = b->a + bdiag[row+1]+1; 1396 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1397 for (j=0; j<nz; j++) { 1398 col = pj[j]; 1399 rtmp1[col] -= mul1 * pv[j]; 1400 rtmp2[col] -= mul2 * pv[j]; 1401 } 1402 ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr); 1403 } 1404 row = *bjtmp++; 1405 } 1406 1407 /* finished row i; check zero pivot, then stick row i into b->a */ 1408 rs = 0.0; 1409 /* L part */ 1410 pc1 = b->a + bi[i]; 1411 pj = b->j + bi[i]; 1412 nz = bi[i+1] - bi[i]; 1413 for (j=0; j<nz; j++) { 1414 col = pj[j]; 1415 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1416 } 1417 /* U part */ 1418 pc1 = b->a + bdiag[i+1]+1; 1419 pj = b->j + bdiag[i+1]+1; 1420 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1421 for (j=0; j<nz; j++) { 1422 col = pj[j]; 1423 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1424 } 1425 1426 sctx.rs = rs; 1427 sctx.pv = rtmp1[i]; 1428 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1429 if (sctx.newshift) break; 1430 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1431 *pc1 = 1.0/sctx.pv; 1432 1433 /* Now take care of diagonal 2x2 block. */ 1434 pc2 = rtmp2 + i; 1435 if (*pc2 != 0.0) { 1436 mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */ 1437 *pc2 = mul1; /* insert L entry */ 1438 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1439 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1440 for (j=0; j<nz; j++) { 1441 col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col]; 1442 } 1443 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1444 } 1445 1446 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1447 rs = 0.0; 1448 /* L part */ 1449 pc2 = b->a + bi[i+1]; 1450 pj = b->j + bi[i+1]; 1451 nz = bi[i+2] - bi[i+1]; 1452 for (j=0; j<nz; j++) { 1453 col = pj[j]; 1454 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1455 } 1456 /* U part */ 1457 pc2 = b->a + bdiag[i+2]+1; 1458 pj = b->j + bdiag[i+2]+1; 1459 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1460 for (j=0; j<nz; j++) { 1461 col = pj[j]; 1462 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1463 } 1464 1465 sctx.rs = rs; 1466 sctx.pv = rtmp2[i+1]; 1467 ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr); 1468 if (sctx.newshift) break; 1469 pc2 = b->a + bdiag[i+1]; 1470 *pc2 = 1.0/sctx.pv; 1471 break; 1472 1473 case 3: 1474 /*----------*/ 1475 /* zero rtmp */ 1476 /* L part */ 1477 nz = bi[i+1] - bi[i]; 1478 bjtmp = bj + bi[i]; 1479 for (j=0; j<nz; j++) { 1480 col = bjtmp[j]; 1481 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1482 } 1483 1484 /* U part */ 1485 nz = bdiag[i]-bdiag[i+1]; 1486 bjtmp = bj + bdiag[i+1]+1; 1487 for (j=0; j<nz; j++) { 1488 col = bjtmp[j]; 1489 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1490 } 1491 1492 /* load in initial (unfactored row) */ 1493 nz = ai[r[i]+1] - ai[r[i]]; 1494 ajtmp = aj + ai[r[i]]; 1495 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; 1496 for (j=0; j<nz; j++) { 1497 col = ics[ajtmp[j]]; 1498 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; 1499 } 1500 /* ZeropivotApply(): shift the diagonal of the matrix */ 1501 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; 1502 1503 /* elimination */ 1504 bjtmp = bj + bi[i]; 1505 row = *bjtmp++; /* pivot row */ 1506 nzL = bi[i+1] - bi[i]; 1507 for (k=0; k < nzL; k++) { 1508 pc1 = rtmp1 + row; 1509 pc2 = rtmp2 + row; 1510 pc3 = rtmp3 + row; 1511 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1512 pv = b->a + bdiag[row]; 1513 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); 1514 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; 1515 1516 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1517 pv = b->a + bdiag[row+1]+1; 1518 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1519 for (j=0; j<nz; j++) { 1520 col = pj[j]; 1521 rtmp1[col] -= mul1 * pv[j]; 1522 rtmp2[col] -= mul2 * pv[j]; 1523 rtmp3[col] -= mul3 * pv[j]; 1524 } 1525 ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr); 1526 } 1527 row = *bjtmp++; 1528 } 1529 1530 /* finished row i; check zero pivot, then stick row i into b->a */ 1531 rs = 0.0; 1532 /* L part */ 1533 pc1 = b->a + bi[i]; 1534 pj = b->j + bi[i]; 1535 nz = bi[i+1] - bi[i]; 1536 for (j=0; j<nz; j++) { 1537 col = pj[j]; 1538 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1539 } 1540 /* U part */ 1541 pc1 = b->a + bdiag[i+1]+1; 1542 pj = b->j + bdiag[i+1]+1; 1543 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1544 for (j=0; j<nz; j++) { 1545 col = pj[j]; 1546 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1547 } 1548 1549 sctx.rs = rs; 1550 sctx.pv = rtmp1[i]; 1551 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1552 if (sctx.newshift) break; 1553 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1554 *pc1 = 1.0/sctx.pv; 1555 1556 /* Now take care of 1st column of diagonal 3x3 block. */ 1557 pc2 = rtmp2 + i; 1558 pc3 = rtmp3 + i; 1559 if (*pc2 != 0.0 || *pc3 != 0.0) { 1560 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1561 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1562 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1563 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1564 for (j=0; j<nz; j++) { 1565 col = pj[j]; 1566 rtmp2[col] -= mul2 * rtmp1[col]; 1567 rtmp3[col] -= mul3 * rtmp1[col]; 1568 } 1569 ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr); 1570 } 1571 1572 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1573 rs = 0.0; 1574 /* L part */ 1575 pc2 = b->a + bi[i+1]; 1576 pj = b->j + bi[i+1]; 1577 nz = bi[i+2] - bi[i+1]; 1578 for (j=0; j<nz; j++) { 1579 col = pj[j]; 1580 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1581 } 1582 /* U part */ 1583 pc2 = b->a + bdiag[i+2]+1; 1584 pj = b->j + bdiag[i+2]+1; 1585 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1586 for (j=0; j<nz; j++) { 1587 col = pj[j]; 1588 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1589 } 1590 1591 sctx.rs = rs; 1592 sctx.pv = rtmp2[i+1]; 1593 ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr); 1594 if (sctx.newshift) break; 1595 pc2 = b->a + bdiag[i+1]; 1596 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1597 1598 /* Now take care of 2nd column of diagonal 3x3 block. */ 1599 pc3 = rtmp3 + i+1; 1600 if (*pc3 != 0.0) { 1601 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1602 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1603 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1604 for (j=0; j<nz; j++) { 1605 col = pj[j]; 1606 rtmp3[col] -= mul3 * rtmp2[col]; 1607 } 1608 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1609 } 1610 1611 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1612 rs = 0.0; 1613 /* L part */ 1614 pc3 = b->a + bi[i+2]; 1615 pj = b->j + bi[i+2]; 1616 nz = bi[i+3] - bi[i+2]; 1617 for (j=0; j<nz; j++) { 1618 col = pj[j]; 1619 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1620 } 1621 /* U part */ 1622 pc3 = b->a + bdiag[i+3]+1; 1623 pj = b->j + bdiag[i+3]+1; 1624 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1625 for (j=0; j<nz; j++) { 1626 col = pj[j]; 1627 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1628 } 1629 1630 sctx.rs = rs; 1631 sctx.pv = rtmp3[i+2]; 1632 ierr = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr); 1633 if (sctx.newshift) break; 1634 pc3 = b->a + bdiag[i+2]; 1635 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1636 break; 1637 case 4: 1638 /*----------*/ 1639 /* zero rtmp */ 1640 /* L part */ 1641 nz = bi[i+1] - bi[i]; 1642 bjtmp = bj + bi[i]; 1643 for (j=0; j<nz; j++) { 1644 col = bjtmp[j]; 1645 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0; 1646 } 1647 1648 /* U part */ 1649 nz = bdiag[i]-bdiag[i+1]; 1650 bjtmp = bj + bdiag[i+1]+1; 1651 for (j=0; j<nz; j++) { 1652 col = bjtmp[j]; 1653 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0; 1654 } 1655 1656 /* load in initial (unfactored row) */ 1657 nz = ai[r[i]+1] - ai[r[i]]; 1658 ajtmp = aj + ai[r[i]]; 1659 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3]; 1660 for (j=0; j<nz; j++) { 1661 col = ics[ajtmp[j]]; 1662 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j]; 1663 } 1664 /* ZeropivotApply(): shift the diagonal of the matrix */ 1665 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount; 1666 1667 /* elimination */ 1668 bjtmp = bj + bi[i]; 1669 row = *bjtmp++; /* pivot row */ 1670 nzL = bi[i+1] - bi[i]; 1671 for (k=0; k < nzL; k++) { 1672 pc1 = rtmp1 + row; 1673 pc2 = rtmp2 + row; 1674 pc3 = rtmp3 + row; 1675 pc4 = rtmp4 + row; 1676 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1677 pv = b->a + bdiag[row]; 1678 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv); 1679 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4; 1680 1681 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1682 pv = b->a + bdiag[row+1]+1; 1683 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1684 for (j=0; j<nz; j++) { 1685 col = pj[j]; 1686 rtmp1[col] -= mul1 * pv[j]; 1687 rtmp2[col] -= mul2 * pv[j]; 1688 rtmp3[col] -= mul3 * pv[j]; 1689 rtmp4[col] -= mul4 * pv[j]; 1690 } 1691 ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr); 1692 } 1693 row = *bjtmp++; 1694 } 1695 1696 /* finished row i; check zero pivot, then stick row i into b->a */ 1697 rs = 0.0; 1698 /* L part */ 1699 pc1 = b->a + bi[i]; 1700 pj = b->j + bi[i]; 1701 nz = bi[i+1] - bi[i]; 1702 for (j=0; j<nz; j++) { 1703 col = pj[j]; 1704 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1705 } 1706 /* U part */ 1707 pc1 = b->a + bdiag[i+1]+1; 1708 pj = b->j + bdiag[i+1]+1; 1709 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1710 for (j=0; j<nz; j++) { 1711 col = pj[j]; 1712 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1713 } 1714 1715 sctx.rs = rs; 1716 sctx.pv = rtmp1[i]; 1717 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1718 if (sctx.newshift) break; 1719 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1720 *pc1 = 1.0/sctx.pv; 1721 1722 /* Now take care of 1st column of diagonal 4x4 block. */ 1723 pc2 = rtmp2 + i; 1724 pc3 = rtmp3 + i; 1725 pc4 = rtmp4 + i; 1726 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1727 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1728 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1729 mul4 = (*pc4)*(*pc1); *pc4 = mul4; 1730 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1731 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1732 for (j=0; j<nz; j++) { 1733 col = pj[j]; 1734 rtmp2[col] -= mul2 * rtmp1[col]; 1735 rtmp3[col] -= mul3 * rtmp1[col]; 1736 rtmp4[col] -= mul4 * rtmp1[col]; 1737 } 1738 ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr); 1739 } 1740 1741 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1742 rs = 0.0; 1743 /* L part */ 1744 pc2 = b->a + bi[i+1]; 1745 pj = b->j + bi[i+1]; 1746 nz = bi[i+2] - bi[i+1]; 1747 for (j=0; j<nz; j++) { 1748 col = pj[j]; 1749 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1750 } 1751 /* U part */ 1752 pc2 = b->a + bdiag[i+2]+1; 1753 pj = b->j + bdiag[i+2]+1; 1754 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1755 for (j=0; j<nz; j++) { 1756 col = pj[j]; 1757 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1758 } 1759 1760 sctx.rs = rs; 1761 sctx.pv = rtmp2[i+1]; 1762 ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr); 1763 if (sctx.newshift) break; 1764 pc2 = b->a + bdiag[i+1]; 1765 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1766 1767 /* Now take care of 2nd column of diagonal 4x4 block. */ 1768 pc3 = rtmp3 + i+1; 1769 pc4 = rtmp4 + i+1; 1770 if (*pc3 != 0.0 || *pc4 != 0.0) { 1771 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1772 mul4 = (*pc4)*(*pc2); *pc4 = mul4; 1773 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1774 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1775 for (j=0; j<nz; j++) { 1776 col = pj[j]; 1777 rtmp3[col] -= mul3 * rtmp2[col]; 1778 rtmp4[col] -= mul4 * rtmp2[col]; 1779 } 1780 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1781 } 1782 1783 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1784 rs = 0.0; 1785 /* L part */ 1786 pc3 = b->a + bi[i+2]; 1787 pj = b->j + bi[i+2]; 1788 nz = bi[i+3] - bi[i+2]; 1789 for (j=0; j<nz; j++) { 1790 col = pj[j]; 1791 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1792 } 1793 /* U part */ 1794 pc3 = b->a + bdiag[i+3]+1; 1795 pj = b->j + bdiag[i+3]+1; 1796 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1797 for (j=0; j<nz; j++) { 1798 col = pj[j]; 1799 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1800 } 1801 1802 sctx.rs = rs; 1803 sctx.pv = rtmp3[i+2]; 1804 ierr = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr); 1805 if (sctx.newshift) break; 1806 pc3 = b->a + bdiag[i+2]; 1807 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1808 1809 /* Now take care of 3rd column of diagonal 4x4 block. */ 1810 pc4 = rtmp4 + i+2; 1811 if (*pc4 != 0.0) { 1812 mul4 = (*pc4)*(*pc3); *pc4 = mul4; 1813 pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */ 1814 nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */ 1815 for (j=0; j<nz; j++) { 1816 col = pj[j]; 1817 rtmp4[col] -= mul4 * rtmp3[col]; 1818 } 1819 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1820 } 1821 1822 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1823 rs = 0.0; 1824 /* L part */ 1825 pc4 = b->a + bi[i+3]; 1826 pj = b->j + bi[i+3]; 1827 nz = bi[i+4] - bi[i+3]; 1828 for (j=0; j<nz; j++) { 1829 col = pj[j]; 1830 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1831 } 1832 /* U part */ 1833 pc4 = b->a + bdiag[i+4]+1; 1834 pj = b->j + bdiag[i+4]+1; 1835 nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */ 1836 for (j=0; j<nz; j++) { 1837 col = pj[j]; 1838 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1839 } 1840 1841 sctx.rs = rs; 1842 sctx.pv = rtmp4[i+3]; 1843 ierr = MatPivotCheck(A,info,&sctx,i+3);CHKERRQ(ierr); 1844 if (sctx.newshift) break; 1845 pc4 = b->a + bdiag[i+3]; 1846 *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */ 1847 break; 1848 1849 default: 1850 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 1851 } 1852 if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 1853 i += nodesz; /* Update the row */ 1854 } 1855 1856 /* MatPivotRefine() */ 1857 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) { 1858 /* 1859 * if no shift in this attempt & shifting & started shifting & can refine, 1860 * then try lower shift 1861 */ 1862 sctx.shift_hi = sctx.shift_fraction; 1863 sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.; 1864 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1865 sctx.newshift = PETSC_TRUE; 1866 sctx.nshift++; 1867 } 1868 } while (sctx.newshift); 1869 1870 ierr = PetscFree(rtmp1);CHKERRQ(ierr); 1871 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1872 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1873 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1874 1875 C->ops->solve = MatSolve_SeqAIJ; 1876 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1877 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1878 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1879 C->ops->matsolve = MatMatSolve_SeqAIJ; 1880 C->assembled = PETSC_TRUE; 1881 C->preallocated = PETSC_TRUE; 1882 1883 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 1884 1885 /* MatShiftView(A,info,&sctx) */ 1886 if (sctx.nshift) { 1887 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) { 1888 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 1889 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1890 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 1891 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 1892 ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr); 1893 } 1894 } 1895 ierr = Mat_CheckInode_FactorLU(C,PETSC_FALSE);CHKERRQ(ierr); 1896 PetscFunctionReturn(0); 1897 } 1898 1899 #undef __FUNCT__ 1900 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace" 1901 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info) 1902 { 1903 Mat C = B; 1904 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1905 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1906 PetscErrorCode ierr; 1907 const PetscInt *r,*ic,*c,*ics; 1908 PetscInt n = A->rmap->n,*bi = b->i; 1909 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1910 PetscInt i,j,idx,*bd = b->diag,node_max,nodesz; 1911 PetscInt *ai = a->i,*aj = a->j; 1912 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1913 PetscScalar mul1,mul2,mul3,tmp; 1914 MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; 1915 const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; 1916 PetscReal rs=0.0; 1917 FactorShiftCtx sctx; 1918 1919 PetscFunctionBegin; 1920 sctx.shift_top = 0; 1921 sctx.nshift_max = 0; 1922 sctx.shift_lo = 0; 1923 sctx.shift_hi = 0; 1924 sctx.shift_fraction = 0; 1925 1926 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1927 if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1928 sctx.shift_top = 0; 1929 for (i=0; i<n; i++) { 1930 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1931 rs = 0.0; 1932 ajtmp = aj + ai[i]; 1933 rtmp1 = aa + ai[i]; 1934 nz = ai[i+1] - ai[i]; 1935 for (j=0; j<nz; j++) { 1936 if (*ajtmp != i) { 1937 rs += PetscAbsScalar(*rtmp1++); 1938 } else { 1939 rs -= PetscRealPart(*rtmp1++); 1940 } 1941 ajtmp++; 1942 } 1943 if (rs>sctx.shift_top) sctx.shift_top = rs; 1944 } 1945 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1946 sctx.shift_top *= 1.1; 1947 sctx.nshift_max = 5; 1948 sctx.shift_lo = 0.; 1949 sctx.shift_hi = 1.; 1950 } 1951 sctx.shift_amount = 0; 1952 sctx.nshift = 0; 1953 1954 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1955 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1956 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1957 ierr = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr); 1958 ierr = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1959 ics = ic; 1960 rtmp22 = rtmp11 + n; 1961 rtmp33 = rtmp22 + n; 1962 1963 node_max = a->inode.node_count; 1964 ns = a->inode.size; 1965 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1966 1967 /* If max inode size > 3, split it into two inodes.*/ 1968 /* also map the inode sizes according to the ordering */ 1969 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1970 for (i=0,j=0; i<node_max; ++i,++j) { 1971 if (ns[i]>3) { 1972 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1973 ++j; 1974 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1975 } else { 1976 tmp_vec1[j] = ns[i]; 1977 } 1978 } 1979 /* Use the correct node_max */ 1980 node_max = j; 1981 1982 /* Now reorder the inode info based on mat re-ordering info */ 1983 /* First create a row -> inode_size_array_index map */ 1984 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1985 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1986 for (i=0,row=0; i<node_max; i++) { 1987 nodesz = tmp_vec1[i]; 1988 for (j=0; j<nodesz; j++,row++) { 1989 nsmap[row] = i; 1990 } 1991 } 1992 /* Using nsmap, create a reordered ns structure */ 1993 for (i=0,j=0; i< node_max; i++) { 1994 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1995 tmp_vec2[i] = nodesz; 1996 j += nodesz; 1997 } 1998 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1999 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 2000 /* Now use the correct ns */ 2001 ns = tmp_vec2; 2002 2003 do { 2004 sctx.newshift = PETSC_FALSE; 2005 /* Now loop over each block-row, and do the factorization */ 2006 for (i=0,row=0; i<node_max; i++) { 2007 nodesz = ns[i]; 2008 nz = bi[row+1] - bi[row]; 2009 bjtmp = bj + bi[row]; 2010 2011 switch (nodesz) { 2012 case 1: 2013 for (j=0; j<nz; j++) { 2014 idx = bjtmp[j]; 2015 rtmp11[idx] = 0.0; 2016 } 2017 2018 /* load in initial (unfactored row) */ 2019 idx = r[row]; 2020 nz_tmp = ai[idx+1] - ai[idx]; 2021 ajtmp = aj + ai[idx]; 2022 v1 = aa + ai[idx]; 2023 2024 for (j=0; j<nz_tmp; j++) { 2025 idx = ics[ajtmp[j]]; 2026 rtmp11[idx] = v1[j]; 2027 } 2028 rtmp11[ics[r[row]]] += sctx.shift_amount; 2029 2030 prow = *bjtmp++; 2031 while (prow < row) { 2032 pc1 = rtmp11 + prow; 2033 if (*pc1 != 0.0) { 2034 pv = ba + bd[prow]; 2035 pj = nbj + bd[prow]; 2036 mul1 = *pc1 * *pv++; 2037 *pc1 = mul1; 2038 nz_tmp = bi[prow+1] - bd[prow] - 1; 2039 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2040 for (j=0; j<nz_tmp; j++) { 2041 tmp = pv[j]; 2042 idx = pj[j]; 2043 rtmp11[idx] -= mul1 * tmp; 2044 } 2045 } 2046 prow = *bjtmp++; 2047 } 2048 pj = bj + bi[row]; 2049 pc1 = ba + bi[row]; 2050 2051 sctx.pv = rtmp11[row]; 2052 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 2053 rs = 0.0; 2054 for (j=0; j<nz; j++) { 2055 idx = pj[j]; 2056 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2057 if (idx != row) rs += PetscAbsScalar(pc1[j]); 2058 } 2059 sctx.rs = rs; 2060 ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr); 2061 if (sctx.newshift) goto endofwhile; 2062 break; 2063 2064 case 2: 2065 for (j=0; j<nz; j++) { 2066 idx = bjtmp[j]; 2067 rtmp11[idx] = 0.0; 2068 rtmp22[idx] = 0.0; 2069 } 2070 2071 /* load in initial (unfactored row) */ 2072 idx = r[row]; 2073 nz_tmp = ai[idx+1] - ai[idx]; 2074 ajtmp = aj + ai[idx]; 2075 v1 = aa + ai[idx]; 2076 v2 = aa + ai[idx+1]; 2077 for (j=0; j<nz_tmp; j++) { 2078 idx = ics[ajtmp[j]]; 2079 rtmp11[idx] = v1[j]; 2080 rtmp22[idx] = v2[j]; 2081 } 2082 rtmp11[ics[r[row]]] += sctx.shift_amount; 2083 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2084 2085 prow = *bjtmp++; 2086 while (prow < row) { 2087 pc1 = rtmp11 + prow; 2088 pc2 = rtmp22 + prow; 2089 if (*pc1 != 0.0 || *pc2 != 0.0) { 2090 pv = ba + bd[prow]; 2091 pj = nbj + bd[prow]; 2092 mul1 = *pc1 * *pv; 2093 mul2 = *pc2 * *pv; 2094 ++pv; 2095 *pc1 = mul1; 2096 *pc2 = mul2; 2097 2098 nz_tmp = bi[prow+1] - bd[prow] - 1; 2099 for (j=0; j<nz_tmp; j++) { 2100 tmp = pv[j]; 2101 idx = pj[j]; 2102 rtmp11[idx] -= mul1 * tmp; 2103 rtmp22[idx] -= mul2 * tmp; 2104 } 2105 ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr); 2106 } 2107 prow = *bjtmp++; 2108 } 2109 2110 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2111 pc1 = rtmp11 + prow; 2112 pc2 = rtmp22 + prow; 2113 2114 sctx.pv = *pc1; 2115 pj = bj + bi[prow]; 2116 rs = 0.0; 2117 for (j=0; j<nz; j++) { 2118 idx = pj[j]; 2119 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2120 } 2121 sctx.rs = rs; 2122 ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr); 2123 if (sctx.newshift) goto endofwhile; 2124 2125 if (*pc2 != 0.0) { 2126 pj = nbj + bd[prow]; 2127 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 2128 *pc2 = mul2; 2129 nz_tmp = bi[prow+1] - bd[prow] - 1; 2130 for (j=0; j<nz_tmp; j++) { 2131 idx = pj[j]; 2132 tmp = rtmp11[idx]; 2133 rtmp22[idx] -= mul2 * tmp; 2134 } 2135 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2136 } 2137 2138 pj = bj + bi[row]; 2139 pc1 = ba + bi[row]; 2140 pc2 = ba + bi[row+1]; 2141 2142 sctx.pv = rtmp22[row+1]; 2143 rs = 0.0; 2144 rtmp11[row] = 1.0/rtmp11[row]; 2145 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2146 /* copy row entries from dense representation to sparse */ 2147 for (j=0; j<nz; j++) { 2148 idx = pj[j]; 2149 pc1[j] = rtmp11[idx]; 2150 pc2[j] = rtmp22[idx]; 2151 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 2152 } 2153 sctx.rs = rs; 2154 ierr = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr); 2155 if (sctx.newshift) goto endofwhile; 2156 break; 2157 2158 case 3: 2159 for (j=0; j<nz; j++) { 2160 idx = bjtmp[j]; 2161 rtmp11[idx] = 0.0; 2162 rtmp22[idx] = 0.0; 2163 rtmp33[idx] = 0.0; 2164 } 2165 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2166 idx = r[row]; 2167 nz_tmp = ai[idx+1] - ai[idx]; 2168 ajtmp = aj + ai[idx]; 2169 v1 = aa + ai[idx]; 2170 v2 = aa + ai[idx+1]; 2171 v3 = aa + ai[idx+2]; 2172 for (j=0; j<nz_tmp; j++) { 2173 idx = ics[ajtmp[j]]; 2174 rtmp11[idx] = v1[j]; 2175 rtmp22[idx] = v2[j]; 2176 rtmp33[idx] = v3[j]; 2177 } 2178 rtmp11[ics[r[row]]] += sctx.shift_amount; 2179 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2180 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 2181 2182 /* loop over all pivot row blocks above this row block */ 2183 prow = *bjtmp++; 2184 while (prow < row) { 2185 pc1 = rtmp11 + prow; 2186 pc2 = rtmp22 + prow; 2187 pc3 = rtmp33 + prow; 2188 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) { 2189 pv = ba + bd[prow]; 2190 pj = nbj + bd[prow]; 2191 mul1 = *pc1 * *pv; 2192 mul2 = *pc2 * *pv; 2193 mul3 = *pc3 * *pv; 2194 ++pv; 2195 *pc1 = mul1; 2196 *pc2 = mul2; 2197 *pc3 = mul3; 2198 2199 nz_tmp = bi[prow+1] - bd[prow] - 1; 2200 /* update this row based on pivot row */ 2201 for (j=0; j<nz_tmp; j++) { 2202 tmp = pv[j]; 2203 idx = pj[j]; 2204 rtmp11[idx] -= mul1 * tmp; 2205 rtmp22[idx] -= mul2 * tmp; 2206 rtmp33[idx] -= mul3 * tmp; 2207 } 2208 ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr); 2209 } 2210 prow = *bjtmp++; 2211 } 2212 2213 /* Now take care of diagonal 3x3 block in this set of rows */ 2214 /* note: prow = row here */ 2215 pc1 = rtmp11 + prow; 2216 pc2 = rtmp22 + prow; 2217 pc3 = rtmp33 + prow; 2218 2219 sctx.pv = *pc1; 2220 pj = bj + bi[prow]; 2221 rs = 0.0; 2222 for (j=0; j<nz; j++) { 2223 idx = pj[j]; 2224 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2225 } 2226 sctx.rs = rs; 2227 ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr); 2228 if (sctx.newshift) goto endofwhile; 2229 2230 if (*pc2 != 0.0 || *pc3 != 0.0) { 2231 mul2 = (*pc2)/(*pc1); 2232 mul3 = (*pc3)/(*pc1); 2233 *pc2 = mul2; 2234 *pc3 = mul3; 2235 nz_tmp = bi[prow+1] - bd[prow] - 1; 2236 pj = nbj + bd[prow]; 2237 for (j=0; j<nz_tmp; j++) { 2238 idx = pj[j]; 2239 tmp = rtmp11[idx]; 2240 rtmp22[idx] -= mul2 * tmp; 2241 rtmp33[idx] -= mul3 * tmp; 2242 } 2243 ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr); 2244 } 2245 ++prow; 2246 2247 pc2 = rtmp22 + prow; 2248 pc3 = rtmp33 + prow; 2249 sctx.pv = *pc2; 2250 pj = bj + bi[prow]; 2251 rs = 0.0; 2252 for (j=0; j<nz; j++) { 2253 idx = pj[j]; 2254 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2255 } 2256 sctx.rs = rs; 2257 ierr = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr); 2258 if (sctx.newshift) goto endofwhile; 2259 2260 if (*pc3 != 0.0) { 2261 mul3 = (*pc3)/(*pc2); 2262 *pc3 = mul3; 2263 pj = nbj + bd[prow]; 2264 nz_tmp = bi[prow+1] - bd[prow] - 1; 2265 for (j=0; j<nz_tmp; j++) { 2266 idx = pj[j]; 2267 tmp = rtmp22[idx]; 2268 rtmp33[idx] -= mul3 * tmp; 2269 } 2270 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2271 } 2272 2273 pj = bj + bi[row]; 2274 pc1 = ba + bi[row]; 2275 pc2 = ba + bi[row+1]; 2276 pc3 = ba + bi[row+2]; 2277 2278 sctx.pv = rtmp33[row+2]; 2279 rs = 0.0; 2280 rtmp11[row] = 1.0/rtmp11[row]; 2281 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2282 rtmp33[row+2] = 1.0/rtmp33[row+2]; 2283 /* copy row entries from dense representation to sparse */ 2284 for (j=0; j<nz; j++) { 2285 idx = pj[j]; 2286 pc1[j] = rtmp11[idx]; 2287 pc2[j] = rtmp22[idx]; 2288 pc3[j] = rtmp33[idx]; 2289 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 2290 } 2291 2292 sctx.rs = rs; 2293 ierr = MatPivotCheck(A,info,&sctx,row+2);CHKERRQ(ierr); 2294 if (sctx.newshift) goto endofwhile; 2295 break; 2296 2297 default: 2298 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 2299 } 2300 row += nodesz; /* Update the row */ 2301 } 2302 endofwhile:; 2303 } while (sctx.newshift); 2304 ierr = PetscFree(rtmp11);CHKERRQ(ierr); 2305 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 2306 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2307 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2308 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 2309 2310 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2311 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2312 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2313 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2314 C->assembled = PETSC_TRUE; 2315 C->preallocated = PETSC_TRUE; 2316 if (sctx.nshift) { 2317 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2318 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 2319 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2320 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 2321 } 2322 } 2323 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 2324 ierr = Mat_CheckInode(C,PETSC_FALSE);CHKERRQ(ierr); 2325 PetscFunctionReturn(0); 2326 } 2327 2328 2329 /* ----------------------------------------------------------- */ 2330 #undef __FUNCT__ 2331 #define __FUNCT__ "MatSolve_SeqAIJ_Inode" 2332 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 2333 { 2334 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2335 IS iscol = a->col,isrow = a->row; 2336 PetscErrorCode ierr; 2337 const PetscInt *r,*c,*rout,*cout; 2338 PetscInt i,j,n = A->rmap->n; 2339 PetscInt node_max,row,nsz,aii,i0,i1,nz; 2340 const PetscInt *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj; 2341 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 2342 PetscScalar sum1,sum2,sum3,sum4,sum5; 2343 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 2344 const PetscScalar *b; 2345 2346 PetscFunctionBegin; 2347 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 2348 node_max = a->inode.node_count; 2349 ns = a->inode.size; /* Node Size array */ 2350 2351 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2352 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2353 tmp = a->solve_work; 2354 2355 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2356 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2357 2358 /* forward solve the lower triangular */ 2359 tmps = tmp; 2360 aa = a_a; 2361 aj = a_j; 2362 ad = a->diag; 2363 2364 for (i = 0,row = 0; i< node_max; ++i) { 2365 nsz = ns[i]; 2366 aii = ai[row]; 2367 v1 = aa + aii; 2368 vi = aj + aii; 2369 nz = ai[row+1]- ai[row]; 2370 2371 if (i < node_max-1) { 2372 /* Prefetch the indices for the next block */ 2373 PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 2374 /* Prefetch the data for the next block */ 2375 PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 2376 } 2377 2378 switch (nsz) { /* Each loop in 'case' is unrolled */ 2379 case 1: 2380 sum1 = b[r[row]]; 2381 for (j=0; j<nz-1; j+=2) { 2382 i0 = vi[j]; 2383 i1 = vi[j+1]; 2384 tmp0 = tmps[i0]; 2385 tmp1 = tmps[i1]; 2386 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1; 2387 } 2388 if (j == nz-1) { 2389 tmp0 = tmps[vi[j]]; 2390 sum1 -= v1[j]*tmp0; 2391 } 2392 tmp[row++]=sum1; 2393 break; 2394 case 2: 2395 sum1 = b[r[row]]; 2396 sum2 = b[r[row+1]]; 2397 v2 = aa + ai[row+1]; 2398 2399 for (j=0; j<nz-1; j+=2) { 2400 i0 = vi[j]; 2401 i1 = vi[j+1]; 2402 tmp0 = tmps[i0]; 2403 tmp1 = tmps[i1]; 2404 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2405 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2406 } 2407 if (j == nz-1) { 2408 tmp0 = tmps[vi[j]]; 2409 sum1 -= v1[j] *tmp0; 2410 sum2 -= v2[j] *tmp0; 2411 } 2412 sum2 -= v2[nz] * sum1; 2413 tmp[row++]=sum1; 2414 tmp[row++]=sum2; 2415 break; 2416 case 3: 2417 sum1 = b[r[row]]; 2418 sum2 = b[r[row+1]]; 2419 sum3 = b[r[row+2]]; 2420 v2 = aa + ai[row+1]; 2421 v3 = aa + ai[row+2]; 2422 2423 for (j=0; j<nz-1; j+=2) { 2424 i0 = vi[j]; 2425 i1 = vi[j+1]; 2426 tmp0 = tmps[i0]; 2427 tmp1 = tmps[i1]; 2428 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2429 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2430 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2431 } 2432 if (j == nz-1) { 2433 tmp0 = tmps[vi[j]]; 2434 sum1 -= v1[j] *tmp0; 2435 sum2 -= v2[j] *tmp0; 2436 sum3 -= v3[j] *tmp0; 2437 } 2438 sum2 -= v2[nz] * sum1; 2439 sum3 -= v3[nz] * sum1; 2440 sum3 -= v3[nz+1] * sum2; 2441 tmp[row++]=sum1; 2442 tmp[row++]=sum2; 2443 tmp[row++]=sum3; 2444 break; 2445 2446 case 4: 2447 sum1 = b[r[row]]; 2448 sum2 = b[r[row+1]]; 2449 sum3 = b[r[row+2]]; 2450 sum4 = b[r[row+3]]; 2451 v2 = aa + ai[row+1]; 2452 v3 = aa + ai[row+2]; 2453 v4 = aa + ai[row+3]; 2454 2455 for (j=0; j<nz-1; j+=2) { 2456 i0 = vi[j]; 2457 i1 = vi[j+1]; 2458 tmp0 = tmps[i0]; 2459 tmp1 = tmps[i1]; 2460 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2461 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2462 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2463 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2464 } 2465 if (j == nz-1) { 2466 tmp0 = tmps[vi[j]]; 2467 sum1 -= v1[j] *tmp0; 2468 sum2 -= v2[j] *tmp0; 2469 sum3 -= v3[j] *tmp0; 2470 sum4 -= v4[j] *tmp0; 2471 } 2472 sum2 -= v2[nz] * sum1; 2473 sum3 -= v3[nz] * sum1; 2474 sum4 -= v4[nz] * sum1; 2475 sum3 -= v3[nz+1] * sum2; 2476 sum4 -= v4[nz+1] * sum2; 2477 sum4 -= v4[nz+2] * sum3; 2478 2479 tmp[row++]=sum1; 2480 tmp[row++]=sum2; 2481 tmp[row++]=sum3; 2482 tmp[row++]=sum4; 2483 break; 2484 case 5: 2485 sum1 = b[r[row]]; 2486 sum2 = b[r[row+1]]; 2487 sum3 = b[r[row+2]]; 2488 sum4 = b[r[row+3]]; 2489 sum5 = b[r[row+4]]; 2490 v2 = aa + ai[row+1]; 2491 v3 = aa + ai[row+2]; 2492 v4 = aa + ai[row+3]; 2493 v5 = aa + ai[row+4]; 2494 2495 for (j=0; j<nz-1; j+=2) { 2496 i0 = vi[j]; 2497 i1 = vi[j+1]; 2498 tmp0 = tmps[i0]; 2499 tmp1 = tmps[i1]; 2500 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2501 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2502 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2503 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2504 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1; 2505 } 2506 if (j == nz-1) { 2507 tmp0 = tmps[vi[j]]; 2508 sum1 -= v1[j] *tmp0; 2509 sum2 -= v2[j] *tmp0; 2510 sum3 -= v3[j] *tmp0; 2511 sum4 -= v4[j] *tmp0; 2512 sum5 -= v5[j] *tmp0; 2513 } 2514 2515 sum2 -= v2[nz] * sum1; 2516 sum3 -= v3[nz] * sum1; 2517 sum4 -= v4[nz] * sum1; 2518 sum5 -= v5[nz] * sum1; 2519 sum3 -= v3[nz+1] * sum2; 2520 sum4 -= v4[nz+1] * sum2; 2521 sum5 -= v5[nz+1] * sum2; 2522 sum4 -= v4[nz+2] * sum3; 2523 sum5 -= v5[nz+2] * sum3; 2524 sum5 -= v5[nz+3] * sum4; 2525 2526 tmp[row++]=sum1; 2527 tmp[row++]=sum2; 2528 tmp[row++]=sum3; 2529 tmp[row++]=sum4; 2530 tmp[row++]=sum5; 2531 break; 2532 default: 2533 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2534 } 2535 } 2536 /* backward solve the upper triangular */ 2537 for (i=node_max -1,row = n-1; i>=0; i--) { 2538 nsz = ns[i]; 2539 aii = ad[row+1] + 1; 2540 v1 = aa + aii; 2541 vi = aj + aii; 2542 nz = ad[row]- ad[row+1] - 1; 2543 2544 if (i > 0) { 2545 /* Prefetch the indices for the next block */ 2546 PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2547 /* Prefetch the data for the next block */ 2548 PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2549 } 2550 2551 switch (nsz) { /* Each loop in 'case' is unrolled */ 2552 case 1: 2553 sum1 = tmp[row]; 2554 2555 for (j=0; j<nz-1; j+=2) { 2556 i0 = vi[j]; 2557 i1 = vi[j+1]; 2558 tmp0 = tmps[i0]; 2559 tmp1 = tmps[i1]; 2560 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2561 } 2562 if (j == nz-1) { 2563 tmp0 = tmps[vi[j]]; 2564 sum1 -= v1[j]*tmp0; 2565 } 2566 x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2567 break; 2568 case 2: 2569 sum1 = tmp[row]; 2570 sum2 = tmp[row-1]; 2571 v2 = aa + ad[row] + 1; 2572 for (j=0; j<nz-1; j+=2) { 2573 i0 = vi[j]; 2574 i1 = vi[j+1]; 2575 tmp0 = tmps[i0]; 2576 tmp1 = tmps[i1]; 2577 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2578 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2579 } 2580 if (j == nz-1) { 2581 tmp0 = tmps[vi[j]]; 2582 sum1 -= v1[j]* tmp0; 2583 sum2 -= v2[j+1]* tmp0; 2584 } 2585 2586 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2587 sum2 -= v2[0] * tmp0; 2588 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2589 break; 2590 case 3: 2591 sum1 = tmp[row]; 2592 sum2 = tmp[row -1]; 2593 sum3 = tmp[row -2]; 2594 v2 = aa + ad[row] + 1; 2595 v3 = aa + ad[row -1] + 1; 2596 for (j=0; j<nz-1; j+=2) { 2597 i0 = vi[j]; 2598 i1 = vi[j+1]; 2599 tmp0 = tmps[i0]; 2600 tmp1 = tmps[i1]; 2601 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2602 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2603 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2604 } 2605 if (j== nz-1) { 2606 tmp0 = tmps[vi[j]]; 2607 sum1 -= v1[j] * tmp0; 2608 sum2 -= v2[j+1] * tmp0; 2609 sum3 -= v3[j+2] * tmp0; 2610 } 2611 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2612 sum2 -= v2[0]* tmp0; 2613 sum3 -= v3[1] * tmp0; 2614 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2615 sum3 -= v3[0]* tmp0; 2616 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2617 2618 break; 2619 case 4: 2620 sum1 = tmp[row]; 2621 sum2 = tmp[row -1]; 2622 sum3 = tmp[row -2]; 2623 sum4 = tmp[row -3]; 2624 v2 = aa + ad[row]+1; 2625 v3 = aa + ad[row -1]+1; 2626 v4 = aa + ad[row -2]+1; 2627 2628 for (j=0; j<nz-1; j+=2) { 2629 i0 = vi[j]; 2630 i1 = vi[j+1]; 2631 tmp0 = tmps[i0]; 2632 tmp1 = tmps[i1]; 2633 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2634 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2635 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2636 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2637 } 2638 if (j== nz-1) { 2639 tmp0 = tmps[vi[j]]; 2640 sum1 -= v1[j] * tmp0; 2641 sum2 -= v2[j+1] * tmp0; 2642 sum3 -= v3[j+2] * tmp0; 2643 sum4 -= v4[j+3] * tmp0; 2644 } 2645 2646 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2647 sum2 -= v2[0] * tmp0; 2648 sum3 -= v3[1] * tmp0; 2649 sum4 -= v4[2] * tmp0; 2650 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2651 sum3 -= v3[0] * tmp0; 2652 sum4 -= v4[1] * tmp0; 2653 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2654 sum4 -= v4[0] * tmp0; 2655 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2656 break; 2657 case 5: 2658 sum1 = tmp[row]; 2659 sum2 = tmp[row -1]; 2660 sum3 = tmp[row -2]; 2661 sum4 = tmp[row -3]; 2662 sum5 = tmp[row -4]; 2663 v2 = aa + ad[row]+1; 2664 v3 = aa + ad[row -1]+1; 2665 v4 = aa + ad[row -2]+1; 2666 v5 = aa + ad[row -3]+1; 2667 for (j=0; j<nz-1; j+=2) { 2668 i0 = vi[j]; 2669 i1 = vi[j+1]; 2670 tmp0 = tmps[i0]; 2671 tmp1 = tmps[i1]; 2672 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2673 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2674 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2675 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2676 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1; 2677 } 2678 if (j==nz-1) { 2679 tmp0 = tmps[vi[j]]; 2680 sum1 -= v1[j] * tmp0; 2681 sum2 -= v2[j+1] * tmp0; 2682 sum3 -= v3[j+2] * tmp0; 2683 sum4 -= v4[j+3] * tmp0; 2684 sum5 -= v5[j+4] * tmp0; 2685 } 2686 2687 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2688 sum2 -= v2[0] * tmp0; 2689 sum3 -= v3[1] * tmp0; 2690 sum4 -= v4[2] * tmp0; 2691 sum5 -= v5[3] * tmp0; 2692 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2693 sum3 -= v3[0] * tmp0; 2694 sum4 -= v4[1] * tmp0; 2695 sum5 -= v5[2] * tmp0; 2696 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2697 sum4 -= v4[0] * tmp0; 2698 sum5 -= v5[1] * tmp0; 2699 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2700 sum5 -= v5[0] * tmp0; 2701 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--; 2702 break; 2703 default: 2704 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2705 } 2706 } 2707 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2708 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2709 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2710 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2711 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 2712 PetscFunctionReturn(0); 2713 } 2714 2715 2716 /* 2717 Makes a longer coloring[] array and calls the usual code with that 2718 */ 2719 #undef __FUNCT__ 2720 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode" 2721 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 2722 { 2723 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 2724 PetscErrorCode ierr; 2725 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 2726 PetscInt *colorused,i; 2727 ISColoringValue *newcolor; 2728 2729 PetscFunctionBegin; 2730 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr); 2731 /* loop over inodes, marking a color for each column*/ 2732 row = 0; 2733 for (i=0; i<m; i++) { 2734 for (j=0; j<ns[i]; j++) { 2735 newcolor[row++] = coloring[i] + j*ncolors; 2736 } 2737 } 2738 2739 /* eliminate unneeded colors */ 2740 ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr); 2741 ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr); 2742 for (i=0; i<n; i++) { 2743 colorused[newcolor[i]] = 1; 2744 } 2745 2746 for (i=1; i<5*ncolors; i++) { 2747 colorused[i] += colorused[i-1]; 2748 } 2749 ncolors = colorused[5*ncolors-1]; 2750 for (i=0; i<n; i++) { 2751 newcolor[i] = colorused[newcolor[i]]-1; 2752 } 2753 ierr = PetscFree(colorused);CHKERRQ(ierr); 2754 ierr = ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); 2755 ierr = PetscFree(coloring);CHKERRQ(ierr); 2756 PetscFunctionReturn(0); 2757 } 2758 2759 #include <petsc-private/kernels/blockinvert.h> 2760 2761 #undef __FUNCT__ 2762 #define __FUNCT__ "MatSOR_SeqAIJ_Inode" 2763 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 2764 { 2765 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2766 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3; 2767 MatScalar *ibdiag,*bdiag,work[25],*t; 2768 PetscScalar *x,tmp4,tmp5,x1,x2,x3,x4,x5; 2769 const MatScalar *v = a->a,*v1,*v2,*v3,*v4,*v5; 2770 const PetscScalar *xb, *b; 2771 PetscReal zeropivot = 1.0e-15, shift = 0.0; 2772 PetscErrorCode ierr; 2773 PetscInt n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2; 2774 PetscInt sz,k,ipvt[5]; 2775 const PetscInt *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i; 2776 2777 PetscFunctionBegin; 2778 if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 2779 if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 2780 2781 if (!a->inode.ibdiagvalid) { 2782 if (!a->inode.ibdiag) { 2783 /* calculate space needed for diagonal blocks */ 2784 for (i=0; i<m; i++) { 2785 cnt += sizes[i]*sizes[i]; 2786 } 2787 a->inode.bdiagsize = cnt; 2788 2789 ierr = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr); 2790 } 2791 2792 /* copy over the diagonal blocks and invert them */ 2793 ibdiag = a->inode.ibdiag; 2794 bdiag = a->inode.bdiag; 2795 cnt = 0; 2796 for (i=0, row = 0; i<m; i++) { 2797 for (j=0; j<sizes[i]; j++) { 2798 for (k=0; k<sizes[i]; k++) { 2799 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 2800 } 2801 } 2802 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 2803 2804 switch (sizes[i]) { 2805 case 1: 2806 /* Create matrix data structure */ 2807 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 2808 ibdiag[cnt] = 1.0/ibdiag[cnt]; 2809 break; 2810 case 2: 2811 ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr); 2812 break; 2813 case 3: 2814 ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr); 2815 break; 2816 case 4: 2817 ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr); 2818 break; 2819 case 5: 2820 ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr); 2821 break; 2822 default: 2823 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2824 } 2825 cnt += sizes[i]*sizes[i]; 2826 row += sizes[i]; 2827 } 2828 a->inode.ibdiagvalid = PETSC_TRUE; 2829 } 2830 ibdiag = a->inode.ibdiag; 2831 bdiag = a->inode.bdiag; 2832 t = a->inode.ssor_work; 2833 2834 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2835 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2836 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2837 if (flag & SOR_ZERO_INITIAL_GUESS) { 2838 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 2839 2840 for (i=0, row=0; i<m; i++) { 2841 sz = diag[row] - ii[row]; 2842 v1 = a->a + ii[row]; 2843 idx = a->j + ii[row]; 2844 2845 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2846 switch (sizes[i]) { 2847 case 1: 2848 2849 sum1 = b[row]; 2850 for (n = 0; n<sz-1; n+=2) { 2851 i1 = idx[0]; 2852 i2 = idx[1]; 2853 idx += 2; 2854 tmp0 = x[i1]; 2855 tmp1 = x[i2]; 2856 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2857 } 2858 2859 if (n == sz-1) { 2860 tmp0 = x[*idx]; 2861 sum1 -= *v1 * tmp0; 2862 } 2863 t[row] = sum1; 2864 x[row++] = sum1*(*ibdiag++); 2865 break; 2866 case 2: 2867 v2 = a->a + ii[row+1]; 2868 sum1 = b[row]; 2869 sum2 = b[row+1]; 2870 for (n = 0; n<sz-1; n+=2) { 2871 i1 = idx[0]; 2872 i2 = idx[1]; 2873 idx += 2; 2874 tmp0 = x[i1]; 2875 tmp1 = x[i2]; 2876 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2877 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2878 } 2879 2880 if (n == sz-1) { 2881 tmp0 = x[*idx]; 2882 sum1 -= v1[0] * tmp0; 2883 sum2 -= v2[0] * tmp0; 2884 } 2885 t[row] = sum1; 2886 t[row+1] = sum2; 2887 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2888 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2889 ibdiag += 4; 2890 break; 2891 case 3: 2892 v2 = a->a + ii[row+1]; 2893 v3 = a->a + ii[row+2]; 2894 sum1 = b[row]; 2895 sum2 = b[row+1]; 2896 sum3 = b[row+2]; 2897 for (n = 0; n<sz-1; n+=2) { 2898 i1 = idx[0]; 2899 i2 = idx[1]; 2900 idx += 2; 2901 tmp0 = x[i1]; 2902 tmp1 = x[i2]; 2903 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2904 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2905 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2906 } 2907 2908 if (n == sz-1) { 2909 tmp0 = x[*idx]; 2910 sum1 -= v1[0] * tmp0; 2911 sum2 -= v2[0] * tmp0; 2912 sum3 -= v3[0] * tmp0; 2913 } 2914 t[row] = sum1; 2915 t[row+1] = sum2; 2916 t[row+2] = sum3; 2917 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2918 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2919 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2920 ibdiag += 9; 2921 break; 2922 case 4: 2923 v2 = a->a + ii[row+1]; 2924 v3 = a->a + ii[row+2]; 2925 v4 = a->a + ii[row+3]; 2926 sum1 = b[row]; 2927 sum2 = b[row+1]; 2928 sum3 = b[row+2]; 2929 sum4 = b[row+3]; 2930 for (n = 0; n<sz-1; n+=2) { 2931 i1 = idx[0]; 2932 i2 = idx[1]; 2933 idx += 2; 2934 tmp0 = x[i1]; 2935 tmp1 = x[i2]; 2936 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2937 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2938 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2939 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2940 } 2941 2942 if (n == sz-1) { 2943 tmp0 = x[*idx]; 2944 sum1 -= v1[0] * tmp0; 2945 sum2 -= v2[0] * tmp0; 2946 sum3 -= v3[0] * tmp0; 2947 sum4 -= v4[0] * tmp0; 2948 } 2949 t[row] = sum1; 2950 t[row+1] = sum2; 2951 t[row+2] = sum3; 2952 t[row+3] = sum4; 2953 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2954 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2955 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2956 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2957 ibdiag += 16; 2958 break; 2959 case 5: 2960 v2 = a->a + ii[row+1]; 2961 v3 = a->a + ii[row+2]; 2962 v4 = a->a + ii[row+3]; 2963 v5 = a->a + ii[row+4]; 2964 sum1 = b[row]; 2965 sum2 = b[row+1]; 2966 sum3 = b[row+2]; 2967 sum4 = b[row+3]; 2968 sum5 = b[row+4]; 2969 for (n = 0; n<sz-1; n+=2) { 2970 i1 = idx[0]; 2971 i2 = idx[1]; 2972 idx += 2; 2973 tmp0 = x[i1]; 2974 tmp1 = x[i2]; 2975 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2976 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2977 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2978 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2979 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2980 } 2981 2982 if (n == sz-1) { 2983 tmp0 = x[*idx]; 2984 sum1 -= v1[0] * tmp0; 2985 sum2 -= v2[0] * tmp0; 2986 sum3 -= v3[0] * tmp0; 2987 sum4 -= v4[0] * tmp0; 2988 sum5 -= v5[0] * tmp0; 2989 } 2990 t[row] = sum1; 2991 t[row+1] = sum2; 2992 t[row+2] = sum3; 2993 t[row+3] = sum4; 2994 t[row+4] = sum5; 2995 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2996 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2997 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2998 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2999 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3000 ibdiag += 25; 3001 break; 3002 default: 3003 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3004 } 3005 } 3006 3007 xb = t; 3008 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3009 } else xb = b; 3010 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3011 3012 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3013 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3014 ibdiag -= sizes[i]*sizes[i]; 3015 sz = ii[row+1] - diag[row] - 1; 3016 v1 = a->a + diag[row] + 1; 3017 idx = a->j + diag[row] + 1; 3018 3019 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3020 switch (sizes[i]) { 3021 case 1: 3022 3023 sum1 = xb[row]; 3024 for (n = 0; n<sz-1; n+=2) { 3025 i1 = idx[0]; 3026 i2 = idx[1]; 3027 idx += 2; 3028 tmp0 = x[i1]; 3029 tmp1 = x[i2]; 3030 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3031 } 3032 3033 if (n == sz-1) { 3034 tmp0 = x[*idx]; 3035 sum1 -= *v1*tmp0; 3036 } 3037 x[row--] = sum1*(*ibdiag); 3038 break; 3039 3040 case 2: 3041 3042 sum1 = xb[row]; 3043 sum2 = xb[row-1]; 3044 /* note that sum1 is associated with the second of the two rows */ 3045 v2 = a->a + diag[row-1] + 2; 3046 for (n = 0; n<sz-1; n+=2) { 3047 i1 = idx[0]; 3048 i2 = idx[1]; 3049 idx += 2; 3050 tmp0 = x[i1]; 3051 tmp1 = x[i2]; 3052 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3053 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3054 } 3055 3056 if (n == sz-1) { 3057 tmp0 = x[*idx]; 3058 sum1 -= *v1*tmp0; 3059 sum2 -= *v2*tmp0; 3060 } 3061 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3062 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3063 break; 3064 case 3: 3065 3066 sum1 = xb[row]; 3067 sum2 = xb[row-1]; 3068 sum3 = xb[row-2]; 3069 v2 = a->a + diag[row-1] + 2; 3070 v3 = a->a + diag[row-2] + 3; 3071 for (n = 0; n<sz-1; n+=2) { 3072 i1 = idx[0]; 3073 i2 = idx[1]; 3074 idx += 2; 3075 tmp0 = x[i1]; 3076 tmp1 = x[i2]; 3077 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3078 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3079 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3080 } 3081 3082 if (n == sz-1) { 3083 tmp0 = x[*idx]; 3084 sum1 -= *v1*tmp0; 3085 sum2 -= *v2*tmp0; 3086 sum3 -= *v3*tmp0; 3087 } 3088 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3089 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3090 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3091 break; 3092 case 4: 3093 3094 sum1 = xb[row]; 3095 sum2 = xb[row-1]; 3096 sum3 = xb[row-2]; 3097 sum4 = xb[row-3]; 3098 v2 = a->a + diag[row-1] + 2; 3099 v3 = a->a + diag[row-2] + 3; 3100 v4 = a->a + diag[row-3] + 4; 3101 for (n = 0; n<sz-1; n+=2) { 3102 i1 = idx[0]; 3103 i2 = idx[1]; 3104 idx += 2; 3105 tmp0 = x[i1]; 3106 tmp1 = x[i2]; 3107 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3108 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3109 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3110 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3111 } 3112 3113 if (n == sz-1) { 3114 tmp0 = x[*idx]; 3115 sum1 -= *v1*tmp0; 3116 sum2 -= *v2*tmp0; 3117 sum3 -= *v3*tmp0; 3118 sum4 -= *v4*tmp0; 3119 } 3120 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3121 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3122 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3123 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3124 break; 3125 case 5: 3126 3127 sum1 = xb[row]; 3128 sum2 = xb[row-1]; 3129 sum3 = xb[row-2]; 3130 sum4 = xb[row-3]; 3131 sum5 = xb[row-4]; 3132 v2 = a->a + diag[row-1] + 2; 3133 v3 = a->a + diag[row-2] + 3; 3134 v4 = a->a + diag[row-3] + 4; 3135 v5 = a->a + diag[row-4] + 5; 3136 for (n = 0; n<sz-1; n+=2) { 3137 i1 = idx[0]; 3138 i2 = idx[1]; 3139 idx += 2; 3140 tmp0 = x[i1]; 3141 tmp1 = x[i2]; 3142 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3143 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3144 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3145 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3146 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3147 } 3148 3149 if (n == sz-1) { 3150 tmp0 = x[*idx]; 3151 sum1 -= *v1*tmp0; 3152 sum2 -= *v2*tmp0; 3153 sum3 -= *v3*tmp0; 3154 sum4 -= *v4*tmp0; 3155 sum5 -= *v5*tmp0; 3156 } 3157 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3158 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3159 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3160 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3161 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3162 break; 3163 default: 3164 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3165 } 3166 } 3167 3168 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3169 } 3170 its--; 3171 } 3172 while (its--) { 3173 3174 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 3175 ibdiag = a->inode.ibdiag; 3176 3177 for (i=0, row=0; i<m; i++) { 3178 sz = ii[row + 1] - ii[row]; 3179 v1 = a->a + ii[row]; 3180 idx = a->j + ii[row]; 3181 3182 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3183 switch (sizes[i]) { 3184 case 1: 3185 3186 sum1 = b[row]; 3187 for (n = 0; n<sz-1; n+=2) { 3188 i1 = idx[0]; 3189 i2 = idx[1]; 3190 idx += 2; 3191 tmp0 = x[i1]; 3192 tmp1 = x[i2]; 3193 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3194 } 3195 3196 if (n == sz-1) { 3197 tmp0 = x[*idx]; 3198 sum1 -= *v1 * tmp0; 3199 } 3200 3201 /* in MatSOR_SeqAIJ this line would be 3202 * 3203 * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++); 3204 * 3205 * but omega == 1, so this becomes 3206 * 3207 * x[row] = (sum1+(*bdiag++)*x[row])*(*ibdiag++); 3208 * 3209 * but bdiag and ibdiag cancel each other, so we can change this 3210 * to adding sum1*(*ibdiag++). We can skip bdiag for the larger 3211 * block sizes as well 3212 */ 3213 x[row++] += sum1*(*ibdiag++); 3214 break; 3215 case 2: 3216 v2 = a->a + ii[row+1]; 3217 sum1 = b[row]; 3218 sum2 = b[row+1]; 3219 for (n = 0; n<sz-1; n+=2) { 3220 i1 = idx[0]; 3221 i2 = idx[1]; 3222 idx += 2; 3223 tmp0 = x[i1]; 3224 tmp1 = x[i2]; 3225 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3226 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3227 } 3228 3229 if (n == sz-1) { 3230 tmp0 = x[*idx]; 3231 sum1 -= v1[0] * tmp0; 3232 sum2 -= v2[0] * tmp0; 3233 } 3234 x[row++] += sum1*ibdiag[0] + sum2*ibdiag[2]; 3235 x[row++] += sum1*ibdiag[1] + sum2*ibdiag[3]; 3236 ibdiag += 4; 3237 break; 3238 case 3: 3239 v2 = a->a + ii[row+1]; 3240 v3 = a->a + ii[row+2]; 3241 sum1 = b[row]; 3242 sum2 = b[row+1]; 3243 sum3 = b[row+2]; 3244 for (n = 0; n<sz-1; n+=2) { 3245 i1 = idx[0]; 3246 i2 = idx[1]; 3247 idx += 2; 3248 tmp0 = x[i1]; 3249 tmp1 = x[i2]; 3250 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3251 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3252 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3253 } 3254 3255 if (n == sz-1) { 3256 tmp0 = x[*idx]; 3257 sum1 -= v1[0] * tmp0; 3258 sum2 -= v2[0] * tmp0; 3259 sum3 -= v3[0] * tmp0; 3260 } 3261 x[row++] += sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3262 x[row++] += sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3263 x[row++] += sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3264 ibdiag += 9; 3265 break; 3266 case 4: 3267 v2 = a->a + ii[row+1]; 3268 v3 = a->a + ii[row+2]; 3269 v4 = a->a + ii[row+3]; 3270 sum1 = b[row]; 3271 sum2 = b[row+1]; 3272 sum3 = b[row+2]; 3273 sum4 = b[row+3]; 3274 for (n = 0; n<sz-1; n+=2) { 3275 i1 = idx[0]; 3276 i2 = idx[1]; 3277 idx += 2; 3278 tmp0 = x[i1]; 3279 tmp1 = x[i2]; 3280 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3281 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3282 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3283 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3284 } 3285 3286 if (n == sz-1) { 3287 tmp0 = x[*idx]; 3288 sum1 -= v1[0] * tmp0; 3289 sum2 -= v2[0] * tmp0; 3290 sum3 -= v3[0] * tmp0; 3291 sum4 -= v4[0] * tmp0; 3292 } 3293 x[row++] += sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3294 x[row++] += sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3295 x[row++] += sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3296 x[row++] += sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3297 ibdiag += 16; 3298 break; 3299 case 5: 3300 v2 = a->a + ii[row+1]; 3301 v3 = a->a + ii[row+2]; 3302 v4 = a->a + ii[row+3]; 3303 v5 = a->a + ii[row+4]; 3304 sum1 = b[row]; 3305 sum2 = b[row+1]; 3306 sum3 = b[row+2]; 3307 sum4 = b[row+3]; 3308 sum5 = b[row+4]; 3309 for (n = 0; n<sz-1; n+=2) { 3310 i1 = idx[0]; 3311 i2 = idx[1]; 3312 idx += 2; 3313 tmp0 = x[i1]; 3314 tmp1 = x[i2]; 3315 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3316 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3317 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3318 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3319 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3320 } 3321 3322 if (n == sz-1) { 3323 tmp0 = x[*idx]; 3324 sum1 -= v1[0] * tmp0; 3325 sum2 -= v2[0] * tmp0; 3326 sum3 -= v3[0] * tmp0; 3327 sum4 -= v4[0] * tmp0; 3328 sum5 -= v5[0] * tmp0; 3329 } 3330 x[row++] += sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3331 x[row++] += sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3332 x[row++] += sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3333 x[row++] += sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3334 x[row++] += sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3335 ibdiag += 25; 3336 break; 3337 default: 3338 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3339 } 3340 } 3341 3342 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 3343 } 3344 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3345 3346 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3347 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3348 ibdiag -= sizes[i]*sizes[i]; 3349 sz = ii[row+1] - ii[row]; 3350 v1 = a->a + ii[row]; 3351 idx = a->j + ii[row]; 3352 3353 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3354 switch (sizes[i]) { 3355 case 1: 3356 3357 sum1 = b[row]; 3358 for (n = 0; n<sz-1; n+=2) { 3359 i1 = idx[0]; 3360 i2 = idx[1]; 3361 idx += 2; 3362 tmp0 = x[i1]; 3363 tmp1 = x[i2]; 3364 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3365 } 3366 3367 if (n == sz-1) { 3368 tmp0 = x[*idx]; 3369 sum1 -= *v1*tmp0; 3370 } 3371 x[row--] += sum1*(*ibdiag); 3372 break; 3373 3374 case 2: 3375 3376 sum1 = b[row]; 3377 sum2 = b[row-1]; 3378 /* note that sum1 is associated with the second of the two rows */ 3379 v2 = a->a + ii[row - 1]; 3380 for (n = 0; n<sz-1; n+=2) { 3381 i1 = idx[0]; 3382 i2 = idx[1]; 3383 idx += 2; 3384 tmp0 = x[i1]; 3385 tmp1 = x[i2]; 3386 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3387 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3388 } 3389 3390 if (n == sz-1) { 3391 tmp0 = x[*idx]; 3392 sum1 -= *v1*tmp0; 3393 sum2 -= *v2*tmp0; 3394 } 3395 x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3]; 3396 x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2]; 3397 break; 3398 case 3: 3399 3400 sum1 = b[row]; 3401 sum2 = b[row-1]; 3402 sum3 = b[row-2]; 3403 v2 = a->a + ii[row-1]; 3404 v3 = a->a + ii[row-2]; 3405 for (n = 0; n<sz-1; n+=2) { 3406 i1 = idx[0]; 3407 i2 = idx[1]; 3408 idx += 2; 3409 tmp0 = x[i1]; 3410 tmp1 = x[i2]; 3411 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3412 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3413 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3414 } 3415 3416 if (n == sz-1) { 3417 tmp0 = x[*idx]; 3418 sum1 -= *v1*tmp0; 3419 sum2 -= *v2*tmp0; 3420 sum3 -= *v3*tmp0; 3421 } 3422 x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3423 x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3424 x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3425 break; 3426 case 4: 3427 3428 sum1 = b[row]; 3429 sum2 = b[row-1]; 3430 sum3 = b[row-2]; 3431 sum4 = b[row-3]; 3432 v2 = a->a + ii[row-1]; 3433 v3 = a->a + ii[row-2]; 3434 v4 = a->a + ii[row-3]; 3435 for (n = 0; n<sz-1; n+=2) { 3436 i1 = idx[0]; 3437 i2 = idx[1]; 3438 idx += 2; 3439 tmp0 = x[i1]; 3440 tmp1 = x[i2]; 3441 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3442 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3443 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3444 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3445 } 3446 3447 if (n == sz-1) { 3448 tmp0 = x[*idx]; 3449 sum1 -= *v1*tmp0; 3450 sum2 -= *v2*tmp0; 3451 sum3 -= *v3*tmp0; 3452 sum4 -= *v4*tmp0; 3453 } 3454 x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3455 x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3456 x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3457 x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3458 break; 3459 case 5: 3460 3461 sum1 = b[row]; 3462 sum2 = b[row-1]; 3463 sum3 = b[row-2]; 3464 sum4 = b[row-3]; 3465 sum5 = b[row-4]; 3466 v2 = a->a + ii[row-1]; 3467 v3 = a->a + ii[row-2]; 3468 v4 = a->a + ii[row-3]; 3469 v5 = a->a + ii[row-4]; 3470 for (n = 0; n<sz-1; n+=2) { 3471 i1 = idx[0]; 3472 i2 = idx[1]; 3473 idx += 2; 3474 tmp0 = x[i1]; 3475 tmp1 = x[i2]; 3476 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3477 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3478 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3479 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3480 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3481 } 3482 3483 if (n == sz-1) { 3484 tmp0 = x[*idx]; 3485 sum1 -= *v1*tmp0; 3486 sum2 -= *v2*tmp0; 3487 sum3 -= *v3*tmp0; 3488 sum4 -= *v4*tmp0; 3489 sum5 -= *v5*tmp0; 3490 } 3491 x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3492 x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3493 x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3494 x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3495 x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3496 break; 3497 default: 3498 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3499 } 3500 } 3501 3502 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 3503 } 3504 } 3505 if (flag & SOR_EISENSTAT) { 3506 /* 3507 Apply (U + D)^-1 where D is now the block diagonal 3508 */ 3509 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3510 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3511 ibdiag -= sizes[i]*sizes[i]; 3512 sz = ii[row+1] - diag[row] - 1; 3513 v1 = a->a + diag[row] + 1; 3514 idx = a->j + diag[row] + 1; 3515 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3516 switch (sizes[i]) { 3517 case 1: 3518 3519 sum1 = b[row]; 3520 for (n = 0; n<sz-1; n+=2) { 3521 i1 = idx[0]; 3522 i2 = idx[1]; 3523 idx += 2; 3524 tmp0 = x[i1]; 3525 tmp1 = x[i2]; 3526 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3527 } 3528 3529 if (n == sz-1) { 3530 tmp0 = x[*idx]; 3531 sum1 -= *v1*tmp0; 3532 } 3533 x[row] = sum1*(*ibdiag);row--; 3534 break; 3535 3536 case 2: 3537 3538 sum1 = b[row]; 3539 sum2 = b[row-1]; 3540 /* note that sum1 is associated with the second of the two rows */ 3541 v2 = a->a + diag[row-1] + 2; 3542 for (n = 0; n<sz-1; n+=2) { 3543 i1 = idx[0]; 3544 i2 = idx[1]; 3545 idx += 2; 3546 tmp0 = x[i1]; 3547 tmp1 = x[i2]; 3548 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3549 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3550 } 3551 3552 if (n == sz-1) { 3553 tmp0 = x[*idx]; 3554 sum1 -= *v1*tmp0; 3555 sum2 -= *v2*tmp0; 3556 } 3557 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3558 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3559 row -= 2; 3560 break; 3561 case 3: 3562 3563 sum1 = b[row]; 3564 sum2 = b[row-1]; 3565 sum3 = b[row-2]; 3566 v2 = a->a + diag[row-1] + 2; 3567 v3 = a->a + diag[row-2] + 3; 3568 for (n = 0; n<sz-1; n+=2) { 3569 i1 = idx[0]; 3570 i2 = idx[1]; 3571 idx += 2; 3572 tmp0 = x[i1]; 3573 tmp1 = x[i2]; 3574 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3575 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3576 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3577 } 3578 3579 if (n == sz-1) { 3580 tmp0 = x[*idx]; 3581 sum1 -= *v1*tmp0; 3582 sum2 -= *v2*tmp0; 3583 sum3 -= *v3*tmp0; 3584 } 3585 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3586 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3587 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3588 row -= 3; 3589 break; 3590 case 4: 3591 3592 sum1 = b[row]; 3593 sum2 = b[row-1]; 3594 sum3 = b[row-2]; 3595 sum4 = b[row-3]; 3596 v2 = a->a + diag[row-1] + 2; 3597 v3 = a->a + diag[row-2] + 3; 3598 v4 = a->a + diag[row-3] + 4; 3599 for (n = 0; n<sz-1; n+=2) { 3600 i1 = idx[0]; 3601 i2 = idx[1]; 3602 idx += 2; 3603 tmp0 = x[i1]; 3604 tmp1 = x[i2]; 3605 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3606 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3607 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3608 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3609 } 3610 3611 if (n == sz-1) { 3612 tmp0 = x[*idx]; 3613 sum1 -= *v1*tmp0; 3614 sum2 -= *v2*tmp0; 3615 sum3 -= *v3*tmp0; 3616 sum4 -= *v4*tmp0; 3617 } 3618 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3619 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3620 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3621 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3622 row -= 4; 3623 break; 3624 case 5: 3625 3626 sum1 = b[row]; 3627 sum2 = b[row-1]; 3628 sum3 = b[row-2]; 3629 sum4 = b[row-3]; 3630 sum5 = b[row-4]; 3631 v2 = a->a + diag[row-1] + 2; 3632 v3 = a->a + diag[row-2] + 3; 3633 v4 = a->a + diag[row-3] + 4; 3634 v5 = a->a + diag[row-4] + 5; 3635 for (n = 0; n<sz-1; n+=2) { 3636 i1 = idx[0]; 3637 i2 = idx[1]; 3638 idx += 2; 3639 tmp0 = x[i1]; 3640 tmp1 = x[i2]; 3641 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3642 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3643 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3644 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3645 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3646 } 3647 3648 if (n == sz-1) { 3649 tmp0 = x[*idx]; 3650 sum1 -= *v1*tmp0; 3651 sum2 -= *v2*tmp0; 3652 sum3 -= *v3*tmp0; 3653 sum4 -= *v4*tmp0; 3654 sum5 -= *v5*tmp0; 3655 } 3656 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3657 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3658 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3659 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3660 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3661 row -= 5; 3662 break; 3663 default: 3664 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3665 } 3666 } 3667 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3668 3669 /* 3670 t = b - D x where D is the block diagonal 3671 */ 3672 cnt = 0; 3673 for (i=0, row=0; i<m; i++) { 3674 switch (sizes[i]) { 3675 case 1: 3676 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 3677 break; 3678 case 2: 3679 x1 = x[row]; x2 = x[row+1]; 3680 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3681 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3682 t[row] = b[row] - tmp1; 3683 t[row+1] = b[row+1] - tmp2; row += 2; 3684 cnt += 4; 3685 break; 3686 case 3: 3687 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3688 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3689 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3690 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3691 t[row] = b[row] - tmp1; 3692 t[row+1] = b[row+1] - tmp2; 3693 t[row+2] = b[row+2] - tmp3; row += 3; 3694 cnt += 9; 3695 break; 3696 case 4: 3697 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3698 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3699 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3700 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3701 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3702 t[row] = b[row] - tmp1; 3703 t[row+1] = b[row+1] - tmp2; 3704 t[row+2] = b[row+2] - tmp3; 3705 t[row+3] = b[row+3] - tmp4; row += 4; 3706 cnt += 16; 3707 break; 3708 case 5: 3709 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3710 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3711 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3712 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3713 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3714 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3715 t[row] = b[row] - tmp1; 3716 t[row+1] = b[row+1] - tmp2; 3717 t[row+2] = b[row+2] - tmp3; 3718 t[row+3] = b[row+3] - tmp4; 3719 t[row+4] = b[row+4] - tmp5;row += 5; 3720 cnt += 25; 3721 break; 3722 default: 3723 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3724 } 3725 } 3726 ierr = PetscLogFlops(m);CHKERRQ(ierr); 3727 3728 3729 3730 /* 3731 Apply (L + D)^-1 where D is the block diagonal 3732 */ 3733 for (i=0, row=0; i<m; i++) { 3734 sz = diag[row] - ii[row]; 3735 v1 = a->a + ii[row]; 3736 idx = a->j + ii[row]; 3737 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3738 switch (sizes[i]) { 3739 case 1: 3740 3741 sum1 = t[row]; 3742 for (n = 0; n<sz-1; n+=2) { 3743 i1 = idx[0]; 3744 i2 = idx[1]; 3745 idx += 2; 3746 tmp0 = t[i1]; 3747 tmp1 = t[i2]; 3748 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3749 } 3750 3751 if (n == sz-1) { 3752 tmp0 = t[*idx]; 3753 sum1 -= *v1 * tmp0; 3754 } 3755 x[row] += t[row] = sum1*(*ibdiag++); row++; 3756 break; 3757 case 2: 3758 v2 = a->a + ii[row+1]; 3759 sum1 = t[row]; 3760 sum2 = t[row+1]; 3761 for (n = 0; n<sz-1; n+=2) { 3762 i1 = idx[0]; 3763 i2 = idx[1]; 3764 idx += 2; 3765 tmp0 = t[i1]; 3766 tmp1 = t[i2]; 3767 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3768 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3769 } 3770 3771 if (n == sz-1) { 3772 tmp0 = t[*idx]; 3773 sum1 -= v1[0] * tmp0; 3774 sum2 -= v2[0] * tmp0; 3775 } 3776 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3777 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3778 ibdiag += 4; row += 2; 3779 break; 3780 case 3: 3781 v2 = a->a + ii[row+1]; 3782 v3 = a->a + ii[row+2]; 3783 sum1 = t[row]; 3784 sum2 = t[row+1]; 3785 sum3 = t[row+2]; 3786 for (n = 0; n<sz-1; n+=2) { 3787 i1 = idx[0]; 3788 i2 = idx[1]; 3789 idx += 2; 3790 tmp0 = t[i1]; 3791 tmp1 = t[i2]; 3792 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3793 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3794 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3795 } 3796 3797 if (n == sz-1) { 3798 tmp0 = t[*idx]; 3799 sum1 -= v1[0] * tmp0; 3800 sum2 -= v2[0] * tmp0; 3801 sum3 -= v3[0] * tmp0; 3802 } 3803 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3804 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3805 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3806 ibdiag += 9; row += 3; 3807 break; 3808 case 4: 3809 v2 = a->a + ii[row+1]; 3810 v3 = a->a + ii[row+2]; 3811 v4 = a->a + ii[row+3]; 3812 sum1 = t[row]; 3813 sum2 = t[row+1]; 3814 sum3 = t[row+2]; 3815 sum4 = t[row+3]; 3816 for (n = 0; n<sz-1; n+=2) { 3817 i1 = idx[0]; 3818 i2 = idx[1]; 3819 idx += 2; 3820 tmp0 = t[i1]; 3821 tmp1 = t[i2]; 3822 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3823 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3824 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3825 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3826 } 3827 3828 if (n == sz-1) { 3829 tmp0 = t[*idx]; 3830 sum1 -= v1[0] * tmp0; 3831 sum2 -= v2[0] * tmp0; 3832 sum3 -= v3[0] * tmp0; 3833 sum4 -= v4[0] * tmp0; 3834 } 3835 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3836 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3837 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3838 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3839 ibdiag += 16; row += 4; 3840 break; 3841 case 5: 3842 v2 = a->a + ii[row+1]; 3843 v3 = a->a + ii[row+2]; 3844 v4 = a->a + ii[row+3]; 3845 v5 = a->a + ii[row+4]; 3846 sum1 = t[row]; 3847 sum2 = t[row+1]; 3848 sum3 = t[row+2]; 3849 sum4 = t[row+3]; 3850 sum5 = t[row+4]; 3851 for (n = 0; n<sz-1; n+=2) { 3852 i1 = idx[0]; 3853 i2 = idx[1]; 3854 idx += 2; 3855 tmp0 = t[i1]; 3856 tmp1 = t[i2]; 3857 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3858 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3859 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3860 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3861 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3862 } 3863 3864 if (n == sz-1) { 3865 tmp0 = t[*idx]; 3866 sum1 -= v1[0] * tmp0; 3867 sum2 -= v2[0] * tmp0; 3868 sum3 -= v3[0] * tmp0; 3869 sum4 -= v4[0] * tmp0; 3870 sum5 -= v5[0] * tmp0; 3871 } 3872 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3873 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3874 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3875 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3876 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3877 ibdiag += 25; row += 5; 3878 break; 3879 default: 3880 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3881 } 3882 } 3883 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3884 } 3885 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3886 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3887 PetscFunctionReturn(0); 3888 } 3889 3890 #undef __FUNCT__ 3891 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode" 3892 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 3893 { 3894 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3895 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 3896 const MatScalar *bdiag = a->inode.bdiag; 3897 const PetscScalar *b; 3898 PetscErrorCode ierr; 3899 PetscInt m = a->inode.node_count,cnt = 0,i,row; 3900 const PetscInt *sizes = a->inode.size; 3901 3902 PetscFunctionBegin; 3903 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3904 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3905 cnt = 0; 3906 for (i=0, row=0; i<m; i++) { 3907 switch (sizes[i]) { 3908 case 1: 3909 x[row] = b[row]*bdiag[cnt++];row++; 3910 break; 3911 case 2: 3912 x1 = b[row]; x2 = b[row+1]; 3913 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3914 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3915 x[row++] = tmp1; 3916 x[row++] = tmp2; 3917 cnt += 4; 3918 break; 3919 case 3: 3920 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 3921 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3922 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3923 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3924 x[row++] = tmp1; 3925 x[row++] = tmp2; 3926 x[row++] = tmp3; 3927 cnt += 9; 3928 break; 3929 case 4: 3930 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 3931 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3932 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3933 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3934 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3935 x[row++] = tmp1; 3936 x[row++] = tmp2; 3937 x[row++] = tmp3; 3938 x[row++] = tmp4; 3939 cnt += 16; 3940 break; 3941 case 5: 3942 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 3943 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3944 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3945 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3946 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3947 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3948 x[row++] = tmp1; 3949 x[row++] = tmp2; 3950 x[row++] = tmp3; 3951 x[row++] = tmp4; 3952 x[row++] = tmp5; 3953 cnt += 25; 3954 break; 3955 default: 3956 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3957 } 3958 } 3959 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 3960 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3961 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3962 PetscFunctionReturn(0); 3963 } 3964 3965 /* 3966 samestructure indicates that the matrix has not changed its nonzero structure so we 3967 do not need to recompute the inodes 3968 */ 3969 #undef __FUNCT__ 3970 #define __FUNCT__ "Mat_CheckInode" 3971 PetscErrorCode Mat_CheckInode(Mat A,PetscBool samestructure) 3972 { 3973 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3974 PetscErrorCode ierr; 3975 PetscInt i,j,m,nzx,nzy,*ns,node_count,blk_size; 3976 PetscBool flag; 3977 const PetscInt *idx,*idy,*ii; 3978 3979 PetscFunctionBegin; 3980 if (!a->inode.use) PetscFunctionReturn(0); 3981 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 3982 3983 3984 m = A->rmap->n; 3985 if (a->inode.size) ns = a->inode.size; 3986 else { 3987 ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr); 3988 } 3989 3990 i = 0; 3991 node_count = 0; 3992 idx = a->j; 3993 ii = a->i; 3994 while (i < m) { /* For each row */ 3995 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 3996 /* Limits the number of elements in a node to 'a->inode.limit' */ 3997 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 3998 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 3999 if (nzy != nzx) break; 4000 idy += nzx; /* Same nonzero pattern */ 4001 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 4002 if (!flag) break; 4003 } 4004 ns[node_count++] = blk_size; 4005 idx += blk_size*nzx; 4006 i = j; 4007 } 4008 /* If not enough inodes found,, do not use inode version of the routines */ 4009 if (!m || node_count > .8*m) { 4010 ierr = PetscFree(ns);CHKERRQ(ierr); 4011 4012 a->inode.node_count = 0; 4013 a->inode.size = NULL; 4014 a->inode.use = PETSC_FALSE; 4015 A->ops->mult = MatMult_SeqAIJ; 4016 A->ops->sor = MatSOR_SeqAIJ; 4017 A->ops->multadd = MatMultAdd_SeqAIJ; 4018 A->ops->getrowij = MatGetRowIJ_SeqAIJ; 4019 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 4020 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 4021 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 4022 A->ops->coloringpatch = 0; 4023 A->ops->multdiagonalblock = 0; 4024 4025 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4026 } else { 4027 if (!A->factortype) { 4028 A->ops->mult = MatMult_SeqAIJ_Inode; 4029 A->ops->sor = MatSOR_SeqAIJ_Inode; 4030 A->ops->multadd = MatMultAdd_SeqAIJ_Inode; 4031 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4032 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4033 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4034 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4035 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4036 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4037 } else { 4038 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4039 } 4040 a->inode.node_count = node_count; 4041 a->inode.size = ns; 4042 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4043 } 4044 a->inode.checked = PETSC_TRUE; 4045 PetscFunctionReturn(0); 4046 } 4047 4048 #undef __FUNCT__ 4049 #define __FUNCT__ "MatDuplicate_SeqAIJ_Inode" 4050 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C) 4051 { 4052 Mat B =*C; 4053 Mat_SeqAIJ *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data; 4054 PetscErrorCode ierr; 4055 PetscInt m=A->rmap->n; 4056 4057 PetscFunctionBegin; 4058 c->inode.use = a->inode.use; 4059 c->inode.limit = a->inode.limit; 4060 c->inode.max_limit = a->inode.max_limit; 4061 if (a->inode.size) { 4062 ierr = PetscMalloc((m+1)*sizeof(PetscInt),&c->inode.size);CHKERRQ(ierr); 4063 c->inode.node_count = a->inode.node_count; 4064 ierr = PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));CHKERRQ(ierr); 4065 /* note the table of functions below should match that in Mat_CheckInode() */ 4066 if (!B->factortype) { 4067 B->ops->mult = MatMult_SeqAIJ_Inode; 4068 B->ops->sor = MatSOR_SeqAIJ_Inode; 4069 B->ops->multadd = MatMultAdd_SeqAIJ_Inode; 4070 B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4071 B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4072 B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4073 B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4074 B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4075 B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4076 } else { 4077 B->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4078 } 4079 } else { 4080 c->inode.size = 0; 4081 c->inode.node_count = 0; 4082 } 4083 c->inode.ibdiagvalid = PETSC_FALSE; 4084 c->inode.ibdiag = 0; 4085 c->inode.bdiag = 0; 4086 PetscFunctionReturn(0); 4087 } 4088 4089 #undef __FUNCT__ 4090 #define __FUNCT__ "MatGetRow_FactoredLU" 4091 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row) 4092 { 4093 PetscInt k; 4094 const PetscInt *vi; 4095 4096 PetscFunctionBegin; 4097 vi = aj + ai[row]; 4098 for (k=0; k<nzl; k++) cols[k] = vi[k]; 4099 vi = aj + adiag[row]; 4100 cols[nzl] = vi[0]; 4101 vi = aj + adiag[row+1]+1; 4102 for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k]; 4103 PetscFunctionReturn(0); 4104 } 4105 /* 4106 Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix. 4107 Modified from Mat_CheckInode(). 4108 4109 Input Parameters: 4110 + Mat A - ILU or LU matrix factor 4111 - samestructure - TRUE indicates that the matrix has not changed its nonzero structure so we 4112 do not need to recompute the inodes 4113 */ 4114 #undef __FUNCT__ 4115 #define __FUNCT__ "Mat_CheckInode_FactorLU" 4116 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscBool samestructure) 4117 { 4118 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4119 PetscErrorCode ierr; 4120 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; 4121 PetscInt *cols1,*cols2,*ns; 4122 const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag; 4123 PetscBool flag; 4124 4125 PetscFunctionBegin; 4126 if (!a->inode.use) PetscFunctionReturn(0); 4127 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 4128 4129 m = A->rmap->n; 4130 if (a->inode.size) ns = a->inode.size; 4131 else { 4132 ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr); 4133 } 4134 4135 i = 0; 4136 node_count = 0; 4137 ierr = PetscMalloc2(m,PetscInt,&cols1,m,PetscInt,&cols2);CHKERRQ(ierr); 4138 while (i < m) { /* For each row */ 4139 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ 4140 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ 4141 nzx = nzl1 + nzu1 + 1; 4142 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); 4143 4144 /* Limits the number of elements in a node to 'a->inode.limit' */ 4145 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 4146 nzl2 = ai[j+1] - ai[j]; 4147 nzu2 = adiag[j] - adiag[j+1] - 1; 4148 nzy = nzl2 + nzu2 + 1; 4149 if (nzy != nzx) break; 4150 ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr); 4151 ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 4152 if (!flag) break; 4153 } 4154 ns[node_count++] = blk_size; 4155 i = j; 4156 } 4157 ierr = PetscFree2(cols1,cols2);CHKERRQ(ierr); 4158 /* If not enough inodes found,, do not use inode version of the routines */ 4159 if (!m || node_count > .8*m) { 4160 ierr = PetscFree(ns);CHKERRQ(ierr); 4161 4162 a->inode.node_count = 0; 4163 a->inode.size = NULL; 4164 a->inode.use = PETSC_FALSE; 4165 4166 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4167 } else { 4168 A->ops->mult = 0; 4169 A->ops->sor = 0; 4170 A->ops->multadd = 0; 4171 A->ops->getrowij = 0; 4172 A->ops->restorerowij = 0; 4173 A->ops->getcolumnij = 0; 4174 A->ops->restorecolumnij = 0; 4175 A->ops->coloringpatch = 0; 4176 A->ops->multdiagonalblock = 0; 4177 A->ops->solve = MatSolve_SeqAIJ_Inode; 4178 a->inode.node_count = node_count; 4179 a->inode.size = ns; 4180 4181 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4182 } 4183 a->inode.checked = PETSC_TRUE; 4184 PetscFunctionReturn(0); 4185 } 4186 4187 #undef __FUNCT__ 4188 #define __FUNCT__ "MatSeqAIJInvalidateDiagonal_Inode" 4189 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A) 4190 { 4191 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4192 4193 PetscFunctionBegin; 4194 a->inode.ibdiagvalid = PETSC_FALSE; 4195 PetscFunctionReturn(0); 4196 } 4197 4198 /* 4199 This is really ugly. if inodes are used this replaces the 4200 permutations with ones that correspond to rows/cols of the matrix 4201 rather then inode blocks 4202 */ 4203 #undef __FUNCT__ 4204 #define __FUNCT__ "MatInodeAdjustForInodes" 4205 PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 4206 { 4207 PetscErrorCode ierr; 4208 4209 PetscFunctionBegin; 4210 ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr); 4211 PetscFunctionReturn(0); 4212 } 4213 4214 #undef __FUNCT__ 4215 #define __FUNCT__ "MatInodeAdjustForInodes_SeqAIJ_Inode" 4216 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 4217 { 4218 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4219 PetscErrorCode ierr; 4220 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 4221 const PetscInt *ridx,*cidx; 4222 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 4223 PetscInt nslim_col,*ns_col; 4224 IS ris = *rperm,cis = *cperm; 4225 4226 PetscFunctionBegin; 4227 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 4228 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 4229 4230 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 4231 ierr = PetscMalloc((((nslim_row>nslim_col) ? nslim_row : nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 4232 ierr = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr); 4233 4234 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 4235 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 4236 4237 /* Form the inode structure for the rows of permuted matric using inv perm*/ 4238 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 4239 4240 /* Construct the permutations for rows*/ 4241 for (i=0,row = 0; i<nslim_row; ++i) { 4242 indx = ridx[i]; 4243 start_val = tns[indx]; 4244 end_val = tns[indx + 1]; 4245 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 4246 } 4247 4248 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 4249 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 4250 4251 /* Construct permutations for columns */ 4252 for (i=0,col=0; i<nslim_col; ++i) { 4253 indx = cidx[i]; 4254 start_val = tns[indx]; 4255 end_val = tns[indx + 1]; 4256 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 4257 } 4258 4259 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr); 4260 ierr = ISSetPermutation(*rperm);CHKERRQ(ierr); 4261 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr); 4262 ierr = ISSetPermutation(*cperm);CHKERRQ(ierr); 4263 4264 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 4265 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 4266 4267 ierr = PetscFree(ns_col);CHKERRQ(ierr); 4268 ierr = PetscFree2(permr,permc);CHKERRQ(ierr); 4269 ierr = ISDestroy(&cis);CHKERRQ(ierr); 4270 ierr = ISDestroy(&ris);CHKERRQ(ierr); 4271 ierr = PetscFree(tns);CHKERRQ(ierr); 4272 PetscFunctionReturn(0); 4273 } 4274 4275 #undef __FUNCT__ 4276 #define __FUNCT__ "MatInodeGetInodeSizes" 4277 /*@C 4278 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 4279 4280 Not Collective 4281 4282 Input Parameter: 4283 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 4284 4285 Output Parameter: 4286 + node_count - no of inodes present in the matrix. 4287 . sizes - an array of size node_count,with sizes of each inode. 4288 - limit - the max size used to generate the inodes. 4289 4290 Level: advanced 4291 4292 Notes: This routine returns some internal storage information 4293 of the matrix, it is intended to be used by advanced users. 4294 It should be called after the matrix is assembled. 4295 The contents of the sizes[] array should not be changed. 4296 NULL may be passed for information not requested. 4297 4298 .keywords: matrix, seqaij, get, inode 4299 4300 .seealso: MatGetInfo() 4301 @*/ 4302 PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4303 { 4304 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 4305 4306 PetscFunctionBegin; 4307 if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 4308 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);CHKERRQ(ierr); 4309 if (f) { 4310 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 4311 } 4312 PetscFunctionReturn(0); 4313 } 4314 4315 #undef __FUNCT__ 4316 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode" 4317 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4318 { 4319 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4320 4321 PetscFunctionBegin; 4322 if (node_count) *node_count = a->inode.node_count; 4323 if (sizes) *sizes = a->inode.size; 4324 if (limit) *limit = a->inode.limit; 4325 PetscFunctionReturn(0); 4326 } 4327