1 2 /* 3 This file provides high performance routines for the Inode format (compressed sparse row) 4 by taking advantage of rows with identical nonzero structure (I-nodes). 5 */ 6 #include <../src/mat/impls/aij/seq/aij.h> 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "Mat_CreateColInode" 10 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt *size,PetscInt **ns) 11 { 12 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 13 PetscErrorCode ierr; 14 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 15 16 PetscFunctionBegin; 17 n = A->cmap->n; 18 m = A->rmap->n; 19 ns_row = a->inode.size; 20 21 min_mn = (m < n) ? m : n; 22 if (!ns) { 23 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ; 24 for (; count+1 < n; count++,i++) ; 25 if (count < n) { 26 i++; 27 } 28 *size = i; 29 PetscFunctionReturn(0); 30 } 31 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr); 32 33 /* Use the same row structure wherever feasible. */ 34 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 35 ns_col[i] = ns_row[i]; 36 } 37 38 /* if m < n; pad up the remainder with inode_limit */ 39 for (; count+1 < n; count++,i++) { 40 ns_col[i] = 1; 41 } 42 /* The last node is the odd ball. padd it up with the remaining rows; */ 43 if (count < n) { 44 ns_col[i] = n - count; 45 i++; 46 } else if (count > n) { 47 /* Adjust for the over estimation */ 48 ns_col[i-1] += n - count; 49 } 50 *size = i; 51 *ns = ns_col; 52 PetscFunctionReturn(0); 53 } 54 55 56 /* 57 This builds symmetric version of nonzero structure, 58 */ 59 #undef __FUNCT__ 60 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric" 61 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 62 { 63 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 64 PetscErrorCode ierr; 65 PetscInt *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n; 66 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2; 67 const PetscInt *j,*jmax,*ai= a->i,*aj = a->j; 68 69 PetscFunctionBegin; 70 nslim_row = a->inode.node_count; 71 m = A->rmap->n; 72 n = A->cmap->n; 73 if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 74 75 /* Use the row_inode as column_inode */ 76 nslim_col = nslim_row; 77 ns_col = ns_row; 78 79 /* allocate space for reformated inode structure */ 80 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 81 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 82 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 83 84 for (i1=0,col=0; i1<nslim_col; ++i1) { 85 nsz = ns_col[i1]; 86 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 87 } 88 /* allocate space for row pointers */ 89 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 90 *iia = ia; 91 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 92 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 93 94 /* determine the number of columns in each row */ 95 ia[0] = oshift; 96 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 97 98 j = aj + ai[row] + ishift; 99 jmax = aj + ai[row+1] + ishift; 100 col = *j++ + ishift; 101 i2 = tvc[col]; 102 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 103 ia[i1+1]++; 104 ia[i2+1]++; 105 i2++; /* Start col of next node */ 106 while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j; 107 i2 = tvc[col]; 108 } 109 if (i2 == i1) ia[i2+1]++; /* now the diagonal element */ 110 } 111 112 /* shift ia[i] to point to next row */ 113 for (i1=1; i1<nslim_row+1; i1++) { 114 row = ia[i1-1]; 115 ia[i1] += row; 116 work[i1-1] = row - oshift; 117 } 118 119 /* allocate space for column pointers */ 120 nz = ia[nslim_row] + (!ishift); 121 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 122 *jja = ja; 123 124 /* loop over lower triangular part putting into ja */ 125 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 126 j = aj + ai[row] + ishift; 127 jmax = aj + ai[row+1] + ishift; 128 col = *j++ + ishift; 129 i2 = tvc[col]; 130 while (i2<i1 && j<jmax) { 131 ja[work[i2]++] = i1 + oshift; 132 ja[work[i1]++] = i2 + oshift; 133 ++i2; 134 while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */ 135 i2 = tvc[col]; 136 } 137 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 138 139 } 140 ierr = PetscFree(work);CHKERRQ(ierr); 141 ierr = PetscFree(tns);CHKERRQ(ierr); 142 ierr = PetscFree(tvc);CHKERRQ(ierr); 143 PetscFunctionReturn(0); 144 } 145 146 /* 147 This builds nonsymmetric version of nonzero structure, 148 */ 149 #undef __FUNCT__ 150 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric" 151 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 152 { 153 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 154 PetscErrorCode ierr; 155 PetscInt *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col; 156 PetscInt *tns,*tvc,nsz,i1,i2; 157 const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size; 158 159 PetscFunctionBegin; 160 nslim_row = a->inode.node_count; 161 n = A->cmap->n; 162 163 /* Create The column_inode for this matrix */ 164 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 165 166 /* allocate space for reformated column_inode structure */ 167 ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 168 ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 169 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 170 171 for (i1=0,col=0; i1<nslim_col; ++i1) { 172 nsz = ns_col[i1]; 173 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 174 } 175 /* allocate space for row pointers */ 176 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 177 *iia = ia; 178 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 179 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 180 181 /* determine the number of columns in each row */ 182 ia[0] = oshift; 183 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 184 j = aj + ai[row] + ishift; 185 col = *j++ + ishift; 186 i2 = tvc[col]; 187 nz = ai[row+1] - ai[row]; 188 while (nz-- > 0) { /* off-diagonal elemets */ 189 ia[i1+1]++; 190 i2++; /* Start col of next node */ 191 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 192 if (nz > 0) i2 = tvc[col]; 193 } 194 } 195 196 /* shift ia[i] to point to next row */ 197 for (i1=1; i1<nslim_row+1; i1++) { 198 row = ia[i1-1]; 199 ia[i1] += row; 200 work[i1-1] = row - oshift; 201 } 202 203 /* allocate space for column pointers */ 204 nz = ia[nslim_row] + (!ishift); 205 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 206 *jja = ja; 207 208 /* loop over matrix putting into ja */ 209 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 210 j = aj + ai[row] + ishift; 211 col = *j++ + ishift; 212 i2 = tvc[col]; 213 nz = ai[row+1] - ai[row]; 214 while (nz-- > 0) { 215 ja[work[i1]++] = i2 + oshift; 216 ++i2; 217 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 218 if (nz > 0) i2 = tvc[col]; 219 } 220 } 221 ierr = PetscFree(ns_col);CHKERRQ(ierr); 222 ierr = PetscFree(work);CHKERRQ(ierr); 223 ierr = PetscFree(tns);CHKERRQ(ierr); 224 ierr = PetscFree(tvc);CHKERRQ(ierr); 225 PetscFunctionReturn(0); 226 } 227 228 #undef __FUNCT__ 229 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode" 230 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 231 { 232 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 233 PetscErrorCode ierr; 234 235 PetscFunctionBegin; 236 *n = a->inode.node_count; 237 if (!ia) PetscFunctionReturn(0); 238 if (!blockcompressed) { 239 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 240 } else if (symmetric) { 241 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 242 } else { 243 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 244 } 245 PetscFunctionReturn(0); 246 } 247 248 #undef __FUNCT__ 249 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode" 250 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 251 { 252 PetscErrorCode ierr; 253 254 PetscFunctionBegin; 255 if (!ia) PetscFunctionReturn(0); 256 257 if (!blockcompressed) { 258 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 259 } else { 260 ierr = PetscFree(*ia);CHKERRQ(ierr); 261 ierr = PetscFree(*ja);CHKERRQ(ierr); 262 } 263 PetscFunctionReturn(0); 264 } 265 266 /* ----------------------------------------------------------- */ 267 268 #undef __FUNCT__ 269 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric" 270 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 271 { 272 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 273 PetscErrorCode ierr; 274 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 275 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 276 277 PetscFunctionBegin; 278 nslim_row = a->inode.node_count; 279 n = A->cmap->n; 280 281 /* Create The column_inode for this matrix */ 282 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 283 284 /* allocate space for reformated column_inode structure */ 285 ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 286 ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 287 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 288 289 for (i1=0,col=0; i1<nslim_col; ++i1) { 290 nsz = ns_col[i1]; 291 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 292 } 293 /* allocate space for column pointers */ 294 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 295 *iia = ia; 296 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 297 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 298 299 /* determine the number of columns in each row */ 300 ia[0] = oshift; 301 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 302 j = aj + ai[row] + ishift; 303 col = *j++ + ishift; 304 i2 = tvc[col]; 305 nz = ai[row+1] - ai[row]; 306 while (nz-- > 0) { /* off-diagonal elemets */ 307 /* ia[i1+1]++; */ 308 ia[i2+1]++; 309 i2++; 310 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 311 if (nz > 0) i2 = tvc[col]; 312 } 313 } 314 315 /* shift ia[i] to point to next col */ 316 for (i1=1; i1<nslim_col+1; i1++) { 317 col = ia[i1-1]; 318 ia[i1] += col; 319 work[i1-1] = col - oshift; 320 } 321 322 /* allocate space for column pointers */ 323 nz = ia[nslim_col] + (!ishift); 324 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 325 *jja = ja; 326 327 /* loop over matrix putting into ja */ 328 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 329 j = aj + ai[row] + ishift; 330 col = *j++ + ishift; 331 i2 = tvc[col]; 332 nz = ai[row+1] - ai[row]; 333 while (nz-- > 0) { 334 /* ja[work[i1]++] = i2 + oshift; */ 335 ja[work[i2]++] = i1 + oshift; 336 i2++; 337 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 338 if (nz > 0) i2 = tvc[col]; 339 } 340 } 341 ierr = PetscFree(ns_col);CHKERRQ(ierr); 342 ierr = PetscFree(work);CHKERRQ(ierr); 343 ierr = PetscFree(tns);CHKERRQ(ierr); 344 ierr = PetscFree(tvc);CHKERRQ(ierr); 345 PetscFunctionReturn(0); 346 } 347 348 #undef __FUNCT__ 349 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode" 350 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 351 { 352 PetscErrorCode ierr; 353 354 PetscFunctionBegin; 355 ierr = Mat_CreateColInode(A,n,NULL);CHKERRQ(ierr); 356 if (!ia) PetscFunctionReturn(0); 357 358 if (!blockcompressed) { 359 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 360 } else if (symmetric) { 361 /* Since the indices are symmetric it does'nt matter */ 362 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 363 } else { 364 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 365 } 366 PetscFunctionReturn(0); 367 } 368 369 #undef __FUNCT__ 370 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode" 371 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 372 { 373 PetscErrorCode ierr; 374 375 PetscFunctionBegin; 376 if (!ia) PetscFunctionReturn(0); 377 if (!blockcompressed) { 378 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 379 } else { 380 ierr = PetscFree(*ia);CHKERRQ(ierr); 381 ierr = PetscFree(*ja);CHKERRQ(ierr); 382 } 383 PetscFunctionReturn(0); 384 } 385 386 /* ----------------------------------------------------------- */ 387 388 #undef __FUNCT__ 389 #define __FUNCT__ "MatMult_SeqAIJ_Inode" 390 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 391 { 392 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 393 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 394 PetscScalar *y; 395 const PetscScalar *x; 396 const MatScalar *v1,*v2,*v3,*v4,*v5; 397 PetscErrorCode ierr; 398 PetscInt i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0; 399 const PetscInt *idx,*ns,*ii; 400 401 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 402 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 403 #endif 404 405 PetscFunctionBegin; 406 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 407 node_max = a->inode.node_count; 408 ns = a->inode.size; /* Node Size array */ 409 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 410 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 411 idx = a->j; 412 v1 = a->a; 413 ii = a->i; 414 415 for (i = 0,row = 0; i< node_max; ++i) { 416 nsz = ns[i]; 417 n = ii[1] - ii[0]; 418 nonzerorow += (n>0)*nsz; 419 ii += nsz; 420 PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ 421 PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ 422 sz = n; /* No of non zeros in this row */ 423 /* Switch on the size of Node */ 424 switch (nsz) { /* Each loop in 'case' is unrolled */ 425 case 1: 426 sum1 = 0.; 427 428 for (n = 0; n< sz-1; n+=2) { 429 i1 = idx[0]; /* The instructions are ordered to */ 430 i2 = idx[1]; /* make the compiler's job easy */ 431 idx += 2; 432 tmp0 = x[i1]; 433 tmp1 = x[i2]; 434 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 435 } 436 437 if (n == sz-1) { /* Take care of the last nonzero */ 438 tmp0 = x[*idx++]; 439 sum1 += *v1++ *tmp0; 440 } 441 y[row++]=sum1; 442 break; 443 case 2: 444 sum1 = 0.; 445 sum2 = 0.; 446 v2 = v1 + n; 447 448 for (n = 0; n< sz-1; n+=2) { 449 i1 = idx[0]; 450 i2 = idx[1]; 451 idx += 2; 452 tmp0 = x[i1]; 453 tmp1 = x[i2]; 454 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 455 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 456 } 457 if (n == sz-1) { 458 tmp0 = x[*idx++]; 459 sum1 += *v1++ * tmp0; 460 sum2 += *v2++ * tmp0; 461 } 462 y[row++]=sum1; 463 y[row++]=sum2; 464 v1 =v2; /* Since the next block to be processed starts there*/ 465 idx +=sz; 466 break; 467 case 3: 468 sum1 = 0.; 469 sum2 = 0.; 470 sum3 = 0.; 471 v2 = v1 + n; 472 v3 = v2 + n; 473 474 for (n = 0; n< sz-1; n+=2) { 475 i1 = idx[0]; 476 i2 = idx[1]; 477 idx += 2; 478 tmp0 = x[i1]; 479 tmp1 = x[i2]; 480 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 481 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 482 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 483 } 484 if (n == sz-1) { 485 tmp0 = x[*idx++]; 486 sum1 += *v1++ * tmp0; 487 sum2 += *v2++ * tmp0; 488 sum3 += *v3++ * tmp0; 489 } 490 y[row++]=sum1; 491 y[row++]=sum2; 492 y[row++]=sum3; 493 v1 =v3; /* Since the next block to be processed starts there*/ 494 idx +=2*sz; 495 break; 496 case 4: 497 sum1 = 0.; 498 sum2 = 0.; 499 sum3 = 0.; 500 sum4 = 0.; 501 v2 = v1 + n; 502 v3 = v2 + n; 503 v4 = v3 + n; 504 505 for (n = 0; n< sz-1; n+=2) { 506 i1 = idx[0]; 507 i2 = idx[1]; 508 idx += 2; 509 tmp0 = x[i1]; 510 tmp1 = x[i2]; 511 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 512 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 513 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 514 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 515 } 516 if (n == sz-1) { 517 tmp0 = x[*idx++]; 518 sum1 += *v1++ * tmp0; 519 sum2 += *v2++ * tmp0; 520 sum3 += *v3++ * tmp0; 521 sum4 += *v4++ * tmp0; 522 } 523 y[row++]=sum1; 524 y[row++]=sum2; 525 y[row++]=sum3; 526 y[row++]=sum4; 527 v1 =v4; /* Since the next block to be processed starts there*/ 528 idx +=3*sz; 529 break; 530 case 5: 531 sum1 = 0.; 532 sum2 = 0.; 533 sum3 = 0.; 534 sum4 = 0.; 535 sum5 = 0.; 536 v2 = v1 + n; 537 v3 = v2 + n; 538 v4 = v3 + n; 539 v5 = v4 + n; 540 541 for (n = 0; n<sz-1; n+=2) { 542 i1 = idx[0]; 543 i2 = idx[1]; 544 idx += 2; 545 tmp0 = x[i1]; 546 tmp1 = x[i2]; 547 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 548 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 549 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 550 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 551 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 552 } 553 if (n == sz-1) { 554 tmp0 = x[*idx++]; 555 sum1 += *v1++ * tmp0; 556 sum2 += *v2++ * tmp0; 557 sum3 += *v3++ * tmp0; 558 sum4 += *v4++ * tmp0; 559 sum5 += *v5++ * tmp0; 560 } 561 y[row++]=sum1; 562 y[row++]=sum2; 563 y[row++]=sum3; 564 y[row++]=sum4; 565 y[row++]=sum5; 566 v1 =v5; /* Since the next block to be processed starts there */ 567 idx +=4*sz; 568 break; 569 default: 570 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 571 } 572 } 573 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 574 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 575 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 576 PetscFunctionReturn(0); 577 } 578 /* ----------------------------------------------------------- */ 579 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 580 #undef __FUNCT__ 581 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode" 582 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 583 { 584 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 585 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 586 const MatScalar *v1,*v2,*v3,*v4,*v5; 587 const PetscScalar *x; 588 PetscScalar *y,*z,*zt; 589 PetscErrorCode ierr; 590 PetscInt i1,i2,n,i,row,node_max,nsz,sz; 591 const PetscInt *idx,*ns,*ii; 592 593 PetscFunctionBegin; 594 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 595 node_max = a->inode.node_count; 596 ns = a->inode.size; /* Node Size array */ 597 598 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 599 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 600 if (zz != yy) { 601 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 602 } else { 603 z = y; 604 } 605 zt = z; 606 607 idx = a->j; 608 v1 = a->a; 609 ii = a->i; 610 611 for (i = 0,row = 0; i< node_max; ++i) { 612 nsz = ns[i]; 613 n = ii[1] - ii[0]; 614 ii += nsz; 615 sz = n; /* No of non zeros in this row */ 616 /* Switch on the size of Node */ 617 switch (nsz) { /* Each loop in 'case' is unrolled */ 618 case 1: 619 sum1 = *zt++; 620 621 for (n = 0; n< sz-1; n+=2) { 622 i1 = idx[0]; /* The instructions are ordered to */ 623 i2 = idx[1]; /* make the compiler's job easy */ 624 idx += 2; 625 tmp0 = x[i1]; 626 tmp1 = x[i2]; 627 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 628 } 629 630 if (n == sz-1) { /* Take care of the last nonzero */ 631 tmp0 = x[*idx++]; 632 sum1 += *v1++ * tmp0; 633 } 634 y[row++]=sum1; 635 break; 636 case 2: 637 sum1 = *zt++; 638 sum2 = *zt++; 639 v2 = v1 + n; 640 641 for (n = 0; n< sz-1; n+=2) { 642 i1 = idx[0]; 643 i2 = idx[1]; 644 idx += 2; 645 tmp0 = x[i1]; 646 tmp1 = x[i2]; 647 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 648 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 649 } 650 if (n == sz-1) { 651 tmp0 = x[*idx++]; 652 sum1 += *v1++ * tmp0; 653 sum2 += *v2++ * tmp0; 654 } 655 y[row++]=sum1; 656 y[row++]=sum2; 657 v1 =v2; /* Since the next block to be processed starts there*/ 658 idx +=sz; 659 break; 660 case 3: 661 sum1 = *zt++; 662 sum2 = *zt++; 663 sum3 = *zt++; 664 v2 = v1 + n; 665 v3 = v2 + n; 666 667 for (n = 0; n< sz-1; n+=2) { 668 i1 = idx[0]; 669 i2 = idx[1]; 670 idx += 2; 671 tmp0 = x[i1]; 672 tmp1 = x[i2]; 673 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 674 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 675 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 676 } 677 if (n == sz-1) { 678 tmp0 = x[*idx++]; 679 sum1 += *v1++ * tmp0; 680 sum2 += *v2++ * tmp0; 681 sum3 += *v3++ * tmp0; 682 } 683 y[row++]=sum1; 684 y[row++]=sum2; 685 y[row++]=sum3; 686 v1 =v3; /* Since the next block to be processed starts there*/ 687 idx +=2*sz; 688 break; 689 case 4: 690 sum1 = *zt++; 691 sum2 = *zt++; 692 sum3 = *zt++; 693 sum4 = *zt++; 694 v2 = v1 + n; 695 v3 = v2 + n; 696 v4 = v3 + n; 697 698 for (n = 0; n< sz-1; n+=2) { 699 i1 = idx[0]; 700 i2 = idx[1]; 701 idx += 2; 702 tmp0 = x[i1]; 703 tmp1 = x[i2]; 704 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 705 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 706 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 707 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 708 } 709 if (n == sz-1) { 710 tmp0 = x[*idx++]; 711 sum1 += *v1++ * tmp0; 712 sum2 += *v2++ * tmp0; 713 sum3 += *v3++ * tmp0; 714 sum4 += *v4++ * tmp0; 715 } 716 y[row++]=sum1; 717 y[row++]=sum2; 718 y[row++]=sum3; 719 y[row++]=sum4; 720 v1 =v4; /* Since the next block to be processed starts there*/ 721 idx +=3*sz; 722 break; 723 case 5: 724 sum1 = *zt++; 725 sum2 = *zt++; 726 sum3 = *zt++; 727 sum4 = *zt++; 728 sum5 = *zt++; 729 v2 = v1 + n; 730 v3 = v2 + n; 731 v4 = v3 + n; 732 v5 = v4 + n; 733 734 for (n = 0; n<sz-1; n+=2) { 735 i1 = idx[0]; 736 i2 = idx[1]; 737 idx += 2; 738 tmp0 = x[i1]; 739 tmp1 = x[i2]; 740 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 741 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 742 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 743 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 744 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 745 } 746 if (n == sz-1) { 747 tmp0 = x[*idx++]; 748 sum1 += *v1++ * tmp0; 749 sum2 += *v2++ * tmp0; 750 sum3 += *v3++ * tmp0; 751 sum4 += *v4++ * tmp0; 752 sum5 += *v5++ * tmp0; 753 } 754 y[row++]=sum1; 755 y[row++]=sum2; 756 y[row++]=sum3; 757 y[row++]=sum4; 758 y[row++]=sum5; 759 v1 =v5; /* Since the next block to be processed starts there */ 760 idx +=4*sz; 761 break; 762 default: 763 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 764 } 765 } 766 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 767 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 768 if (zz != yy) { 769 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 770 } 771 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 772 PetscFunctionReturn(0); 773 } 774 775 /* ----------------------------------------------------------- */ 776 #undef __FUNCT__ 777 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace" 778 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx) 779 { 780 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 781 IS iscol = a->col,isrow = a->row; 782 PetscErrorCode ierr; 783 const PetscInt *r,*c,*rout,*cout; 784 PetscInt i,j,n = A->rmap->n,nz; 785 PetscInt node_max,*ns,row,nsz,aii,i0,i1; 786 const PetscInt *ai = a->i,*a_j = a->j,*vi,*ad,*aj; 787 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 788 PetscScalar sum1,sum2,sum3,sum4,sum5; 789 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 790 const PetscScalar *b; 791 792 PetscFunctionBegin; 793 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 794 node_max = a->inode.node_count; 795 ns = a->inode.size; /* Node Size array */ 796 797 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 798 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 799 tmp = a->solve_work; 800 801 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 802 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 803 804 /* forward solve the lower triangular */ 805 tmps = tmp; 806 aa = a_a; 807 aj = a_j; 808 ad = a->diag; 809 810 for (i = 0,row = 0; i< node_max; ++i) { 811 nsz = ns[i]; 812 aii = ai[row]; 813 v1 = aa + aii; 814 vi = aj + aii; 815 nz = ad[row]- aii; 816 if (i < node_max-1) { 817 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 818 * but our indexing to determine it's size could. */ 819 PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 820 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 821 PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 822 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 823 } 824 825 switch (nsz) { /* Each loop in 'case' is unrolled */ 826 case 1: 827 sum1 = b[*r++]; 828 for (j=0; j<nz-1; j+=2) { 829 i0 = vi[0]; 830 i1 = vi[1]; 831 vi +=2; 832 tmp0 = tmps[i0]; 833 tmp1 = tmps[i1]; 834 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 835 } 836 if (j == nz-1) { 837 tmp0 = tmps[*vi++]; 838 sum1 -= *v1++ *tmp0; 839 } 840 tmp[row++]=sum1; 841 break; 842 case 2: 843 sum1 = b[*r++]; 844 sum2 = b[*r++]; 845 v2 = aa + ai[row+1]; 846 847 for (j=0; j<nz-1; j+=2) { 848 i0 = vi[0]; 849 i1 = vi[1]; 850 vi +=2; 851 tmp0 = tmps[i0]; 852 tmp1 = tmps[i1]; 853 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 854 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 855 } 856 if (j == nz-1) { 857 tmp0 = tmps[*vi++]; 858 sum1 -= *v1++ *tmp0; 859 sum2 -= *v2++ *tmp0; 860 } 861 sum2 -= *v2++ *sum1; 862 tmp[row++]=sum1; 863 tmp[row++]=sum2; 864 break; 865 case 3: 866 sum1 = b[*r++]; 867 sum2 = b[*r++]; 868 sum3 = b[*r++]; 869 v2 = aa + ai[row+1]; 870 v3 = aa + ai[row+2]; 871 872 for (j=0; j<nz-1; j+=2) { 873 i0 = vi[0]; 874 i1 = vi[1]; 875 vi +=2; 876 tmp0 = tmps[i0]; 877 tmp1 = tmps[i1]; 878 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 879 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 880 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 881 } 882 if (j == nz-1) { 883 tmp0 = tmps[*vi++]; 884 sum1 -= *v1++ *tmp0; 885 sum2 -= *v2++ *tmp0; 886 sum3 -= *v3++ *tmp0; 887 } 888 sum2 -= *v2++ * sum1; 889 sum3 -= *v3++ * sum1; 890 sum3 -= *v3++ * sum2; 891 892 tmp[row++]=sum1; 893 tmp[row++]=sum2; 894 tmp[row++]=sum3; 895 break; 896 897 case 4: 898 sum1 = b[*r++]; 899 sum2 = b[*r++]; 900 sum3 = b[*r++]; 901 sum4 = b[*r++]; 902 v2 = aa + ai[row+1]; 903 v3 = aa + ai[row+2]; 904 v4 = aa + ai[row+3]; 905 906 for (j=0; j<nz-1; j+=2) { 907 i0 = vi[0]; 908 i1 = vi[1]; 909 vi +=2; 910 tmp0 = tmps[i0]; 911 tmp1 = tmps[i1]; 912 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 913 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 914 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 915 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 916 } 917 if (j == nz-1) { 918 tmp0 = tmps[*vi++]; 919 sum1 -= *v1++ *tmp0; 920 sum2 -= *v2++ *tmp0; 921 sum3 -= *v3++ *tmp0; 922 sum4 -= *v4++ *tmp0; 923 } 924 sum2 -= *v2++ * sum1; 925 sum3 -= *v3++ * sum1; 926 sum4 -= *v4++ * sum1; 927 sum3 -= *v3++ * sum2; 928 sum4 -= *v4++ * sum2; 929 sum4 -= *v4++ * sum3; 930 931 tmp[row++]=sum1; 932 tmp[row++]=sum2; 933 tmp[row++]=sum3; 934 tmp[row++]=sum4; 935 break; 936 case 5: 937 sum1 = b[*r++]; 938 sum2 = b[*r++]; 939 sum3 = b[*r++]; 940 sum4 = b[*r++]; 941 sum5 = b[*r++]; 942 v2 = aa + ai[row+1]; 943 v3 = aa + ai[row+2]; 944 v4 = aa + ai[row+3]; 945 v5 = aa + ai[row+4]; 946 947 for (j=0; j<nz-1; j+=2) { 948 i0 = vi[0]; 949 i1 = vi[1]; 950 vi +=2; 951 tmp0 = tmps[i0]; 952 tmp1 = tmps[i1]; 953 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 954 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 955 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 956 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 957 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 958 } 959 if (j == nz-1) { 960 tmp0 = tmps[*vi++]; 961 sum1 -= *v1++ *tmp0; 962 sum2 -= *v2++ *tmp0; 963 sum3 -= *v3++ *tmp0; 964 sum4 -= *v4++ *tmp0; 965 sum5 -= *v5++ *tmp0; 966 } 967 968 sum2 -= *v2++ * sum1; 969 sum3 -= *v3++ * sum1; 970 sum4 -= *v4++ * sum1; 971 sum5 -= *v5++ * sum1; 972 sum3 -= *v3++ * sum2; 973 sum4 -= *v4++ * sum2; 974 sum5 -= *v5++ * sum2; 975 sum4 -= *v4++ * sum3; 976 sum5 -= *v5++ * sum3; 977 sum5 -= *v5++ * sum4; 978 979 tmp[row++]=sum1; 980 tmp[row++]=sum2; 981 tmp[row++]=sum3; 982 tmp[row++]=sum4; 983 tmp[row++]=sum5; 984 break; 985 default: 986 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 987 } 988 } 989 /* backward solve the upper triangular */ 990 for (i=node_max -1,row = n-1; i>=0; i--) { 991 nsz = ns[i]; 992 aii = ai[row+1] -1; 993 v1 = aa + aii; 994 vi = aj + aii; 995 nz = aii- ad[row]; 996 switch (nsz) { /* Each loop in 'case' is unrolled */ 997 case 1: 998 sum1 = tmp[row]; 999 1000 for (j=nz; j>1; j-=2) { 1001 vi -=2; 1002 i0 = vi[2]; 1003 i1 = vi[1]; 1004 tmp0 = tmps[i0]; 1005 tmp1 = tmps[i1]; 1006 v1 -= 2; 1007 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1008 } 1009 if (j==1) { 1010 tmp0 = tmps[*vi--]; 1011 sum1 -= *v1-- * tmp0; 1012 } 1013 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1014 break; 1015 case 2: 1016 sum1 = tmp[row]; 1017 sum2 = tmp[row -1]; 1018 v2 = aa + ai[row]-1; 1019 for (j=nz; j>1; j-=2) { 1020 vi -=2; 1021 i0 = vi[2]; 1022 i1 = vi[1]; 1023 tmp0 = tmps[i0]; 1024 tmp1 = tmps[i1]; 1025 v1 -= 2; 1026 v2 -= 2; 1027 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1028 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1029 } 1030 if (j==1) { 1031 tmp0 = tmps[*vi--]; 1032 sum1 -= *v1-- * tmp0; 1033 sum2 -= *v2-- * tmp0; 1034 } 1035 1036 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1037 sum2 -= *v2-- * tmp0; 1038 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1039 break; 1040 case 3: 1041 sum1 = tmp[row]; 1042 sum2 = tmp[row -1]; 1043 sum3 = tmp[row -2]; 1044 v2 = aa + ai[row]-1; 1045 v3 = aa + ai[row -1]-1; 1046 for (j=nz; j>1; j-=2) { 1047 vi -=2; 1048 i0 = vi[2]; 1049 i1 = vi[1]; 1050 tmp0 = tmps[i0]; 1051 tmp1 = tmps[i1]; 1052 v1 -= 2; 1053 v2 -= 2; 1054 v3 -= 2; 1055 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1056 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1057 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1058 } 1059 if (j==1) { 1060 tmp0 = tmps[*vi--]; 1061 sum1 -= *v1-- * tmp0; 1062 sum2 -= *v2-- * tmp0; 1063 sum3 -= *v3-- * tmp0; 1064 } 1065 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1066 sum2 -= *v2-- * tmp0; 1067 sum3 -= *v3-- * tmp0; 1068 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1069 sum3 -= *v3-- * tmp0; 1070 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1071 1072 break; 1073 case 4: 1074 sum1 = tmp[row]; 1075 sum2 = tmp[row -1]; 1076 sum3 = tmp[row -2]; 1077 sum4 = tmp[row -3]; 1078 v2 = aa + ai[row]-1; 1079 v3 = aa + ai[row -1]-1; 1080 v4 = aa + ai[row -2]-1; 1081 1082 for (j=nz; j>1; j-=2) { 1083 vi -=2; 1084 i0 = vi[2]; 1085 i1 = vi[1]; 1086 tmp0 = tmps[i0]; 1087 tmp1 = tmps[i1]; 1088 v1 -= 2; 1089 v2 -= 2; 1090 v3 -= 2; 1091 v4 -= 2; 1092 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1093 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1094 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1095 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1096 } 1097 if (j==1) { 1098 tmp0 = tmps[*vi--]; 1099 sum1 -= *v1-- * tmp0; 1100 sum2 -= *v2-- * tmp0; 1101 sum3 -= *v3-- * tmp0; 1102 sum4 -= *v4-- * tmp0; 1103 } 1104 1105 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1106 sum2 -= *v2-- * tmp0; 1107 sum3 -= *v3-- * tmp0; 1108 sum4 -= *v4-- * tmp0; 1109 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1110 sum3 -= *v3-- * tmp0; 1111 sum4 -= *v4-- * tmp0; 1112 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1113 sum4 -= *v4-- * tmp0; 1114 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1115 break; 1116 case 5: 1117 sum1 = tmp[row]; 1118 sum2 = tmp[row -1]; 1119 sum3 = tmp[row -2]; 1120 sum4 = tmp[row -3]; 1121 sum5 = tmp[row -4]; 1122 v2 = aa + ai[row]-1; 1123 v3 = aa + ai[row -1]-1; 1124 v4 = aa + ai[row -2]-1; 1125 v5 = aa + ai[row -3]-1; 1126 for (j=nz; j>1; j-=2) { 1127 vi -= 2; 1128 i0 = vi[2]; 1129 i1 = vi[1]; 1130 tmp0 = tmps[i0]; 1131 tmp1 = tmps[i1]; 1132 v1 -= 2; 1133 v2 -= 2; 1134 v3 -= 2; 1135 v4 -= 2; 1136 v5 -= 2; 1137 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1138 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1139 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1140 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1141 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1142 } 1143 if (j==1) { 1144 tmp0 = tmps[*vi--]; 1145 sum1 -= *v1-- * tmp0; 1146 sum2 -= *v2-- * tmp0; 1147 sum3 -= *v3-- * tmp0; 1148 sum4 -= *v4-- * tmp0; 1149 sum5 -= *v5-- * tmp0; 1150 } 1151 1152 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1153 sum2 -= *v2-- * tmp0; 1154 sum3 -= *v3-- * tmp0; 1155 sum4 -= *v4-- * tmp0; 1156 sum5 -= *v5-- * tmp0; 1157 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1158 sum3 -= *v3-- * tmp0; 1159 sum4 -= *v4-- * tmp0; 1160 sum5 -= *v5-- * tmp0; 1161 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1162 sum4 -= *v4-- * tmp0; 1163 sum5 -= *v5-- * tmp0; 1164 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1165 sum5 -= *v5-- * tmp0; 1166 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1167 break; 1168 default: 1169 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 1170 } 1171 } 1172 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1173 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1174 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1175 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1176 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1177 PetscFunctionReturn(0); 1178 } 1179 1180 #undef __FUNCT__ 1181 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode" 1182 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info) 1183 { 1184 Mat C =B; 1185 Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data; 1186 IS isrow = b->row,isicol = b->icol; 1187 PetscErrorCode ierr; 1188 const PetscInt *r,*ic,*ics; 1189 const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag; 1190 PetscInt i,j,k,nz,nzL,row,*pj; 1191 const PetscInt *ajtmp,*bjtmp; 1192 MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4; 1193 const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4; 1194 FactorShiftCtx sctx; 1195 const PetscInt *ddiag; 1196 PetscReal rs; 1197 MatScalar d; 1198 PetscInt inod,nodesz,node_max,col; 1199 const PetscInt *ns; 1200 PetscInt *tmp_vec1,*tmp_vec2,*nsmap; 1201 1202 PetscFunctionBegin; 1203 /* MatPivotSetUp(): initialize shift context sctx */ 1204 ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr); 1205 1206 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1207 ddiag = a->diag; 1208 sctx.shift_top = info->zeropivot; 1209 for (i=0; i<n; i++) { 1210 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1211 d = (aa)[ddiag[i]]; 1212 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1213 v = aa+ai[i]; 1214 nz = ai[i+1] - ai[i]; 1215 for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]); 1216 if (rs>sctx.shift_top) sctx.shift_top = rs; 1217 } 1218 sctx.shift_top *= 1.1; 1219 sctx.nshift_max = 5; 1220 sctx.shift_lo = 0.; 1221 sctx.shift_hi = 1.; 1222 } 1223 1224 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1225 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1226 1227 ierr = PetscMalloc((4*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr); 1228 ierr = PetscMemzero(rtmp1,(4*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1229 rtmp2 = rtmp1 + n; 1230 rtmp3 = rtmp2 + n; 1231 rtmp4 = rtmp3 + n; 1232 ics = ic; 1233 1234 node_max = a->inode.node_count; 1235 ns = a->inode.size; 1236 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1237 1238 /* If max inode size > 4, split it into two inodes.*/ 1239 /* also map the inode sizes according to the ordering */ 1240 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1241 for (i=0,j=0; i<node_max; ++i,++j) { 1242 if (ns[i] > 4) { 1243 tmp_vec1[j] = 4; 1244 ++j; 1245 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1246 } else { 1247 tmp_vec1[j] = ns[i]; 1248 } 1249 } 1250 /* Use the correct node_max */ 1251 node_max = j; 1252 1253 /* Now reorder the inode info based on mat re-ordering info */ 1254 /* First create a row -> inode_size_array_index map */ 1255 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1256 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1257 for (i=0,row=0; i<node_max; i++) { 1258 nodesz = tmp_vec1[i]; 1259 for (j=0; j<nodesz; j++,row++) { 1260 nsmap[row] = i; 1261 } 1262 } 1263 /* Using nsmap, create a reordered ns structure */ 1264 for (i=0,j=0; i< node_max; i++) { 1265 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1266 tmp_vec2[i] = nodesz; 1267 j += nodesz; 1268 } 1269 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1270 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1271 1272 /* Now use the correct ns */ 1273 ns = tmp_vec2; 1274 1275 do { 1276 sctx.newshift = PETSC_FALSE; 1277 /* Now loop over each block-row, and do the factorization */ 1278 for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */ 1279 nodesz = ns[inod]; 1280 1281 switch (nodesz) { 1282 case 1: 1283 /*----------*/ 1284 /* zero rtmp1 */ 1285 /* L part */ 1286 nz = bi[i+1] - bi[i]; 1287 bjtmp = bj + bi[i]; 1288 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1289 1290 /* U part */ 1291 nz = bdiag[i]-bdiag[i+1]; 1292 bjtmp = bj + bdiag[i+1]+1; 1293 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1294 1295 /* load in initial (unfactored row) */ 1296 nz = ai[r[i]+1] - ai[r[i]]; 1297 ajtmp = aj + ai[r[i]]; 1298 v = aa + ai[r[i]]; 1299 for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j]; 1300 1301 /* ZeropivotApply() */ 1302 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1303 1304 /* elimination */ 1305 bjtmp = bj + bi[i]; 1306 row = *bjtmp++; 1307 nzL = bi[i+1] - bi[i]; 1308 for (k=0; k < nzL; k++) { 1309 pc = rtmp1 + row; 1310 if (*pc != 0.0) { 1311 pv = b->a + bdiag[row]; 1312 mul1 = *pc * (*pv); 1313 *pc = mul1; 1314 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1315 pv = b->a + bdiag[row+1]+1; 1316 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1317 for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1318 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1319 } 1320 row = *bjtmp++; 1321 } 1322 1323 /* finished row so stick it into b->a */ 1324 rs = 0.0; 1325 /* L part */ 1326 pv = b->a + bi[i]; 1327 pj = b->j + bi[i]; 1328 nz = bi[i+1] - bi[i]; 1329 for (j=0; j<nz; j++) { 1330 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1331 } 1332 1333 /* U part */ 1334 pv = b->a + bdiag[i+1]+1; 1335 pj = b->j + bdiag[i+1]+1; 1336 nz = bdiag[i] - bdiag[i+1]-1; 1337 for (j=0; j<nz; j++) { 1338 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1339 } 1340 1341 /* Check zero pivot */ 1342 sctx.rs = rs; 1343 sctx.pv = rtmp1[i]; 1344 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1345 if (sctx.newshift) break; 1346 1347 /* Mark diagonal and invert diagonal for simplier triangular solves */ 1348 pv = b->a + bdiag[i]; 1349 *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1350 break; 1351 1352 case 2: 1353 /*----------*/ 1354 /* zero rtmp1 and rtmp2 */ 1355 /* L part */ 1356 nz = bi[i+1] - bi[i]; 1357 bjtmp = bj + bi[i]; 1358 for (j=0; j<nz; j++) { 1359 col = bjtmp[j]; 1360 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1361 } 1362 1363 /* U part */ 1364 nz = bdiag[i]-bdiag[i+1]; 1365 bjtmp = bj + bdiag[i+1]+1; 1366 for (j=0; j<nz; j++) { 1367 col = bjtmp[j]; 1368 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1369 } 1370 1371 /* load in initial (unfactored row) */ 1372 nz = ai[r[i]+1] - ai[r[i]]; 1373 ajtmp = aj + ai[r[i]]; 1374 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; 1375 for (j=0; j<nz; j++) { 1376 col = ics[ajtmp[j]]; 1377 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; 1378 } 1379 /* ZeropivotApply(): shift the diagonal of the matrix */ 1380 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; 1381 1382 /* elimination */ 1383 bjtmp = bj + bi[i]; 1384 row = *bjtmp++; /* pivot row */ 1385 nzL = bi[i+1] - bi[i]; 1386 for (k=0; k < nzL; k++) { 1387 pc1 = rtmp1 + row; 1388 pc2 = rtmp2 + row; 1389 if (*pc1 != 0.0 || *pc2 != 0.0) { 1390 pv = b->a + bdiag[row]; 1391 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); 1392 *pc1 = mul1; *pc2 = mul2; 1393 1394 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1395 pv = b->a + bdiag[row+1]+1; 1396 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1397 for (j=0; j<nz; j++) { 1398 col = pj[j]; 1399 rtmp1[col] -= mul1 * pv[j]; 1400 rtmp2[col] -= mul2 * pv[j]; 1401 } 1402 ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr); 1403 } 1404 row = *bjtmp++; 1405 } 1406 1407 /* finished row i; check zero pivot, then stick row i into b->a */ 1408 rs = 0.0; 1409 /* L part */ 1410 pc1 = b->a + bi[i]; 1411 pj = b->j + bi[i]; 1412 nz = bi[i+1] - bi[i]; 1413 for (j=0; j<nz; j++) { 1414 col = pj[j]; 1415 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1416 } 1417 /* U part */ 1418 pc1 = b->a + bdiag[i+1]+1; 1419 pj = b->j + bdiag[i+1]+1; 1420 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1421 for (j=0; j<nz; j++) { 1422 col = pj[j]; 1423 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1424 } 1425 1426 sctx.rs = rs; 1427 sctx.pv = rtmp1[i]; 1428 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1429 if (sctx.newshift) break; 1430 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1431 *pc1 = 1.0/sctx.pv; 1432 1433 /* Now take care of diagonal 2x2 block. */ 1434 pc2 = rtmp2 + i; 1435 if (*pc2 != 0.0) { 1436 mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */ 1437 *pc2 = mul1; /* insert L entry */ 1438 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1439 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1440 for (j=0; j<nz; j++) { 1441 col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col]; 1442 } 1443 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1444 } 1445 1446 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1447 rs = 0.0; 1448 /* L part */ 1449 pc2 = b->a + bi[i+1]; 1450 pj = b->j + bi[i+1]; 1451 nz = bi[i+2] - bi[i+1]; 1452 for (j=0; j<nz; j++) { 1453 col = pj[j]; 1454 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1455 } 1456 /* U part */ 1457 pc2 = b->a + bdiag[i+2]+1; 1458 pj = b->j + bdiag[i+2]+1; 1459 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1460 for (j=0; j<nz; j++) { 1461 col = pj[j]; 1462 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1463 } 1464 1465 sctx.rs = rs; 1466 sctx.pv = rtmp2[i+1]; 1467 ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr); 1468 if (sctx.newshift) break; 1469 pc2 = b->a + bdiag[i+1]; 1470 *pc2 = 1.0/sctx.pv; 1471 break; 1472 1473 case 3: 1474 /*----------*/ 1475 /* zero rtmp */ 1476 /* L part */ 1477 nz = bi[i+1] - bi[i]; 1478 bjtmp = bj + bi[i]; 1479 for (j=0; j<nz; j++) { 1480 col = bjtmp[j]; 1481 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1482 } 1483 1484 /* U part */ 1485 nz = bdiag[i]-bdiag[i+1]; 1486 bjtmp = bj + bdiag[i+1]+1; 1487 for (j=0; j<nz; j++) { 1488 col = bjtmp[j]; 1489 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1490 } 1491 1492 /* load in initial (unfactored row) */ 1493 nz = ai[r[i]+1] - ai[r[i]]; 1494 ajtmp = aj + ai[r[i]]; 1495 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; 1496 for (j=0; j<nz; j++) { 1497 col = ics[ajtmp[j]]; 1498 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; 1499 } 1500 /* ZeropivotApply(): shift the diagonal of the matrix */ 1501 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; 1502 1503 /* elimination */ 1504 bjtmp = bj + bi[i]; 1505 row = *bjtmp++; /* pivot row */ 1506 nzL = bi[i+1] - bi[i]; 1507 for (k=0; k < nzL; k++) { 1508 pc1 = rtmp1 + row; 1509 pc2 = rtmp2 + row; 1510 pc3 = rtmp3 + row; 1511 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1512 pv = b->a + bdiag[row]; 1513 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); 1514 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; 1515 1516 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1517 pv = b->a + bdiag[row+1]+1; 1518 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1519 for (j=0; j<nz; j++) { 1520 col = pj[j]; 1521 rtmp1[col] -= mul1 * pv[j]; 1522 rtmp2[col] -= mul2 * pv[j]; 1523 rtmp3[col] -= mul3 * pv[j]; 1524 } 1525 ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr); 1526 } 1527 row = *bjtmp++; 1528 } 1529 1530 /* finished row i; check zero pivot, then stick row i into b->a */ 1531 rs = 0.0; 1532 /* L part */ 1533 pc1 = b->a + bi[i]; 1534 pj = b->j + bi[i]; 1535 nz = bi[i+1] - bi[i]; 1536 for (j=0; j<nz; j++) { 1537 col = pj[j]; 1538 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1539 } 1540 /* U part */ 1541 pc1 = b->a + bdiag[i+1]+1; 1542 pj = b->j + bdiag[i+1]+1; 1543 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1544 for (j=0; j<nz; j++) { 1545 col = pj[j]; 1546 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1547 } 1548 1549 sctx.rs = rs; 1550 sctx.pv = rtmp1[i]; 1551 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1552 if (sctx.newshift) break; 1553 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1554 *pc1 = 1.0/sctx.pv; 1555 1556 /* Now take care of 1st column of diagonal 3x3 block. */ 1557 pc2 = rtmp2 + i; 1558 pc3 = rtmp3 + i; 1559 if (*pc2 != 0.0 || *pc3 != 0.0) { 1560 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1561 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1562 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1563 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1564 for (j=0; j<nz; j++) { 1565 col = pj[j]; 1566 rtmp2[col] -= mul2 * rtmp1[col]; 1567 rtmp3[col] -= mul3 * rtmp1[col]; 1568 } 1569 ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr); 1570 } 1571 1572 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1573 rs = 0.0; 1574 /* L part */ 1575 pc2 = b->a + bi[i+1]; 1576 pj = b->j + bi[i+1]; 1577 nz = bi[i+2] - bi[i+1]; 1578 for (j=0; j<nz; j++) { 1579 col = pj[j]; 1580 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1581 } 1582 /* U part */ 1583 pc2 = b->a + bdiag[i+2]+1; 1584 pj = b->j + bdiag[i+2]+1; 1585 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1586 for (j=0; j<nz; j++) { 1587 col = pj[j]; 1588 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1589 } 1590 1591 sctx.rs = rs; 1592 sctx.pv = rtmp2[i+1]; 1593 ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr); 1594 if (sctx.newshift) break; 1595 pc2 = b->a + bdiag[i+1]; 1596 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1597 1598 /* Now take care of 2nd column of diagonal 3x3 block. */ 1599 pc3 = rtmp3 + i+1; 1600 if (*pc3 != 0.0) { 1601 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1602 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1603 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1604 for (j=0; j<nz; j++) { 1605 col = pj[j]; 1606 rtmp3[col] -= mul3 * rtmp2[col]; 1607 } 1608 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1609 } 1610 1611 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1612 rs = 0.0; 1613 /* L part */ 1614 pc3 = b->a + bi[i+2]; 1615 pj = b->j + bi[i+2]; 1616 nz = bi[i+3] - bi[i+2]; 1617 for (j=0; j<nz; j++) { 1618 col = pj[j]; 1619 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1620 } 1621 /* U part */ 1622 pc3 = b->a + bdiag[i+3]+1; 1623 pj = b->j + bdiag[i+3]+1; 1624 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1625 for (j=0; j<nz; j++) { 1626 col = pj[j]; 1627 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1628 } 1629 1630 sctx.rs = rs; 1631 sctx.pv = rtmp3[i+2]; 1632 ierr = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr); 1633 if (sctx.newshift) break; 1634 pc3 = b->a + bdiag[i+2]; 1635 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1636 break; 1637 case 4: 1638 /*----------*/ 1639 /* zero rtmp */ 1640 /* L part */ 1641 nz = bi[i+1] - bi[i]; 1642 bjtmp = bj + bi[i]; 1643 for (j=0; j<nz; j++) { 1644 col = bjtmp[j]; 1645 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0; 1646 } 1647 1648 /* U part */ 1649 nz = bdiag[i]-bdiag[i+1]; 1650 bjtmp = bj + bdiag[i+1]+1; 1651 for (j=0; j<nz; j++) { 1652 col = bjtmp[j]; 1653 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0; 1654 } 1655 1656 /* load in initial (unfactored row) */ 1657 nz = ai[r[i]+1] - ai[r[i]]; 1658 ajtmp = aj + ai[r[i]]; 1659 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3]; 1660 for (j=0; j<nz; j++) { 1661 col = ics[ajtmp[j]]; 1662 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j]; 1663 } 1664 /* ZeropivotApply(): shift the diagonal of the matrix */ 1665 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount; 1666 1667 /* elimination */ 1668 bjtmp = bj + bi[i]; 1669 row = *bjtmp++; /* pivot row */ 1670 nzL = bi[i+1] - bi[i]; 1671 for (k=0; k < nzL; k++) { 1672 pc1 = rtmp1 + row; 1673 pc2 = rtmp2 + row; 1674 pc3 = rtmp3 + row; 1675 pc4 = rtmp4 + row; 1676 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1677 pv = b->a + bdiag[row]; 1678 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv); 1679 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4; 1680 1681 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1682 pv = b->a + bdiag[row+1]+1; 1683 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1684 for (j=0; j<nz; j++) { 1685 col = pj[j]; 1686 rtmp1[col] -= mul1 * pv[j]; 1687 rtmp2[col] -= mul2 * pv[j]; 1688 rtmp3[col] -= mul3 * pv[j]; 1689 rtmp4[col] -= mul4 * pv[j]; 1690 } 1691 ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr); 1692 } 1693 row = *bjtmp++; 1694 } 1695 1696 /* finished row i; check zero pivot, then stick row i into b->a */ 1697 rs = 0.0; 1698 /* L part */ 1699 pc1 = b->a + bi[i]; 1700 pj = b->j + bi[i]; 1701 nz = bi[i+1] - bi[i]; 1702 for (j=0; j<nz; j++) { 1703 col = pj[j]; 1704 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1705 } 1706 /* U part */ 1707 pc1 = b->a + bdiag[i+1]+1; 1708 pj = b->j + bdiag[i+1]+1; 1709 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1710 for (j=0; j<nz; j++) { 1711 col = pj[j]; 1712 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1713 } 1714 1715 sctx.rs = rs; 1716 sctx.pv = rtmp1[i]; 1717 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1718 if (sctx.newshift) break; 1719 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1720 *pc1 = 1.0/sctx.pv; 1721 1722 /* Now take care of 1st column of diagonal 4x4 block. */ 1723 pc2 = rtmp2 + i; 1724 pc3 = rtmp3 + i; 1725 pc4 = rtmp4 + i; 1726 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1727 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1728 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1729 mul4 = (*pc4)*(*pc1); *pc4 = mul4; 1730 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1731 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1732 for (j=0; j<nz; j++) { 1733 col = pj[j]; 1734 rtmp2[col] -= mul2 * rtmp1[col]; 1735 rtmp3[col] -= mul3 * rtmp1[col]; 1736 rtmp4[col] -= mul4 * rtmp1[col]; 1737 } 1738 ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr); 1739 } 1740 1741 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1742 rs = 0.0; 1743 /* L part */ 1744 pc2 = b->a + bi[i+1]; 1745 pj = b->j + bi[i+1]; 1746 nz = bi[i+2] - bi[i+1]; 1747 for (j=0; j<nz; j++) { 1748 col = pj[j]; 1749 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1750 } 1751 /* U part */ 1752 pc2 = b->a + bdiag[i+2]+1; 1753 pj = b->j + bdiag[i+2]+1; 1754 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1755 for (j=0; j<nz; j++) { 1756 col = pj[j]; 1757 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1758 } 1759 1760 sctx.rs = rs; 1761 sctx.pv = rtmp2[i+1]; 1762 ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr); 1763 if (sctx.newshift) break; 1764 pc2 = b->a + bdiag[i+1]; 1765 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1766 1767 /* Now take care of 2nd column of diagonal 4x4 block. */ 1768 pc3 = rtmp3 + i+1; 1769 pc4 = rtmp4 + i+1; 1770 if (*pc3 != 0.0 || *pc4 != 0.0) { 1771 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1772 mul4 = (*pc4)*(*pc2); *pc4 = mul4; 1773 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1774 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1775 for (j=0; j<nz; j++) { 1776 col = pj[j]; 1777 rtmp3[col] -= mul3 * rtmp2[col]; 1778 rtmp4[col] -= mul4 * rtmp2[col]; 1779 } 1780 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1781 } 1782 1783 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1784 rs = 0.0; 1785 /* L part */ 1786 pc3 = b->a + bi[i+2]; 1787 pj = b->j + bi[i+2]; 1788 nz = bi[i+3] - bi[i+2]; 1789 for (j=0; j<nz; j++) { 1790 col = pj[j]; 1791 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1792 } 1793 /* U part */ 1794 pc3 = b->a + bdiag[i+3]+1; 1795 pj = b->j + bdiag[i+3]+1; 1796 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1797 for (j=0; j<nz; j++) { 1798 col = pj[j]; 1799 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1800 } 1801 1802 sctx.rs = rs; 1803 sctx.pv = rtmp3[i+2]; 1804 ierr = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr); 1805 if (sctx.newshift) break; 1806 pc3 = b->a + bdiag[i+2]; 1807 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1808 1809 /* Now take care of 3rd column of diagonal 4x4 block. */ 1810 pc4 = rtmp4 + i+2; 1811 if (*pc4 != 0.0) { 1812 mul4 = (*pc4)*(*pc3); *pc4 = mul4; 1813 pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */ 1814 nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */ 1815 for (j=0; j<nz; j++) { 1816 col = pj[j]; 1817 rtmp4[col] -= mul4 * rtmp3[col]; 1818 } 1819 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1820 } 1821 1822 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1823 rs = 0.0; 1824 /* L part */ 1825 pc4 = b->a + bi[i+3]; 1826 pj = b->j + bi[i+3]; 1827 nz = bi[i+4] - bi[i+3]; 1828 for (j=0; j<nz; j++) { 1829 col = pj[j]; 1830 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1831 } 1832 /* U part */ 1833 pc4 = b->a + bdiag[i+4]+1; 1834 pj = b->j + bdiag[i+4]+1; 1835 nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */ 1836 for (j=0; j<nz; j++) { 1837 col = pj[j]; 1838 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1839 } 1840 1841 sctx.rs = rs; 1842 sctx.pv = rtmp4[i+3]; 1843 ierr = MatPivotCheck(A,info,&sctx,i+3);CHKERRQ(ierr); 1844 if (sctx.newshift) break; 1845 pc4 = b->a + bdiag[i+3]; 1846 *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */ 1847 break; 1848 1849 default: 1850 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 1851 } 1852 if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 1853 i += nodesz; /* Update the row */ 1854 } 1855 1856 /* MatPivotRefine() */ 1857 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) { 1858 /* 1859 * if no shift in this attempt & shifting & started shifting & can refine, 1860 * then try lower shift 1861 */ 1862 sctx.shift_hi = sctx.shift_fraction; 1863 sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.; 1864 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1865 sctx.newshift = PETSC_TRUE; 1866 sctx.nshift++; 1867 } 1868 } while (sctx.newshift); 1869 1870 ierr = PetscFree(rtmp1);CHKERRQ(ierr); 1871 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1872 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1873 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1874 1875 if (b->inode.size) { 1876 C->ops->solve = MatSolve_SeqAIJ_Inode; 1877 } else { 1878 C->ops->solve = MatSolve_SeqAIJ; 1879 } 1880 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1881 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1882 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1883 C->ops->matsolve = MatMatSolve_SeqAIJ; 1884 C->assembled = PETSC_TRUE; 1885 C->preallocated = PETSC_TRUE; 1886 1887 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 1888 1889 /* MatShiftView(A,info,&sctx) */ 1890 if (sctx.nshift) { 1891 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) { 1892 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 1893 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1894 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 1895 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 1896 ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr); 1897 } 1898 } 1899 PetscFunctionReturn(0); 1900 } 1901 1902 #undef __FUNCT__ 1903 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace" 1904 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info) 1905 { 1906 Mat C = B; 1907 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1908 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1909 PetscErrorCode ierr; 1910 const PetscInt *r,*ic,*c,*ics; 1911 PetscInt n = A->rmap->n,*bi = b->i; 1912 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1913 PetscInt i,j,idx,*bd = b->diag,node_max,nodesz; 1914 PetscInt *ai = a->i,*aj = a->j; 1915 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1916 PetscScalar mul1,mul2,mul3,tmp; 1917 MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; 1918 const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; 1919 PetscReal rs=0.0; 1920 FactorShiftCtx sctx; 1921 1922 PetscFunctionBegin; 1923 sctx.shift_top = 0; 1924 sctx.nshift_max = 0; 1925 sctx.shift_lo = 0; 1926 sctx.shift_hi = 0; 1927 sctx.shift_fraction = 0; 1928 1929 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1930 if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1931 sctx.shift_top = 0; 1932 for (i=0; i<n; i++) { 1933 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1934 rs = 0.0; 1935 ajtmp = aj + ai[i]; 1936 rtmp1 = aa + ai[i]; 1937 nz = ai[i+1] - ai[i]; 1938 for (j=0; j<nz; j++) { 1939 if (*ajtmp != i) { 1940 rs += PetscAbsScalar(*rtmp1++); 1941 } else { 1942 rs -= PetscRealPart(*rtmp1++); 1943 } 1944 ajtmp++; 1945 } 1946 if (rs>sctx.shift_top) sctx.shift_top = rs; 1947 } 1948 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1949 sctx.shift_top *= 1.1; 1950 sctx.nshift_max = 5; 1951 sctx.shift_lo = 0.; 1952 sctx.shift_hi = 1.; 1953 } 1954 sctx.shift_amount = 0; 1955 sctx.nshift = 0; 1956 1957 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1958 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1959 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1960 ierr = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr); 1961 ierr = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1962 ics = ic; 1963 rtmp22 = rtmp11 + n; 1964 rtmp33 = rtmp22 + n; 1965 1966 node_max = a->inode.node_count; 1967 ns = a->inode.size; 1968 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1969 1970 /* If max inode size > 3, split it into two inodes.*/ 1971 /* also map the inode sizes according to the ordering */ 1972 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1973 for (i=0,j=0; i<node_max; ++i,++j) { 1974 if (ns[i]>3) { 1975 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1976 ++j; 1977 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1978 } else { 1979 tmp_vec1[j] = ns[i]; 1980 } 1981 } 1982 /* Use the correct node_max */ 1983 node_max = j; 1984 1985 /* Now reorder the inode info based on mat re-ordering info */ 1986 /* First create a row -> inode_size_array_index map */ 1987 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1988 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1989 for (i=0,row=0; i<node_max; i++) { 1990 nodesz = tmp_vec1[i]; 1991 for (j=0; j<nodesz; j++,row++) { 1992 nsmap[row] = i; 1993 } 1994 } 1995 /* Using nsmap, create a reordered ns structure */ 1996 for (i=0,j=0; i< node_max; i++) { 1997 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1998 tmp_vec2[i] = nodesz; 1999 j += nodesz; 2000 } 2001 ierr = PetscFree(nsmap);CHKERRQ(ierr); 2002 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 2003 /* Now use the correct ns */ 2004 ns = tmp_vec2; 2005 2006 do { 2007 sctx.newshift = PETSC_FALSE; 2008 /* Now loop over each block-row, and do the factorization */ 2009 for (i=0,row=0; i<node_max; i++) { 2010 nodesz = ns[i]; 2011 nz = bi[row+1] - bi[row]; 2012 bjtmp = bj + bi[row]; 2013 2014 switch (nodesz) { 2015 case 1: 2016 for (j=0; j<nz; j++) { 2017 idx = bjtmp[j]; 2018 rtmp11[idx] = 0.0; 2019 } 2020 2021 /* load in initial (unfactored row) */ 2022 idx = r[row]; 2023 nz_tmp = ai[idx+1] - ai[idx]; 2024 ajtmp = aj + ai[idx]; 2025 v1 = aa + ai[idx]; 2026 2027 for (j=0; j<nz_tmp; j++) { 2028 idx = ics[ajtmp[j]]; 2029 rtmp11[idx] = v1[j]; 2030 } 2031 rtmp11[ics[r[row]]] += sctx.shift_amount; 2032 2033 prow = *bjtmp++; 2034 while (prow < row) { 2035 pc1 = rtmp11 + prow; 2036 if (*pc1 != 0.0) { 2037 pv = ba + bd[prow]; 2038 pj = nbj + bd[prow]; 2039 mul1 = *pc1 * *pv++; 2040 *pc1 = mul1; 2041 nz_tmp = bi[prow+1] - bd[prow] - 1; 2042 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2043 for (j=0; j<nz_tmp; j++) { 2044 tmp = pv[j]; 2045 idx = pj[j]; 2046 rtmp11[idx] -= mul1 * tmp; 2047 } 2048 } 2049 prow = *bjtmp++; 2050 } 2051 pj = bj + bi[row]; 2052 pc1 = ba + bi[row]; 2053 2054 sctx.pv = rtmp11[row]; 2055 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 2056 rs = 0.0; 2057 for (j=0; j<nz; j++) { 2058 idx = pj[j]; 2059 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2060 if (idx != row) rs += PetscAbsScalar(pc1[j]); 2061 } 2062 sctx.rs = rs; 2063 ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr); 2064 if (sctx.newshift) goto endofwhile; 2065 break; 2066 2067 case 2: 2068 for (j=0; j<nz; j++) { 2069 idx = bjtmp[j]; 2070 rtmp11[idx] = 0.0; 2071 rtmp22[idx] = 0.0; 2072 } 2073 2074 /* load in initial (unfactored row) */ 2075 idx = r[row]; 2076 nz_tmp = ai[idx+1] - ai[idx]; 2077 ajtmp = aj + ai[idx]; 2078 v1 = aa + ai[idx]; 2079 v2 = aa + ai[idx+1]; 2080 for (j=0; j<nz_tmp; j++) { 2081 idx = ics[ajtmp[j]]; 2082 rtmp11[idx] = v1[j]; 2083 rtmp22[idx] = v2[j]; 2084 } 2085 rtmp11[ics[r[row]]] += sctx.shift_amount; 2086 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2087 2088 prow = *bjtmp++; 2089 while (prow < row) { 2090 pc1 = rtmp11 + prow; 2091 pc2 = rtmp22 + prow; 2092 if (*pc1 != 0.0 || *pc2 != 0.0) { 2093 pv = ba + bd[prow]; 2094 pj = nbj + bd[prow]; 2095 mul1 = *pc1 * *pv; 2096 mul2 = *pc2 * *pv; 2097 ++pv; 2098 *pc1 = mul1; 2099 *pc2 = mul2; 2100 2101 nz_tmp = bi[prow+1] - bd[prow] - 1; 2102 for (j=0; j<nz_tmp; j++) { 2103 tmp = pv[j]; 2104 idx = pj[j]; 2105 rtmp11[idx] -= mul1 * tmp; 2106 rtmp22[idx] -= mul2 * tmp; 2107 } 2108 ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr); 2109 } 2110 prow = *bjtmp++; 2111 } 2112 2113 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2114 pc1 = rtmp11 + prow; 2115 pc2 = rtmp22 + prow; 2116 2117 sctx.pv = *pc1; 2118 pj = bj + bi[prow]; 2119 rs = 0.0; 2120 for (j=0; j<nz; j++) { 2121 idx = pj[j]; 2122 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2123 } 2124 sctx.rs = rs; 2125 ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr); 2126 if (sctx.newshift) goto endofwhile; 2127 2128 if (*pc2 != 0.0) { 2129 pj = nbj + bd[prow]; 2130 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 2131 *pc2 = mul2; 2132 nz_tmp = bi[prow+1] - bd[prow] - 1; 2133 for (j=0; j<nz_tmp; j++) { 2134 idx = pj[j]; 2135 tmp = rtmp11[idx]; 2136 rtmp22[idx] -= mul2 * tmp; 2137 } 2138 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2139 } 2140 2141 pj = bj + bi[row]; 2142 pc1 = ba + bi[row]; 2143 pc2 = ba + bi[row+1]; 2144 2145 sctx.pv = rtmp22[row+1]; 2146 rs = 0.0; 2147 rtmp11[row] = 1.0/rtmp11[row]; 2148 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2149 /* copy row entries from dense representation to sparse */ 2150 for (j=0; j<nz; j++) { 2151 idx = pj[j]; 2152 pc1[j] = rtmp11[idx]; 2153 pc2[j] = rtmp22[idx]; 2154 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 2155 } 2156 sctx.rs = rs; 2157 ierr = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr); 2158 if (sctx.newshift) goto endofwhile; 2159 break; 2160 2161 case 3: 2162 for (j=0; j<nz; j++) { 2163 idx = bjtmp[j]; 2164 rtmp11[idx] = 0.0; 2165 rtmp22[idx] = 0.0; 2166 rtmp33[idx] = 0.0; 2167 } 2168 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2169 idx = r[row]; 2170 nz_tmp = ai[idx+1] - ai[idx]; 2171 ajtmp = aj + ai[idx]; 2172 v1 = aa + ai[idx]; 2173 v2 = aa + ai[idx+1]; 2174 v3 = aa + ai[idx+2]; 2175 for (j=0; j<nz_tmp; j++) { 2176 idx = ics[ajtmp[j]]; 2177 rtmp11[idx] = v1[j]; 2178 rtmp22[idx] = v2[j]; 2179 rtmp33[idx] = v3[j]; 2180 } 2181 rtmp11[ics[r[row]]] += sctx.shift_amount; 2182 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2183 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 2184 2185 /* loop over all pivot row blocks above this row block */ 2186 prow = *bjtmp++; 2187 while (prow < row) { 2188 pc1 = rtmp11 + prow; 2189 pc2 = rtmp22 + prow; 2190 pc3 = rtmp33 + prow; 2191 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) { 2192 pv = ba + bd[prow]; 2193 pj = nbj + bd[prow]; 2194 mul1 = *pc1 * *pv; 2195 mul2 = *pc2 * *pv; 2196 mul3 = *pc3 * *pv; 2197 ++pv; 2198 *pc1 = mul1; 2199 *pc2 = mul2; 2200 *pc3 = mul3; 2201 2202 nz_tmp = bi[prow+1] - bd[prow] - 1; 2203 /* update this row based on pivot row */ 2204 for (j=0; j<nz_tmp; j++) { 2205 tmp = pv[j]; 2206 idx = pj[j]; 2207 rtmp11[idx] -= mul1 * tmp; 2208 rtmp22[idx] -= mul2 * tmp; 2209 rtmp33[idx] -= mul3 * tmp; 2210 } 2211 ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr); 2212 } 2213 prow = *bjtmp++; 2214 } 2215 2216 /* Now take care of diagonal 3x3 block in this set of rows */ 2217 /* note: prow = row here */ 2218 pc1 = rtmp11 + prow; 2219 pc2 = rtmp22 + prow; 2220 pc3 = rtmp33 + prow; 2221 2222 sctx.pv = *pc1; 2223 pj = bj + bi[prow]; 2224 rs = 0.0; 2225 for (j=0; j<nz; j++) { 2226 idx = pj[j]; 2227 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2228 } 2229 sctx.rs = rs; 2230 ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr); 2231 if (sctx.newshift) goto endofwhile; 2232 2233 if (*pc2 != 0.0 || *pc3 != 0.0) { 2234 mul2 = (*pc2)/(*pc1); 2235 mul3 = (*pc3)/(*pc1); 2236 *pc2 = mul2; 2237 *pc3 = mul3; 2238 nz_tmp = bi[prow+1] - bd[prow] - 1; 2239 pj = nbj + bd[prow]; 2240 for (j=0; j<nz_tmp; j++) { 2241 idx = pj[j]; 2242 tmp = rtmp11[idx]; 2243 rtmp22[idx] -= mul2 * tmp; 2244 rtmp33[idx] -= mul3 * tmp; 2245 } 2246 ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr); 2247 } 2248 ++prow; 2249 2250 pc2 = rtmp22 + prow; 2251 pc3 = rtmp33 + prow; 2252 sctx.pv = *pc2; 2253 pj = bj + bi[prow]; 2254 rs = 0.0; 2255 for (j=0; j<nz; j++) { 2256 idx = pj[j]; 2257 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2258 } 2259 sctx.rs = rs; 2260 ierr = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr); 2261 if (sctx.newshift) goto endofwhile; 2262 2263 if (*pc3 != 0.0) { 2264 mul3 = (*pc3)/(*pc2); 2265 *pc3 = mul3; 2266 pj = nbj + bd[prow]; 2267 nz_tmp = bi[prow+1] - bd[prow] - 1; 2268 for (j=0; j<nz_tmp; j++) { 2269 idx = pj[j]; 2270 tmp = rtmp22[idx]; 2271 rtmp33[idx] -= mul3 * tmp; 2272 } 2273 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2274 } 2275 2276 pj = bj + bi[row]; 2277 pc1 = ba + bi[row]; 2278 pc2 = ba + bi[row+1]; 2279 pc3 = ba + bi[row+2]; 2280 2281 sctx.pv = rtmp33[row+2]; 2282 rs = 0.0; 2283 rtmp11[row] = 1.0/rtmp11[row]; 2284 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2285 rtmp33[row+2] = 1.0/rtmp33[row+2]; 2286 /* copy row entries from dense representation to sparse */ 2287 for (j=0; j<nz; j++) { 2288 idx = pj[j]; 2289 pc1[j] = rtmp11[idx]; 2290 pc2[j] = rtmp22[idx]; 2291 pc3[j] = rtmp33[idx]; 2292 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 2293 } 2294 2295 sctx.rs = rs; 2296 ierr = MatPivotCheck(A,info,&sctx,row+2);CHKERRQ(ierr); 2297 if (sctx.newshift) goto endofwhile; 2298 break; 2299 2300 default: 2301 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 2302 } 2303 row += nodesz; /* Update the row */ 2304 } 2305 endofwhile:; 2306 } while (sctx.newshift); 2307 ierr = PetscFree(rtmp11);CHKERRQ(ierr); 2308 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 2309 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2310 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2311 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 2312 2313 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2314 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2315 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2316 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2317 C->assembled = PETSC_TRUE; 2318 C->preallocated = PETSC_TRUE; 2319 if (sctx.nshift) { 2320 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2321 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 2322 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2323 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 2324 } 2325 } 2326 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 2327 ierr = Mat_CheckInode(C,PETSC_FALSE);CHKERRQ(ierr); 2328 PetscFunctionReturn(0); 2329 } 2330 2331 2332 /* ----------------------------------------------------------- */ 2333 #undef __FUNCT__ 2334 #define __FUNCT__ "MatSolve_SeqAIJ_Inode" 2335 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 2336 { 2337 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2338 IS iscol = a->col,isrow = a->row; 2339 PetscErrorCode ierr; 2340 const PetscInt *r,*c,*rout,*cout; 2341 PetscInt i,j,n = A->rmap->n; 2342 PetscInt node_max,row,nsz,aii,i0,i1,nz; 2343 const PetscInt *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj; 2344 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 2345 PetscScalar sum1,sum2,sum3,sum4,sum5; 2346 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 2347 const PetscScalar *b; 2348 2349 PetscFunctionBegin; 2350 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 2351 node_max = a->inode.node_count; 2352 ns = a->inode.size; /* Node Size array */ 2353 2354 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2355 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2356 tmp = a->solve_work; 2357 2358 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2359 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2360 2361 /* forward solve the lower triangular */ 2362 tmps = tmp; 2363 aa = a_a; 2364 aj = a_j; 2365 ad = a->diag; 2366 2367 for (i = 0,row = 0; i< node_max; ++i) { 2368 nsz = ns[i]; 2369 aii = ai[row]; 2370 v1 = aa + aii; 2371 vi = aj + aii; 2372 nz = ai[row+1]- ai[row]; 2373 2374 if (i < node_max-1) { 2375 /* Prefetch the indices for the next block */ 2376 PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 2377 /* Prefetch the data for the next block */ 2378 PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 2379 } 2380 2381 switch (nsz) { /* Each loop in 'case' is unrolled */ 2382 case 1: 2383 sum1 = b[r[row]]; 2384 for (j=0; j<nz-1; j+=2) { 2385 i0 = vi[j]; 2386 i1 = vi[j+1]; 2387 tmp0 = tmps[i0]; 2388 tmp1 = tmps[i1]; 2389 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1; 2390 } 2391 if (j == nz-1) { 2392 tmp0 = tmps[vi[j]]; 2393 sum1 -= v1[j]*tmp0; 2394 } 2395 tmp[row++]=sum1; 2396 break; 2397 case 2: 2398 sum1 = b[r[row]]; 2399 sum2 = b[r[row+1]]; 2400 v2 = aa + ai[row+1]; 2401 2402 for (j=0; j<nz-1; j+=2) { 2403 i0 = vi[j]; 2404 i1 = vi[j+1]; 2405 tmp0 = tmps[i0]; 2406 tmp1 = tmps[i1]; 2407 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2408 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2409 } 2410 if (j == nz-1) { 2411 tmp0 = tmps[vi[j]]; 2412 sum1 -= v1[j] *tmp0; 2413 sum2 -= v2[j] *tmp0; 2414 } 2415 sum2 -= v2[nz] * sum1; 2416 tmp[row++]=sum1; 2417 tmp[row++]=sum2; 2418 break; 2419 case 3: 2420 sum1 = b[r[row]]; 2421 sum2 = b[r[row+1]]; 2422 sum3 = b[r[row+2]]; 2423 v2 = aa + ai[row+1]; 2424 v3 = aa + ai[row+2]; 2425 2426 for (j=0; j<nz-1; j+=2) { 2427 i0 = vi[j]; 2428 i1 = vi[j+1]; 2429 tmp0 = tmps[i0]; 2430 tmp1 = tmps[i1]; 2431 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2432 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2433 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2434 } 2435 if (j == nz-1) { 2436 tmp0 = tmps[vi[j]]; 2437 sum1 -= v1[j] *tmp0; 2438 sum2 -= v2[j] *tmp0; 2439 sum3 -= v3[j] *tmp0; 2440 } 2441 sum2 -= v2[nz] * sum1; 2442 sum3 -= v3[nz] * sum1; 2443 sum3 -= v3[nz+1] * sum2; 2444 tmp[row++]=sum1; 2445 tmp[row++]=sum2; 2446 tmp[row++]=sum3; 2447 break; 2448 2449 case 4: 2450 sum1 = b[r[row]]; 2451 sum2 = b[r[row+1]]; 2452 sum3 = b[r[row+2]]; 2453 sum4 = b[r[row+3]]; 2454 v2 = aa + ai[row+1]; 2455 v3 = aa + ai[row+2]; 2456 v4 = aa + ai[row+3]; 2457 2458 for (j=0; j<nz-1; j+=2) { 2459 i0 = vi[j]; 2460 i1 = vi[j+1]; 2461 tmp0 = tmps[i0]; 2462 tmp1 = tmps[i1]; 2463 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2464 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2465 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2466 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2467 } 2468 if (j == nz-1) { 2469 tmp0 = tmps[vi[j]]; 2470 sum1 -= v1[j] *tmp0; 2471 sum2 -= v2[j] *tmp0; 2472 sum3 -= v3[j] *tmp0; 2473 sum4 -= v4[j] *tmp0; 2474 } 2475 sum2 -= v2[nz] * sum1; 2476 sum3 -= v3[nz] * sum1; 2477 sum4 -= v4[nz] * sum1; 2478 sum3 -= v3[nz+1] * sum2; 2479 sum4 -= v4[nz+1] * sum2; 2480 sum4 -= v4[nz+2] * sum3; 2481 2482 tmp[row++]=sum1; 2483 tmp[row++]=sum2; 2484 tmp[row++]=sum3; 2485 tmp[row++]=sum4; 2486 break; 2487 case 5: 2488 sum1 = b[r[row]]; 2489 sum2 = b[r[row+1]]; 2490 sum3 = b[r[row+2]]; 2491 sum4 = b[r[row+3]]; 2492 sum5 = b[r[row+4]]; 2493 v2 = aa + ai[row+1]; 2494 v3 = aa + ai[row+2]; 2495 v4 = aa + ai[row+3]; 2496 v5 = aa + ai[row+4]; 2497 2498 for (j=0; j<nz-1; j+=2) { 2499 i0 = vi[j]; 2500 i1 = vi[j+1]; 2501 tmp0 = tmps[i0]; 2502 tmp1 = tmps[i1]; 2503 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2504 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2505 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2506 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2507 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1; 2508 } 2509 if (j == nz-1) { 2510 tmp0 = tmps[vi[j]]; 2511 sum1 -= v1[j] *tmp0; 2512 sum2 -= v2[j] *tmp0; 2513 sum3 -= v3[j] *tmp0; 2514 sum4 -= v4[j] *tmp0; 2515 sum5 -= v5[j] *tmp0; 2516 } 2517 2518 sum2 -= v2[nz] * sum1; 2519 sum3 -= v3[nz] * sum1; 2520 sum4 -= v4[nz] * sum1; 2521 sum5 -= v5[nz] * sum1; 2522 sum3 -= v3[nz+1] * sum2; 2523 sum4 -= v4[nz+1] * sum2; 2524 sum5 -= v5[nz+1] * sum2; 2525 sum4 -= v4[nz+2] * sum3; 2526 sum5 -= v5[nz+2] * sum3; 2527 sum5 -= v5[nz+3] * sum4; 2528 2529 tmp[row++]=sum1; 2530 tmp[row++]=sum2; 2531 tmp[row++]=sum3; 2532 tmp[row++]=sum4; 2533 tmp[row++]=sum5; 2534 break; 2535 default: 2536 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2537 } 2538 } 2539 /* backward solve the upper triangular */ 2540 for (i=node_max -1,row = n-1; i>=0; i--) { 2541 nsz = ns[i]; 2542 aii = ad[row+1] + 1; 2543 v1 = aa + aii; 2544 vi = aj + aii; 2545 nz = ad[row]- ad[row+1] - 1; 2546 2547 if (i > 0) { 2548 /* Prefetch the indices for the next block */ 2549 PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2550 /* Prefetch the data for the next block */ 2551 PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2552 } 2553 2554 switch (nsz) { /* Each loop in 'case' is unrolled */ 2555 case 1: 2556 sum1 = tmp[row]; 2557 2558 for (j=0; j<nz-1; j+=2) { 2559 i0 = vi[j]; 2560 i1 = vi[j+1]; 2561 tmp0 = tmps[i0]; 2562 tmp1 = tmps[i1]; 2563 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2564 } 2565 if (j == nz-1) { 2566 tmp0 = tmps[vi[j]]; 2567 sum1 -= v1[j]*tmp0; 2568 } 2569 x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2570 break; 2571 case 2: 2572 sum1 = tmp[row]; 2573 sum2 = tmp[row-1]; 2574 v2 = aa + ad[row] + 1; 2575 for (j=0; j<nz-1; j+=2) { 2576 i0 = vi[j]; 2577 i1 = vi[j+1]; 2578 tmp0 = tmps[i0]; 2579 tmp1 = tmps[i1]; 2580 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2581 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2582 } 2583 if (j == nz-1) { 2584 tmp0 = tmps[vi[j]]; 2585 sum1 -= v1[j]* tmp0; 2586 sum2 -= v2[j+1]* tmp0; 2587 } 2588 2589 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2590 sum2 -= v2[0] * tmp0; 2591 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2592 break; 2593 case 3: 2594 sum1 = tmp[row]; 2595 sum2 = tmp[row -1]; 2596 sum3 = tmp[row -2]; 2597 v2 = aa + ad[row] + 1; 2598 v3 = aa + ad[row -1] + 1; 2599 for (j=0; j<nz-1; j+=2) { 2600 i0 = vi[j]; 2601 i1 = vi[j+1]; 2602 tmp0 = tmps[i0]; 2603 tmp1 = tmps[i1]; 2604 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2605 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2606 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2607 } 2608 if (j== nz-1) { 2609 tmp0 = tmps[vi[j]]; 2610 sum1 -= v1[j] * tmp0; 2611 sum2 -= v2[j+1] * tmp0; 2612 sum3 -= v3[j+2] * tmp0; 2613 } 2614 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2615 sum2 -= v2[0]* tmp0; 2616 sum3 -= v3[1] * tmp0; 2617 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2618 sum3 -= v3[0]* tmp0; 2619 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2620 2621 break; 2622 case 4: 2623 sum1 = tmp[row]; 2624 sum2 = tmp[row -1]; 2625 sum3 = tmp[row -2]; 2626 sum4 = tmp[row -3]; 2627 v2 = aa + ad[row]+1; 2628 v3 = aa + ad[row -1]+1; 2629 v4 = aa + ad[row -2]+1; 2630 2631 for (j=0; j<nz-1; j+=2) { 2632 i0 = vi[j]; 2633 i1 = vi[j+1]; 2634 tmp0 = tmps[i0]; 2635 tmp1 = tmps[i1]; 2636 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2637 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2638 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2639 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2640 } 2641 if (j== nz-1) { 2642 tmp0 = tmps[vi[j]]; 2643 sum1 -= v1[j] * tmp0; 2644 sum2 -= v2[j+1] * tmp0; 2645 sum3 -= v3[j+2] * tmp0; 2646 sum4 -= v4[j+3] * tmp0; 2647 } 2648 2649 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2650 sum2 -= v2[0] * tmp0; 2651 sum3 -= v3[1] * tmp0; 2652 sum4 -= v4[2] * tmp0; 2653 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2654 sum3 -= v3[0] * tmp0; 2655 sum4 -= v4[1] * tmp0; 2656 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2657 sum4 -= v4[0] * tmp0; 2658 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2659 break; 2660 case 5: 2661 sum1 = tmp[row]; 2662 sum2 = tmp[row -1]; 2663 sum3 = tmp[row -2]; 2664 sum4 = tmp[row -3]; 2665 sum5 = tmp[row -4]; 2666 v2 = aa + ad[row]+1; 2667 v3 = aa + ad[row -1]+1; 2668 v4 = aa + ad[row -2]+1; 2669 v5 = aa + ad[row -3]+1; 2670 for (j=0; j<nz-1; j+=2) { 2671 i0 = vi[j]; 2672 i1 = vi[j+1]; 2673 tmp0 = tmps[i0]; 2674 tmp1 = tmps[i1]; 2675 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2676 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2677 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2678 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2679 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1; 2680 } 2681 if (j==nz-1) { 2682 tmp0 = tmps[vi[j]]; 2683 sum1 -= v1[j] * tmp0; 2684 sum2 -= v2[j+1] * tmp0; 2685 sum3 -= v3[j+2] * tmp0; 2686 sum4 -= v4[j+3] * tmp0; 2687 sum5 -= v5[j+4] * tmp0; 2688 } 2689 2690 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2691 sum2 -= v2[0] * tmp0; 2692 sum3 -= v3[1] * tmp0; 2693 sum4 -= v4[2] * tmp0; 2694 sum5 -= v5[3] * tmp0; 2695 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2696 sum3 -= v3[0] * tmp0; 2697 sum4 -= v4[1] * tmp0; 2698 sum5 -= v5[2] * tmp0; 2699 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2700 sum4 -= v4[0] * tmp0; 2701 sum5 -= v5[1] * tmp0; 2702 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2703 sum5 -= v5[0] * tmp0; 2704 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--; 2705 break; 2706 default: 2707 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2708 } 2709 } 2710 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2711 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2712 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2713 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2714 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 2715 PetscFunctionReturn(0); 2716 } 2717 2718 2719 /* 2720 Makes a longer coloring[] array and calls the usual code with that 2721 */ 2722 #undef __FUNCT__ 2723 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode" 2724 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 2725 { 2726 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 2727 PetscErrorCode ierr; 2728 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 2729 PetscInt *colorused,i; 2730 ISColoringValue *newcolor; 2731 2732 PetscFunctionBegin; 2733 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr); 2734 /* loop over inodes, marking a color for each column*/ 2735 row = 0; 2736 for (i=0; i<m; i++) { 2737 for (j=0; j<ns[i]; j++) { 2738 newcolor[row++] = coloring[i] + j*ncolors; 2739 } 2740 } 2741 2742 /* eliminate unneeded colors */ 2743 ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr); 2744 ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr); 2745 for (i=0; i<n; i++) { 2746 colorused[newcolor[i]] = 1; 2747 } 2748 2749 for (i=1; i<5*ncolors; i++) { 2750 colorused[i] += colorused[i-1]; 2751 } 2752 ncolors = colorused[5*ncolors-1]; 2753 for (i=0; i<n; i++) { 2754 newcolor[i] = colorused[newcolor[i]]-1; 2755 } 2756 ierr = PetscFree(colorused);CHKERRQ(ierr); 2757 ierr = ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); 2758 ierr = PetscFree(coloring);CHKERRQ(ierr); 2759 PetscFunctionReturn(0); 2760 } 2761 2762 #include <petsc-private/kernels/blockinvert.h> 2763 2764 #undef __FUNCT__ 2765 #define __FUNCT__ "MatSOR_SeqAIJ_Inode" 2766 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 2767 { 2768 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2769 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3; 2770 MatScalar *ibdiag,*bdiag,work[25],*t; 2771 PetscScalar *x,tmp4,tmp5,x1,x2,x3,x4,x5; 2772 const MatScalar *v = a->a,*v1,*v2,*v3,*v4,*v5; 2773 const PetscScalar *xb, *b; 2774 PetscReal zeropivot = 1.0e-15, shift = 0.0; 2775 PetscErrorCode ierr; 2776 PetscInt n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2; 2777 PetscInt sz,k,ipvt[5]; 2778 const PetscInt *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i; 2779 2780 PetscFunctionBegin; 2781 if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 2782 if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 2783 2784 if (!a->inode.ibdiagvalid) { 2785 if (!a->inode.ibdiag) { 2786 /* calculate space needed for diagonal blocks */ 2787 for (i=0; i<m; i++) { 2788 cnt += sizes[i]*sizes[i]; 2789 } 2790 a->inode.bdiagsize = cnt; 2791 2792 ierr = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr); 2793 } 2794 2795 /* copy over the diagonal blocks and invert them */ 2796 ibdiag = a->inode.ibdiag; 2797 bdiag = a->inode.bdiag; 2798 cnt = 0; 2799 for (i=0, row = 0; i<m; i++) { 2800 for (j=0; j<sizes[i]; j++) { 2801 for (k=0; k<sizes[i]; k++) { 2802 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 2803 } 2804 } 2805 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 2806 2807 switch (sizes[i]) { 2808 case 1: 2809 /* Create matrix data structure */ 2810 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 2811 ibdiag[cnt] = 1.0/ibdiag[cnt]; 2812 break; 2813 case 2: 2814 ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr); 2815 break; 2816 case 3: 2817 ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr); 2818 break; 2819 case 4: 2820 ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr); 2821 break; 2822 case 5: 2823 ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr); 2824 break; 2825 default: 2826 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2827 } 2828 cnt += sizes[i]*sizes[i]; 2829 row += sizes[i]; 2830 } 2831 a->inode.ibdiagvalid = PETSC_TRUE; 2832 } 2833 ibdiag = a->inode.ibdiag; 2834 bdiag = a->inode.bdiag; 2835 t = a->inode.ssor_work; 2836 2837 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2838 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2839 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2840 if (flag & SOR_ZERO_INITIAL_GUESS) { 2841 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 2842 2843 for (i=0, row=0; i<m; i++) { 2844 sz = diag[row] - ii[row]; 2845 v1 = a->a + ii[row]; 2846 idx = a->j + ii[row]; 2847 2848 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2849 switch (sizes[i]) { 2850 case 1: 2851 2852 sum1 = b[row]; 2853 for (n = 0; n<sz-1; n+=2) { 2854 i1 = idx[0]; 2855 i2 = idx[1]; 2856 idx += 2; 2857 tmp0 = x[i1]; 2858 tmp1 = x[i2]; 2859 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2860 } 2861 2862 if (n == sz-1) { 2863 tmp0 = x[*idx]; 2864 sum1 -= *v1 * tmp0; 2865 } 2866 t[row] = sum1; 2867 x[row++] = sum1*(*ibdiag++); 2868 break; 2869 case 2: 2870 v2 = a->a + ii[row+1]; 2871 sum1 = b[row]; 2872 sum2 = b[row+1]; 2873 for (n = 0; n<sz-1; n+=2) { 2874 i1 = idx[0]; 2875 i2 = idx[1]; 2876 idx += 2; 2877 tmp0 = x[i1]; 2878 tmp1 = x[i2]; 2879 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2880 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2881 } 2882 2883 if (n == sz-1) { 2884 tmp0 = x[*idx]; 2885 sum1 -= v1[0] * tmp0; 2886 sum2 -= v2[0] * tmp0; 2887 } 2888 t[row] = sum1; 2889 t[row+1] = sum2; 2890 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2891 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2892 ibdiag += 4; 2893 break; 2894 case 3: 2895 v2 = a->a + ii[row+1]; 2896 v3 = a->a + ii[row+2]; 2897 sum1 = b[row]; 2898 sum2 = b[row+1]; 2899 sum3 = b[row+2]; 2900 for (n = 0; n<sz-1; n+=2) { 2901 i1 = idx[0]; 2902 i2 = idx[1]; 2903 idx += 2; 2904 tmp0 = x[i1]; 2905 tmp1 = x[i2]; 2906 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2907 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2908 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2909 } 2910 2911 if (n == sz-1) { 2912 tmp0 = x[*idx]; 2913 sum1 -= v1[0] * tmp0; 2914 sum2 -= v2[0] * tmp0; 2915 sum3 -= v3[0] * tmp0; 2916 } 2917 t[row] = sum1; 2918 t[row+1] = sum2; 2919 t[row+2] = sum3; 2920 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2921 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2922 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2923 ibdiag += 9; 2924 break; 2925 case 4: 2926 v2 = a->a + ii[row+1]; 2927 v3 = a->a + ii[row+2]; 2928 v4 = a->a + ii[row+3]; 2929 sum1 = b[row]; 2930 sum2 = b[row+1]; 2931 sum3 = b[row+2]; 2932 sum4 = b[row+3]; 2933 for (n = 0; n<sz-1; n+=2) { 2934 i1 = idx[0]; 2935 i2 = idx[1]; 2936 idx += 2; 2937 tmp0 = x[i1]; 2938 tmp1 = x[i2]; 2939 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2940 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2941 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2942 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2943 } 2944 2945 if (n == sz-1) { 2946 tmp0 = x[*idx]; 2947 sum1 -= v1[0] * tmp0; 2948 sum2 -= v2[0] * tmp0; 2949 sum3 -= v3[0] * tmp0; 2950 sum4 -= v4[0] * tmp0; 2951 } 2952 t[row] = sum1; 2953 t[row+1] = sum2; 2954 t[row+2] = sum3; 2955 t[row+3] = sum4; 2956 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2957 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2958 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2959 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2960 ibdiag += 16; 2961 break; 2962 case 5: 2963 v2 = a->a + ii[row+1]; 2964 v3 = a->a + ii[row+2]; 2965 v4 = a->a + ii[row+3]; 2966 v5 = a->a + ii[row+4]; 2967 sum1 = b[row]; 2968 sum2 = b[row+1]; 2969 sum3 = b[row+2]; 2970 sum4 = b[row+3]; 2971 sum5 = b[row+4]; 2972 for (n = 0; n<sz-1; n+=2) { 2973 i1 = idx[0]; 2974 i2 = idx[1]; 2975 idx += 2; 2976 tmp0 = x[i1]; 2977 tmp1 = x[i2]; 2978 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2979 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2980 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2981 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2982 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2983 } 2984 2985 if (n == sz-1) { 2986 tmp0 = x[*idx]; 2987 sum1 -= v1[0] * tmp0; 2988 sum2 -= v2[0] * tmp0; 2989 sum3 -= v3[0] * tmp0; 2990 sum4 -= v4[0] * tmp0; 2991 sum5 -= v5[0] * tmp0; 2992 } 2993 t[row] = sum1; 2994 t[row+1] = sum2; 2995 t[row+2] = sum3; 2996 t[row+3] = sum4; 2997 t[row+4] = sum5; 2998 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2999 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3000 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3001 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3002 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3003 ibdiag += 25; 3004 break; 3005 default: 3006 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3007 } 3008 } 3009 3010 xb = t; 3011 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3012 } else xb = b; 3013 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3014 3015 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3016 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3017 ibdiag -= sizes[i]*sizes[i]; 3018 sz = ii[row+1] - diag[row] - 1; 3019 v1 = a->a + diag[row] + 1; 3020 idx = a->j + diag[row] + 1; 3021 3022 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3023 switch (sizes[i]) { 3024 case 1: 3025 3026 sum1 = xb[row]; 3027 for (n = 0; n<sz-1; n+=2) { 3028 i1 = idx[0]; 3029 i2 = idx[1]; 3030 idx += 2; 3031 tmp0 = x[i1]; 3032 tmp1 = x[i2]; 3033 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3034 } 3035 3036 if (n == sz-1) { 3037 tmp0 = x[*idx]; 3038 sum1 -= *v1*tmp0; 3039 } 3040 x[row--] = sum1*(*ibdiag); 3041 break; 3042 3043 case 2: 3044 3045 sum1 = xb[row]; 3046 sum2 = xb[row-1]; 3047 /* note that sum1 is associated with the second of the two rows */ 3048 v2 = a->a + diag[row-1] + 2; 3049 for (n = 0; n<sz-1; n+=2) { 3050 i1 = idx[0]; 3051 i2 = idx[1]; 3052 idx += 2; 3053 tmp0 = x[i1]; 3054 tmp1 = x[i2]; 3055 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3056 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3057 } 3058 3059 if (n == sz-1) { 3060 tmp0 = x[*idx]; 3061 sum1 -= *v1*tmp0; 3062 sum2 -= *v2*tmp0; 3063 } 3064 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3065 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3066 break; 3067 case 3: 3068 3069 sum1 = xb[row]; 3070 sum2 = xb[row-1]; 3071 sum3 = xb[row-2]; 3072 v2 = a->a + diag[row-1] + 2; 3073 v3 = a->a + diag[row-2] + 3; 3074 for (n = 0; n<sz-1; n+=2) { 3075 i1 = idx[0]; 3076 i2 = idx[1]; 3077 idx += 2; 3078 tmp0 = x[i1]; 3079 tmp1 = x[i2]; 3080 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3081 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3082 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3083 } 3084 3085 if (n == sz-1) { 3086 tmp0 = x[*idx]; 3087 sum1 -= *v1*tmp0; 3088 sum2 -= *v2*tmp0; 3089 sum3 -= *v3*tmp0; 3090 } 3091 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3092 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3093 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3094 break; 3095 case 4: 3096 3097 sum1 = xb[row]; 3098 sum2 = xb[row-1]; 3099 sum3 = xb[row-2]; 3100 sum4 = xb[row-3]; 3101 v2 = a->a + diag[row-1] + 2; 3102 v3 = a->a + diag[row-2] + 3; 3103 v4 = a->a + diag[row-3] + 4; 3104 for (n = 0; n<sz-1; n+=2) { 3105 i1 = idx[0]; 3106 i2 = idx[1]; 3107 idx += 2; 3108 tmp0 = x[i1]; 3109 tmp1 = x[i2]; 3110 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3111 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3112 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3113 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3114 } 3115 3116 if (n == sz-1) { 3117 tmp0 = x[*idx]; 3118 sum1 -= *v1*tmp0; 3119 sum2 -= *v2*tmp0; 3120 sum3 -= *v3*tmp0; 3121 sum4 -= *v4*tmp0; 3122 } 3123 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3124 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3125 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3126 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3127 break; 3128 case 5: 3129 3130 sum1 = xb[row]; 3131 sum2 = xb[row-1]; 3132 sum3 = xb[row-2]; 3133 sum4 = xb[row-3]; 3134 sum5 = xb[row-4]; 3135 v2 = a->a + diag[row-1] + 2; 3136 v3 = a->a + diag[row-2] + 3; 3137 v4 = a->a + diag[row-3] + 4; 3138 v5 = a->a + diag[row-4] + 5; 3139 for (n = 0; n<sz-1; n+=2) { 3140 i1 = idx[0]; 3141 i2 = idx[1]; 3142 idx += 2; 3143 tmp0 = x[i1]; 3144 tmp1 = x[i2]; 3145 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3146 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3147 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3148 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3149 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3150 } 3151 3152 if (n == sz-1) { 3153 tmp0 = x[*idx]; 3154 sum1 -= *v1*tmp0; 3155 sum2 -= *v2*tmp0; 3156 sum3 -= *v3*tmp0; 3157 sum4 -= *v4*tmp0; 3158 sum5 -= *v5*tmp0; 3159 } 3160 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3161 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3162 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3163 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3164 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3165 break; 3166 default: 3167 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3168 } 3169 } 3170 3171 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3172 } 3173 its--; 3174 } 3175 while (its--) { 3176 3177 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 3178 ibdiag = a->inode.ibdiag; 3179 3180 for (i=0, row=0; i<m; i++) { 3181 sz = ii[row + 1] - ii[row]; 3182 v1 = a->a + ii[row]; 3183 idx = a->j + ii[row]; 3184 3185 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3186 switch (sizes[i]) { 3187 case 1: 3188 3189 sum1 = b[row]; 3190 for (n = 0; n<sz-1; n+=2) { 3191 i1 = idx[0]; 3192 i2 = idx[1]; 3193 idx += 2; 3194 tmp0 = x[i1]; 3195 tmp1 = x[i2]; 3196 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3197 } 3198 3199 if (n == sz-1) { 3200 tmp0 = x[*idx]; 3201 sum1 -= *v1 * tmp0; 3202 } 3203 3204 /* in MatSOR_SeqAIJ this line would be 3205 * 3206 * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++); 3207 * 3208 * but omega == 1, so this becomes 3209 * 3210 * x[row] = (sum1+(*bdiag++)*x[row])*(*ibdiag++); 3211 * 3212 * but bdiag and ibdiag cancel each other, so we can change this 3213 * to adding sum1*(*ibdiag++). We can skip bdiag for the larger 3214 * block sizes as well 3215 */ 3216 x[row++] += sum1*(*ibdiag++); 3217 break; 3218 case 2: 3219 v2 = a->a + ii[row+1]; 3220 sum1 = b[row]; 3221 sum2 = b[row+1]; 3222 for (n = 0; n<sz-1; n+=2) { 3223 i1 = idx[0]; 3224 i2 = idx[1]; 3225 idx += 2; 3226 tmp0 = x[i1]; 3227 tmp1 = x[i2]; 3228 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3229 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3230 } 3231 3232 if (n == sz-1) { 3233 tmp0 = x[*idx]; 3234 sum1 -= v1[0] * tmp0; 3235 sum2 -= v2[0] * tmp0; 3236 } 3237 x[row++] += sum1*ibdiag[0] + sum2*ibdiag[2]; 3238 x[row++] += sum1*ibdiag[1] + sum2*ibdiag[3]; 3239 ibdiag += 4; 3240 break; 3241 case 3: 3242 v2 = a->a + ii[row+1]; 3243 v3 = a->a + ii[row+2]; 3244 sum1 = b[row]; 3245 sum2 = b[row+1]; 3246 sum3 = b[row+2]; 3247 for (n = 0; n<sz-1; n+=2) { 3248 i1 = idx[0]; 3249 i2 = idx[1]; 3250 idx += 2; 3251 tmp0 = x[i1]; 3252 tmp1 = x[i2]; 3253 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3254 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3255 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3256 } 3257 3258 if (n == sz-1) { 3259 tmp0 = x[*idx]; 3260 sum1 -= v1[0] * tmp0; 3261 sum2 -= v2[0] * tmp0; 3262 sum3 -= v3[0] * tmp0; 3263 } 3264 x[row++] += sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3265 x[row++] += sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3266 x[row++] += sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3267 ibdiag += 9; 3268 break; 3269 case 4: 3270 v2 = a->a + ii[row+1]; 3271 v3 = a->a + ii[row+2]; 3272 v4 = a->a + ii[row+3]; 3273 sum1 = b[row]; 3274 sum2 = b[row+1]; 3275 sum3 = b[row+2]; 3276 sum4 = b[row+3]; 3277 for (n = 0; n<sz-1; n+=2) { 3278 i1 = idx[0]; 3279 i2 = idx[1]; 3280 idx += 2; 3281 tmp0 = x[i1]; 3282 tmp1 = x[i2]; 3283 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3284 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3285 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3286 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3287 } 3288 3289 if (n == sz-1) { 3290 tmp0 = x[*idx]; 3291 sum1 -= v1[0] * tmp0; 3292 sum2 -= v2[0] * tmp0; 3293 sum3 -= v3[0] * tmp0; 3294 sum4 -= v4[0] * tmp0; 3295 } 3296 x[row++] += sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3297 x[row++] += sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3298 x[row++] += sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3299 x[row++] += sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3300 ibdiag += 16; 3301 break; 3302 case 5: 3303 v2 = a->a + ii[row+1]; 3304 v3 = a->a + ii[row+2]; 3305 v4 = a->a + ii[row+3]; 3306 v5 = a->a + ii[row+4]; 3307 sum1 = b[row]; 3308 sum2 = b[row+1]; 3309 sum3 = b[row+2]; 3310 sum4 = b[row+3]; 3311 sum5 = b[row+4]; 3312 for (n = 0; n<sz-1; n+=2) { 3313 i1 = idx[0]; 3314 i2 = idx[1]; 3315 idx += 2; 3316 tmp0 = x[i1]; 3317 tmp1 = x[i2]; 3318 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3319 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3320 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3321 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3322 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3323 } 3324 3325 if (n == sz-1) { 3326 tmp0 = x[*idx]; 3327 sum1 -= v1[0] * tmp0; 3328 sum2 -= v2[0] * tmp0; 3329 sum3 -= v3[0] * tmp0; 3330 sum4 -= v4[0] * tmp0; 3331 sum5 -= v5[0] * tmp0; 3332 } 3333 x[row++] += sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3334 x[row++] += sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3335 x[row++] += sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3336 x[row++] += sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3337 x[row++] += sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3338 ibdiag += 25; 3339 break; 3340 default: 3341 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3342 } 3343 } 3344 3345 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 3346 } 3347 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3348 3349 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3350 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3351 ibdiag -= sizes[i]*sizes[i]; 3352 sz = ii[row+1] - ii[row]; 3353 v1 = a->a + ii[row]; 3354 idx = a->j + ii[row]; 3355 3356 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3357 switch (sizes[i]) { 3358 case 1: 3359 3360 sum1 = b[row]; 3361 for (n = 0; n<sz-1; n+=2) { 3362 i1 = idx[0]; 3363 i2 = idx[1]; 3364 idx += 2; 3365 tmp0 = x[i1]; 3366 tmp1 = x[i2]; 3367 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3368 } 3369 3370 if (n == sz-1) { 3371 tmp0 = x[*idx]; 3372 sum1 -= *v1*tmp0; 3373 } 3374 x[row--] += sum1*(*ibdiag); 3375 break; 3376 3377 case 2: 3378 3379 sum1 = b[row]; 3380 sum2 = b[row-1]; 3381 /* note that sum1 is associated with the second of the two rows */ 3382 v2 = a->a + ii[row - 1]; 3383 for (n = 0; n<sz-1; n+=2) { 3384 i1 = idx[0]; 3385 i2 = idx[1]; 3386 idx += 2; 3387 tmp0 = x[i1]; 3388 tmp1 = x[i2]; 3389 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3390 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3391 } 3392 3393 if (n == sz-1) { 3394 tmp0 = x[*idx]; 3395 sum1 -= *v1*tmp0; 3396 sum2 -= *v2*tmp0; 3397 } 3398 x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3]; 3399 x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2]; 3400 break; 3401 case 3: 3402 3403 sum1 = b[row]; 3404 sum2 = b[row-1]; 3405 sum3 = b[row-2]; 3406 v2 = a->a + ii[row-1]; 3407 v3 = a->a + ii[row-2]; 3408 for (n = 0; n<sz-1; n+=2) { 3409 i1 = idx[0]; 3410 i2 = idx[1]; 3411 idx += 2; 3412 tmp0 = x[i1]; 3413 tmp1 = x[i2]; 3414 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3415 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3416 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3417 } 3418 3419 if (n == sz-1) { 3420 tmp0 = x[*idx]; 3421 sum1 -= *v1*tmp0; 3422 sum2 -= *v2*tmp0; 3423 sum3 -= *v3*tmp0; 3424 } 3425 x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3426 x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3427 x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3428 break; 3429 case 4: 3430 3431 sum1 = b[row]; 3432 sum2 = b[row-1]; 3433 sum3 = b[row-2]; 3434 sum4 = b[row-3]; 3435 v2 = a->a + ii[row-1]; 3436 v3 = a->a + ii[row-2]; 3437 v4 = a->a + ii[row-3]; 3438 for (n = 0; n<sz-1; n+=2) { 3439 i1 = idx[0]; 3440 i2 = idx[1]; 3441 idx += 2; 3442 tmp0 = x[i1]; 3443 tmp1 = x[i2]; 3444 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3445 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3446 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3447 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3448 } 3449 3450 if (n == sz-1) { 3451 tmp0 = x[*idx]; 3452 sum1 -= *v1*tmp0; 3453 sum2 -= *v2*tmp0; 3454 sum3 -= *v3*tmp0; 3455 sum4 -= *v4*tmp0; 3456 } 3457 x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3458 x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3459 x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3460 x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3461 break; 3462 case 5: 3463 3464 sum1 = b[row]; 3465 sum2 = b[row-1]; 3466 sum3 = b[row-2]; 3467 sum4 = b[row-3]; 3468 sum5 = b[row-4]; 3469 v2 = a->a + ii[row-1]; 3470 v3 = a->a + ii[row-2]; 3471 v4 = a->a + ii[row-3]; 3472 v5 = a->a + ii[row-4]; 3473 for (n = 0; n<sz-1; n+=2) { 3474 i1 = idx[0]; 3475 i2 = idx[1]; 3476 idx += 2; 3477 tmp0 = x[i1]; 3478 tmp1 = x[i2]; 3479 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3480 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3481 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3482 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3483 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3484 } 3485 3486 if (n == sz-1) { 3487 tmp0 = x[*idx]; 3488 sum1 -= *v1*tmp0; 3489 sum2 -= *v2*tmp0; 3490 sum3 -= *v3*tmp0; 3491 sum4 -= *v4*tmp0; 3492 sum5 -= *v5*tmp0; 3493 } 3494 x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3495 x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3496 x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3497 x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3498 x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3499 break; 3500 default: 3501 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3502 } 3503 } 3504 3505 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 3506 } 3507 } 3508 if (flag & SOR_EISENSTAT) { 3509 /* 3510 Apply (U + D)^-1 where D is now the block diagonal 3511 */ 3512 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3513 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3514 ibdiag -= sizes[i]*sizes[i]; 3515 sz = ii[row+1] - diag[row] - 1; 3516 v1 = a->a + diag[row] + 1; 3517 idx = a->j + diag[row] + 1; 3518 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3519 switch (sizes[i]) { 3520 case 1: 3521 3522 sum1 = b[row]; 3523 for (n = 0; n<sz-1; n+=2) { 3524 i1 = idx[0]; 3525 i2 = idx[1]; 3526 idx += 2; 3527 tmp0 = x[i1]; 3528 tmp1 = x[i2]; 3529 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3530 } 3531 3532 if (n == sz-1) { 3533 tmp0 = x[*idx]; 3534 sum1 -= *v1*tmp0; 3535 } 3536 x[row] = sum1*(*ibdiag);row--; 3537 break; 3538 3539 case 2: 3540 3541 sum1 = b[row]; 3542 sum2 = b[row-1]; 3543 /* note that sum1 is associated with the second of the two rows */ 3544 v2 = a->a + diag[row-1] + 2; 3545 for (n = 0; n<sz-1; n+=2) { 3546 i1 = idx[0]; 3547 i2 = idx[1]; 3548 idx += 2; 3549 tmp0 = x[i1]; 3550 tmp1 = x[i2]; 3551 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3552 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3553 } 3554 3555 if (n == sz-1) { 3556 tmp0 = x[*idx]; 3557 sum1 -= *v1*tmp0; 3558 sum2 -= *v2*tmp0; 3559 } 3560 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3561 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3562 row -= 2; 3563 break; 3564 case 3: 3565 3566 sum1 = b[row]; 3567 sum2 = b[row-1]; 3568 sum3 = b[row-2]; 3569 v2 = a->a + diag[row-1] + 2; 3570 v3 = a->a + diag[row-2] + 3; 3571 for (n = 0; n<sz-1; n+=2) { 3572 i1 = idx[0]; 3573 i2 = idx[1]; 3574 idx += 2; 3575 tmp0 = x[i1]; 3576 tmp1 = x[i2]; 3577 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3578 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3579 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3580 } 3581 3582 if (n == sz-1) { 3583 tmp0 = x[*idx]; 3584 sum1 -= *v1*tmp0; 3585 sum2 -= *v2*tmp0; 3586 sum3 -= *v3*tmp0; 3587 } 3588 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3589 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3590 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3591 row -= 3; 3592 break; 3593 case 4: 3594 3595 sum1 = b[row]; 3596 sum2 = b[row-1]; 3597 sum3 = b[row-2]; 3598 sum4 = b[row-3]; 3599 v2 = a->a + diag[row-1] + 2; 3600 v3 = a->a + diag[row-2] + 3; 3601 v4 = a->a + diag[row-3] + 4; 3602 for (n = 0; n<sz-1; n+=2) { 3603 i1 = idx[0]; 3604 i2 = idx[1]; 3605 idx += 2; 3606 tmp0 = x[i1]; 3607 tmp1 = x[i2]; 3608 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3609 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3610 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3611 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3612 } 3613 3614 if (n == sz-1) { 3615 tmp0 = x[*idx]; 3616 sum1 -= *v1*tmp0; 3617 sum2 -= *v2*tmp0; 3618 sum3 -= *v3*tmp0; 3619 sum4 -= *v4*tmp0; 3620 } 3621 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3622 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3623 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3624 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3625 row -= 4; 3626 break; 3627 case 5: 3628 3629 sum1 = b[row]; 3630 sum2 = b[row-1]; 3631 sum3 = b[row-2]; 3632 sum4 = b[row-3]; 3633 sum5 = b[row-4]; 3634 v2 = a->a + diag[row-1] + 2; 3635 v3 = a->a + diag[row-2] + 3; 3636 v4 = a->a + diag[row-3] + 4; 3637 v5 = a->a + diag[row-4] + 5; 3638 for (n = 0; n<sz-1; n+=2) { 3639 i1 = idx[0]; 3640 i2 = idx[1]; 3641 idx += 2; 3642 tmp0 = x[i1]; 3643 tmp1 = x[i2]; 3644 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3645 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3646 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3647 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3648 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3649 } 3650 3651 if (n == sz-1) { 3652 tmp0 = x[*idx]; 3653 sum1 -= *v1*tmp0; 3654 sum2 -= *v2*tmp0; 3655 sum3 -= *v3*tmp0; 3656 sum4 -= *v4*tmp0; 3657 sum5 -= *v5*tmp0; 3658 } 3659 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3660 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3661 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3662 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3663 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3664 row -= 5; 3665 break; 3666 default: 3667 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3668 } 3669 } 3670 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3671 3672 /* 3673 t = b - D x where D is the block diagonal 3674 */ 3675 cnt = 0; 3676 for (i=0, row=0; i<m; i++) { 3677 switch (sizes[i]) { 3678 case 1: 3679 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 3680 break; 3681 case 2: 3682 x1 = x[row]; x2 = x[row+1]; 3683 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3684 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3685 t[row] = b[row] - tmp1; 3686 t[row+1] = b[row+1] - tmp2; row += 2; 3687 cnt += 4; 3688 break; 3689 case 3: 3690 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3691 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3692 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3693 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3694 t[row] = b[row] - tmp1; 3695 t[row+1] = b[row+1] - tmp2; 3696 t[row+2] = b[row+2] - tmp3; row += 3; 3697 cnt += 9; 3698 break; 3699 case 4: 3700 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3701 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3702 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3703 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3704 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3705 t[row] = b[row] - tmp1; 3706 t[row+1] = b[row+1] - tmp2; 3707 t[row+2] = b[row+2] - tmp3; 3708 t[row+3] = b[row+3] - tmp4; row += 4; 3709 cnt += 16; 3710 break; 3711 case 5: 3712 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3713 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3714 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3715 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3716 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3717 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3718 t[row] = b[row] - tmp1; 3719 t[row+1] = b[row+1] - tmp2; 3720 t[row+2] = b[row+2] - tmp3; 3721 t[row+3] = b[row+3] - tmp4; 3722 t[row+4] = b[row+4] - tmp5;row += 5; 3723 cnt += 25; 3724 break; 3725 default: 3726 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3727 } 3728 } 3729 ierr = PetscLogFlops(m);CHKERRQ(ierr); 3730 3731 3732 3733 /* 3734 Apply (L + D)^-1 where D is the block diagonal 3735 */ 3736 for (i=0, row=0; i<m; i++) { 3737 sz = diag[row] - ii[row]; 3738 v1 = a->a + ii[row]; 3739 idx = a->j + ii[row]; 3740 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3741 switch (sizes[i]) { 3742 case 1: 3743 3744 sum1 = t[row]; 3745 for (n = 0; n<sz-1; n+=2) { 3746 i1 = idx[0]; 3747 i2 = idx[1]; 3748 idx += 2; 3749 tmp0 = t[i1]; 3750 tmp1 = t[i2]; 3751 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3752 } 3753 3754 if (n == sz-1) { 3755 tmp0 = t[*idx]; 3756 sum1 -= *v1 * tmp0; 3757 } 3758 x[row] += t[row] = sum1*(*ibdiag++); row++; 3759 break; 3760 case 2: 3761 v2 = a->a + ii[row+1]; 3762 sum1 = t[row]; 3763 sum2 = t[row+1]; 3764 for (n = 0; n<sz-1; n+=2) { 3765 i1 = idx[0]; 3766 i2 = idx[1]; 3767 idx += 2; 3768 tmp0 = t[i1]; 3769 tmp1 = t[i2]; 3770 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3771 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3772 } 3773 3774 if (n == sz-1) { 3775 tmp0 = t[*idx]; 3776 sum1 -= v1[0] * tmp0; 3777 sum2 -= v2[0] * tmp0; 3778 } 3779 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3780 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3781 ibdiag += 4; row += 2; 3782 break; 3783 case 3: 3784 v2 = a->a + ii[row+1]; 3785 v3 = a->a + ii[row+2]; 3786 sum1 = t[row]; 3787 sum2 = t[row+1]; 3788 sum3 = t[row+2]; 3789 for (n = 0; n<sz-1; n+=2) { 3790 i1 = idx[0]; 3791 i2 = idx[1]; 3792 idx += 2; 3793 tmp0 = t[i1]; 3794 tmp1 = t[i2]; 3795 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3796 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3797 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3798 } 3799 3800 if (n == sz-1) { 3801 tmp0 = t[*idx]; 3802 sum1 -= v1[0] * tmp0; 3803 sum2 -= v2[0] * tmp0; 3804 sum3 -= v3[0] * tmp0; 3805 } 3806 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3807 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3808 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3809 ibdiag += 9; row += 3; 3810 break; 3811 case 4: 3812 v2 = a->a + ii[row+1]; 3813 v3 = a->a + ii[row+2]; 3814 v4 = a->a + ii[row+3]; 3815 sum1 = t[row]; 3816 sum2 = t[row+1]; 3817 sum3 = t[row+2]; 3818 sum4 = t[row+3]; 3819 for (n = 0; n<sz-1; n+=2) { 3820 i1 = idx[0]; 3821 i2 = idx[1]; 3822 idx += 2; 3823 tmp0 = t[i1]; 3824 tmp1 = t[i2]; 3825 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3826 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3827 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3828 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3829 } 3830 3831 if (n == sz-1) { 3832 tmp0 = t[*idx]; 3833 sum1 -= v1[0] * tmp0; 3834 sum2 -= v2[0] * tmp0; 3835 sum3 -= v3[0] * tmp0; 3836 sum4 -= v4[0] * tmp0; 3837 } 3838 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3839 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3840 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3841 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3842 ibdiag += 16; row += 4; 3843 break; 3844 case 5: 3845 v2 = a->a + ii[row+1]; 3846 v3 = a->a + ii[row+2]; 3847 v4 = a->a + ii[row+3]; 3848 v5 = a->a + ii[row+4]; 3849 sum1 = t[row]; 3850 sum2 = t[row+1]; 3851 sum3 = t[row+2]; 3852 sum4 = t[row+3]; 3853 sum5 = t[row+4]; 3854 for (n = 0; n<sz-1; n+=2) { 3855 i1 = idx[0]; 3856 i2 = idx[1]; 3857 idx += 2; 3858 tmp0 = t[i1]; 3859 tmp1 = t[i2]; 3860 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3861 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3862 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3863 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3864 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3865 } 3866 3867 if (n == sz-1) { 3868 tmp0 = t[*idx]; 3869 sum1 -= v1[0] * tmp0; 3870 sum2 -= v2[0] * tmp0; 3871 sum3 -= v3[0] * tmp0; 3872 sum4 -= v4[0] * tmp0; 3873 sum5 -= v5[0] * tmp0; 3874 } 3875 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3876 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3877 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3878 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3879 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3880 ibdiag += 25; row += 5; 3881 break; 3882 default: 3883 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3884 } 3885 } 3886 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3887 } 3888 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3889 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3890 PetscFunctionReturn(0); 3891 } 3892 3893 #undef __FUNCT__ 3894 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode" 3895 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 3896 { 3897 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3898 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 3899 const MatScalar *bdiag = a->inode.bdiag; 3900 const PetscScalar *b; 3901 PetscErrorCode ierr; 3902 PetscInt m = a->inode.node_count,cnt = 0,i,row; 3903 const PetscInt *sizes = a->inode.size; 3904 3905 PetscFunctionBegin; 3906 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3907 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3908 cnt = 0; 3909 for (i=0, row=0; i<m; i++) { 3910 switch (sizes[i]) { 3911 case 1: 3912 x[row] = b[row]*bdiag[cnt++];row++; 3913 break; 3914 case 2: 3915 x1 = b[row]; x2 = b[row+1]; 3916 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3917 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3918 x[row++] = tmp1; 3919 x[row++] = tmp2; 3920 cnt += 4; 3921 break; 3922 case 3: 3923 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 3924 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3925 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3926 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3927 x[row++] = tmp1; 3928 x[row++] = tmp2; 3929 x[row++] = tmp3; 3930 cnt += 9; 3931 break; 3932 case 4: 3933 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 3934 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3935 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3936 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3937 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3938 x[row++] = tmp1; 3939 x[row++] = tmp2; 3940 x[row++] = tmp3; 3941 x[row++] = tmp4; 3942 cnt += 16; 3943 break; 3944 case 5: 3945 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 3946 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3947 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3948 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3949 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3950 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3951 x[row++] = tmp1; 3952 x[row++] = tmp2; 3953 x[row++] = tmp3; 3954 x[row++] = tmp4; 3955 x[row++] = tmp5; 3956 cnt += 25; 3957 break; 3958 default: 3959 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3960 } 3961 } 3962 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 3963 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3964 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3965 PetscFunctionReturn(0); 3966 } 3967 3968 /* 3969 samestructure indicates that the matrix has not changed its nonzero structure so we 3970 do not need to recompute the inodes 3971 */ 3972 #undef __FUNCT__ 3973 #define __FUNCT__ "Mat_CheckInode" 3974 PetscErrorCode Mat_CheckInode(Mat A,PetscBool samestructure) 3975 { 3976 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3977 PetscErrorCode ierr; 3978 PetscInt i,j,m,nzx,nzy,*ns,node_count,blk_size; 3979 PetscBool flag; 3980 const PetscInt *idx,*idy,*ii; 3981 3982 PetscFunctionBegin; 3983 if (!a->inode.use) PetscFunctionReturn(0); 3984 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 3985 3986 3987 m = A->rmap->n; 3988 if (a->inode.size) ns = a->inode.size; 3989 else { 3990 ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr); 3991 } 3992 3993 i = 0; 3994 node_count = 0; 3995 idx = a->j; 3996 ii = a->i; 3997 while (i < m) { /* For each row */ 3998 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 3999 /* Limits the number of elements in a node to 'a->inode.limit' */ 4000 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 4001 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 4002 if (nzy != nzx) break; 4003 idy += nzx; /* Same nonzero pattern */ 4004 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 4005 if (!flag) break; 4006 } 4007 ns[node_count++] = blk_size; 4008 idx += blk_size*nzx; 4009 i = j; 4010 } 4011 /* If not enough inodes found,, do not use inode version of the routines */ 4012 if (!m || node_count > .8*m) { 4013 ierr = PetscFree(ns);CHKERRQ(ierr); 4014 4015 a->inode.node_count = 0; 4016 a->inode.size = NULL; 4017 a->inode.use = PETSC_FALSE; 4018 A->ops->mult = MatMult_SeqAIJ; 4019 A->ops->sor = MatSOR_SeqAIJ; 4020 A->ops->multadd = MatMultAdd_SeqAIJ; 4021 A->ops->getrowij = MatGetRowIJ_SeqAIJ; 4022 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 4023 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 4024 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 4025 A->ops->coloringpatch = 0; 4026 A->ops->multdiagonalblock = 0; 4027 4028 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4029 } else { 4030 if (!A->factortype) { 4031 A->ops->mult = MatMult_SeqAIJ_Inode; 4032 A->ops->sor = MatSOR_SeqAIJ_Inode; 4033 A->ops->multadd = MatMultAdd_SeqAIJ_Inode; 4034 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4035 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4036 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4037 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4038 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4039 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4040 } else { 4041 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4042 } 4043 a->inode.node_count = node_count; 4044 a->inode.size = ns; 4045 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4046 } 4047 a->inode.checked = PETSC_TRUE; 4048 PetscFunctionReturn(0); 4049 } 4050 4051 #undef __FUNCT__ 4052 #define __FUNCT__ "MatDuplicate_SeqAIJ_Inode" 4053 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C) 4054 { 4055 Mat B =*C; 4056 Mat_SeqAIJ *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data; 4057 PetscErrorCode ierr; 4058 PetscInt m=A->rmap->n; 4059 4060 PetscFunctionBegin; 4061 c->inode.use = a->inode.use; 4062 c->inode.limit = a->inode.limit; 4063 c->inode.max_limit = a->inode.max_limit; 4064 if (a->inode.size) { 4065 ierr = PetscMalloc((m+1)*sizeof(PetscInt),&c->inode.size);CHKERRQ(ierr); 4066 c->inode.node_count = a->inode.node_count; 4067 ierr = PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));CHKERRQ(ierr); 4068 /* note the table of functions below should match that in Mat_CheckInode() */ 4069 if (!B->factortype) { 4070 B->ops->mult = MatMult_SeqAIJ_Inode; 4071 B->ops->sor = MatSOR_SeqAIJ_Inode; 4072 B->ops->multadd = MatMultAdd_SeqAIJ_Inode; 4073 B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4074 B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4075 B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4076 B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4077 B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4078 B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4079 } else { 4080 B->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4081 } 4082 } else { 4083 c->inode.size = 0; 4084 c->inode.node_count = 0; 4085 } 4086 c->inode.ibdiagvalid = PETSC_FALSE; 4087 c->inode.ibdiag = 0; 4088 c->inode.bdiag = 0; 4089 PetscFunctionReturn(0); 4090 } 4091 4092 #undef __FUNCT__ 4093 #define __FUNCT__ "MatGetRow_FactoredLU" 4094 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row) 4095 { 4096 PetscInt k; 4097 const PetscInt *vi; 4098 4099 PetscFunctionBegin; 4100 vi = aj + ai[row]; 4101 for (k=0; k<nzl; k++) cols[k] = vi[k]; 4102 vi = aj + adiag[row]; 4103 cols[nzl] = vi[0]; 4104 vi = aj + adiag[row+1]+1; 4105 for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k]; 4106 PetscFunctionReturn(0); 4107 } 4108 /* 4109 Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix. 4110 Modified from Mat_CheckInode(). 4111 4112 Input Parameters: 4113 . Mat A - ILU or LU matrix factor 4114 4115 */ 4116 #undef __FUNCT__ 4117 #define __FUNCT__ "Mat_CheckInode_FactorLU" 4118 PetscErrorCode Mat_CheckInode_FactorLU(Mat A) 4119 { 4120 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4121 PetscErrorCode ierr; 4122 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; 4123 PetscInt *cols1,*cols2,*ns; 4124 const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag; 4125 PetscBool flag; 4126 4127 PetscFunctionBegin; 4128 if (!a->inode.use) PetscFunctionReturn(0); 4129 if (a->inode.checked) PetscFunctionReturn(0); 4130 4131 m = A->rmap->n; 4132 if (a->inode.size) ns = a->inode.size; 4133 else { 4134 ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr); 4135 } 4136 4137 i = 0; 4138 node_count = 0; 4139 ierr = PetscMalloc2(m,PetscInt,&cols1,m,PetscInt,&cols2);CHKERRQ(ierr); 4140 while (i < m) { /* For each row */ 4141 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ 4142 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ 4143 nzx = nzl1 + nzu1 + 1; 4144 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); 4145 4146 /* Limits the number of elements in a node to 'a->inode.limit' */ 4147 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 4148 nzl2 = ai[j+1] - ai[j]; 4149 nzu2 = adiag[j] - adiag[j+1] - 1; 4150 nzy = nzl2 + nzu2 + 1; 4151 if (nzy != nzx) break; 4152 ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr); 4153 ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 4154 if (!flag) break; 4155 } 4156 ns[node_count++] = blk_size; 4157 i = j; 4158 } 4159 ierr = PetscFree2(cols1,cols2);CHKERRQ(ierr); 4160 /* If not enough inodes found,, do not use inode version of the routines */ 4161 if (!m || node_count > .8*m) { 4162 ierr = PetscFree(ns);CHKERRQ(ierr); 4163 4164 a->inode.node_count = 0; 4165 a->inode.size = NULL; 4166 a->inode.use = PETSC_FALSE; 4167 4168 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4169 } else { 4170 A->ops->mult = 0; 4171 A->ops->sor = 0; 4172 A->ops->multadd = 0; 4173 A->ops->getrowij = 0; 4174 A->ops->restorerowij = 0; 4175 A->ops->getcolumnij = 0; 4176 A->ops->restorecolumnij = 0; 4177 A->ops->coloringpatch = 0; 4178 A->ops->multdiagonalblock = 0; 4179 a->inode.node_count = node_count; 4180 a->inode.size = ns; 4181 4182 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4183 } 4184 a->inode.checked = PETSC_TRUE; 4185 PetscFunctionReturn(0); 4186 } 4187 4188 #undef __FUNCT__ 4189 #define __FUNCT__ "MatSeqAIJInvalidateDiagonal_Inode" 4190 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A) 4191 { 4192 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4193 4194 PetscFunctionBegin; 4195 a->inode.ibdiagvalid = PETSC_FALSE; 4196 PetscFunctionReturn(0); 4197 } 4198 4199 /* 4200 This is really ugly. if inodes are used this replaces the 4201 permutations with ones that correspond to rows/cols of the matrix 4202 rather then inode blocks 4203 */ 4204 #undef __FUNCT__ 4205 #define __FUNCT__ "MatInodeAdjustForInodes" 4206 PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 4207 { 4208 PetscErrorCode ierr; 4209 4210 PetscFunctionBegin; 4211 ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr); 4212 PetscFunctionReturn(0); 4213 } 4214 4215 #undef __FUNCT__ 4216 #define __FUNCT__ "MatInodeAdjustForInodes_SeqAIJ_Inode" 4217 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 4218 { 4219 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4220 PetscErrorCode ierr; 4221 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 4222 const PetscInt *ridx,*cidx; 4223 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 4224 PetscInt nslim_col,*ns_col; 4225 IS ris = *rperm,cis = *cperm; 4226 4227 PetscFunctionBegin; 4228 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 4229 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 4230 4231 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 4232 ierr = PetscMalloc((((nslim_row>nslim_col) ? nslim_row : nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 4233 ierr = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr); 4234 4235 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 4236 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 4237 4238 /* Form the inode structure for the rows of permuted matric using inv perm*/ 4239 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 4240 4241 /* Construct the permutations for rows*/ 4242 for (i=0,row = 0; i<nslim_row; ++i) { 4243 indx = ridx[i]; 4244 start_val = tns[indx]; 4245 end_val = tns[indx + 1]; 4246 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 4247 } 4248 4249 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 4250 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 4251 4252 /* Construct permutations for columns */ 4253 for (i=0,col=0; i<nslim_col; ++i) { 4254 indx = cidx[i]; 4255 start_val = tns[indx]; 4256 end_val = tns[indx + 1]; 4257 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 4258 } 4259 4260 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr); 4261 ierr = ISSetPermutation(*rperm);CHKERRQ(ierr); 4262 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr); 4263 ierr = ISSetPermutation(*cperm);CHKERRQ(ierr); 4264 4265 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 4266 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 4267 4268 ierr = PetscFree(ns_col);CHKERRQ(ierr); 4269 ierr = PetscFree2(permr,permc);CHKERRQ(ierr); 4270 ierr = ISDestroy(&cis);CHKERRQ(ierr); 4271 ierr = ISDestroy(&ris);CHKERRQ(ierr); 4272 ierr = PetscFree(tns);CHKERRQ(ierr); 4273 PetscFunctionReturn(0); 4274 } 4275 4276 #undef __FUNCT__ 4277 #define __FUNCT__ "MatInodeGetInodeSizes" 4278 /*@C 4279 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 4280 4281 Not Collective 4282 4283 Input Parameter: 4284 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 4285 4286 Output Parameter: 4287 + node_count - no of inodes present in the matrix. 4288 . sizes - an array of size node_count,with sizes of each inode. 4289 - limit - the max size used to generate the inodes. 4290 4291 Level: advanced 4292 4293 Notes: This routine returns some internal storage information 4294 of the matrix, it is intended to be used by advanced users. 4295 It should be called after the matrix is assembled. 4296 The contents of the sizes[] array should not be changed. 4297 NULL may be passed for information not requested. 4298 4299 .keywords: matrix, seqaij, get, inode 4300 4301 .seealso: MatGetInfo() 4302 @*/ 4303 PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4304 { 4305 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 4306 4307 PetscFunctionBegin; 4308 if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 4309 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);CHKERRQ(ierr); 4310 if (f) { 4311 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 4312 } 4313 PetscFunctionReturn(0); 4314 } 4315 4316 #undef __FUNCT__ 4317 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode" 4318 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4319 { 4320 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4321 4322 PetscFunctionBegin; 4323 if (node_count) *node_count = a->inode.node_count; 4324 if (sizes) *sizes = a->inode.size; 4325 if (limit) *limit = a->inode.limit; 4326 PetscFunctionReturn(0); 4327 } 4328