1 #define PETSCMAT_DLL 2 3 /* 4 This file provides high performance routines for the Inode format (compressed sparse row) 5 by taking advantage of rows with identical nonzero structure (I-nodes). 6 */ 7 #include "../src/mat/impls/aij/seq/aij.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "Mat_CreateColInode" 11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns) 12 { 13 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 16 17 PetscFunctionBegin; 18 n = A->cmap->n; 19 m = A->rmap->n; 20 ns_row = a->inode.size; 21 22 min_mn = (m < n) ? m : n; 23 if (!ns) { 24 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++); 25 for(; count+1 < n; count++,i++); 26 if (count < n) { 27 i++; 28 } 29 *size = i; 30 PetscFunctionReturn(0); 31 } 32 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr); 33 34 /* Use the same row structure wherever feasible. */ 35 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 36 ns_col[i] = ns_row[i]; 37 } 38 39 /* if m < n; pad up the remainder with inode_limit */ 40 for(; count+1 < n; count++,i++) { 41 ns_col[i] = 1; 42 } 43 /* The last node is the odd ball. padd it up with the remaining rows; */ 44 if (count < n) { 45 ns_col[i] = n - count; 46 i++; 47 } else if (count > n) { 48 /* Adjust for the over estimation */ 49 ns_col[i-1] += n - count; 50 } 51 *size = i; 52 *ns = ns_col; 53 PetscFunctionReturn(0); 54 } 55 56 57 /* 58 This builds symmetric version of nonzero structure, 59 */ 60 #undef __FUNCT__ 61 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric" 62 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 63 { 64 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 65 PetscErrorCode ierr; 66 PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n; 67 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j; 68 69 PetscFunctionBegin; 70 nslim_row = a->inode.node_count; 71 m = A->rmap->n; 72 n = A->cmap->n; 73 if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 74 75 /* Use the row_inode as column_inode */ 76 nslim_col = nslim_row; 77 ns_col = ns_row; 78 79 /* allocate space for reformated inode structure */ 80 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 81 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 82 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 83 84 for (i1=0,col=0; i1<nslim_col; ++i1){ 85 nsz = ns_col[i1]; 86 for (i2=0; i2<nsz; ++i2,++col) 87 tvc[col] = i1; 88 } 89 /* allocate space for row pointers */ 90 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 91 *iia = ia; 92 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 93 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 94 95 /* determine the number of columns in each row */ 96 ia[0] = oshift; 97 for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) { 98 99 j = aj + ai[row] + ishift; 100 jmax = aj + ai[row+1] + ishift; 101 i2 = 0; 102 col = *j++ + ishift; 103 i2 = tvc[col]; 104 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 105 ia[i1+1]++; 106 ia[i2+1]++; 107 i2++; /* Start col of next node */ 108 while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j; 109 i2 = tvc[col]; 110 } 111 if(i2 == i1) ia[i2+1]++; /* now the diagonal element */ 112 } 113 114 /* shift ia[i] to point to next row */ 115 for (i1=1; i1<nslim_row+1; i1++) { 116 row = ia[i1-1]; 117 ia[i1] += row; 118 work[i1-1] = row - oshift; 119 } 120 121 /* allocate space for column pointers */ 122 nz = ia[nslim_row] + (!ishift); 123 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 124 *jja = ja; 125 126 /* loop over lower triangular part putting into ja */ 127 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 128 j = aj + ai[row] + ishift; 129 jmax = aj + ai[row+1] + ishift; 130 i2 = 0; /* Col inode index */ 131 col = *j++ + ishift; 132 i2 = tvc[col]; 133 while (i2<i1 && j<jmax) { 134 ja[work[i2]++] = i1 + oshift; 135 ja[work[i1]++] = i2 + oshift; 136 ++i2; 137 while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */ 138 i2 = tvc[col]; 139 } 140 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 141 142 } 143 ierr = PetscFree(work);CHKERRQ(ierr); 144 ierr = PetscFree(tns);CHKERRQ(ierr); 145 ierr = PetscFree(tvc);CHKERRQ(ierr); 146 PetscFunctionReturn(0); 147 } 148 149 /* 150 This builds nonsymmetric version of nonzero structure, 151 */ 152 #undef __FUNCT__ 153 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric" 154 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 155 { 156 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 157 PetscErrorCode ierr; 158 PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col; 159 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 160 161 PetscFunctionBegin; 162 nslim_row = a->inode.node_count; 163 n = A->cmap->n; 164 165 /* Create The column_inode for this matrix */ 166 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 167 168 /* allocate space for reformated column_inode structure */ 169 ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 170 ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 171 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 172 173 for (i1=0,col=0; i1<nslim_col; ++i1){ 174 nsz = ns_col[i1]; 175 for (i2=0; i2<nsz; ++i2,++col) 176 tvc[col] = i1; 177 } 178 /* allocate space for row pointers */ 179 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 180 *iia = ia; 181 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 182 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 183 184 /* determine the number of columns in each row */ 185 ia[0] = oshift; 186 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 187 j = aj + ai[row] + ishift; 188 col = *j++ + ishift; 189 i2 = tvc[col]; 190 nz = ai[row+1] - ai[row]; 191 while (nz-- > 0) { /* off-diagonal elemets */ 192 ia[i1+1]++; 193 i2++; /* Start col of next node */ 194 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 195 if (nz > 0) i2 = tvc[col]; 196 } 197 } 198 199 /* shift ia[i] to point to next row */ 200 for (i1=1; i1<nslim_row+1; i1++) { 201 row = ia[i1-1]; 202 ia[i1] += row; 203 work[i1-1] = row - oshift; 204 } 205 206 /* allocate space for column pointers */ 207 nz = ia[nslim_row] + (!ishift); 208 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 209 *jja = ja; 210 211 /* loop over matrix putting into ja */ 212 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 213 j = aj + ai[row] + ishift; 214 i2 = 0; /* Col inode index */ 215 col = *j++ + ishift; 216 i2 = tvc[col]; 217 nz = ai[row+1] - ai[row]; 218 while (nz-- > 0) { 219 ja[work[i1]++] = i2 + oshift; 220 ++i2; 221 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 222 if (nz > 0) i2 = tvc[col]; 223 } 224 } 225 ierr = PetscFree(ns_col);CHKERRQ(ierr); 226 ierr = PetscFree(work);CHKERRQ(ierr); 227 ierr = PetscFree(tns);CHKERRQ(ierr); 228 ierr = PetscFree(tvc);CHKERRQ(ierr); 229 PetscFunctionReturn(0); 230 } 231 232 #undef __FUNCT__ 233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode" 234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 235 { 236 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 237 PetscErrorCode ierr; 238 239 PetscFunctionBegin; 240 *n = a->inode.node_count; 241 if (!ia) PetscFunctionReturn(0); 242 if (!blockcompressed) { 243 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 244 } else if (symmetric) { 245 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 246 } else { 247 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 248 } 249 PetscFunctionReturn(0); 250 } 251 252 #undef __FUNCT__ 253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode" 254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 255 { 256 PetscErrorCode ierr; 257 258 PetscFunctionBegin; 259 if (!ia) PetscFunctionReturn(0); 260 261 if (!blockcompressed) { 262 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 263 } else { 264 ierr = PetscFree(*ia);CHKERRQ(ierr); 265 ierr = PetscFree(*ja);CHKERRQ(ierr); 266 } 267 268 PetscFunctionReturn(0); 269 } 270 271 /* ----------------------------------------------------------- */ 272 273 #undef __FUNCT__ 274 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric" 275 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 276 { 277 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 278 PetscErrorCode ierr; 279 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 280 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 281 282 PetscFunctionBegin; 283 nslim_row = a->inode.node_count; 284 n = A->cmap->n; 285 286 /* Create The column_inode for this matrix */ 287 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 288 289 /* allocate space for reformated column_inode structure */ 290 ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 291 ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 292 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 293 294 for (i1=0,col=0; i1<nslim_col; ++i1){ 295 nsz = ns_col[i1]; 296 for (i2=0; i2<nsz; ++i2,++col) 297 tvc[col] = i1; 298 } 299 /* allocate space for column pointers */ 300 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 301 *iia = ia; 302 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 303 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 304 305 /* determine the number of columns in each row */ 306 ia[0] = oshift; 307 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 308 j = aj + ai[row] + ishift; 309 col = *j++ + ishift; 310 i2 = tvc[col]; 311 nz = ai[row+1] - ai[row]; 312 while (nz-- > 0) { /* off-diagonal elemets */ 313 /* ia[i1+1]++; */ 314 ia[i2+1]++; 315 i2++; 316 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 317 if (nz > 0) i2 = tvc[col]; 318 } 319 } 320 321 /* shift ia[i] to point to next col */ 322 for (i1=1; i1<nslim_col+1; i1++) { 323 col = ia[i1-1]; 324 ia[i1] += col; 325 work[i1-1] = col - oshift; 326 } 327 328 /* allocate space for column pointers */ 329 nz = ia[nslim_col] + (!ishift); 330 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 331 *jja = ja; 332 333 /* loop over matrix putting into ja */ 334 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 335 j = aj + ai[row] + ishift; 336 i2 = 0; /* Col inode index */ 337 col = *j++ + ishift; 338 i2 = tvc[col]; 339 nz = ai[row+1] - ai[row]; 340 while (nz-- > 0) { 341 /* ja[work[i1]++] = i2 + oshift; */ 342 ja[work[i2]++] = i1 + oshift; 343 i2++; 344 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 345 if (nz > 0) i2 = tvc[col]; 346 } 347 } 348 ierr = PetscFree(ns_col);CHKERRQ(ierr); 349 ierr = PetscFree(work);CHKERRQ(ierr); 350 ierr = PetscFree(tns);CHKERRQ(ierr); 351 ierr = PetscFree(tvc);CHKERRQ(ierr); 352 PetscFunctionReturn(0); 353 } 354 355 #undef __FUNCT__ 356 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode" 357 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 358 { 359 PetscErrorCode ierr; 360 361 PetscFunctionBegin; 362 ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr); 363 if (!ia) PetscFunctionReturn(0); 364 365 if (!blockcompressed) { 366 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 367 } else if (symmetric) { 368 /* Since the indices are symmetric it does'nt matter */ 369 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 370 } else { 371 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 372 } 373 PetscFunctionReturn(0); 374 } 375 376 #undef __FUNCT__ 377 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode" 378 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 379 { 380 PetscErrorCode ierr; 381 382 PetscFunctionBegin; 383 if (!ia) PetscFunctionReturn(0); 384 if (!blockcompressed) { 385 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 386 } else { 387 ierr = PetscFree(*ia);CHKERRQ(ierr); 388 ierr = PetscFree(*ja);CHKERRQ(ierr); 389 } 390 PetscFunctionReturn(0); 391 } 392 393 /* ----------------------------------------------------------- */ 394 395 #undef __FUNCT__ 396 #define __FUNCT__ "MatMult_SeqAIJ_Inode" 397 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 398 { 399 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 400 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 401 PetscScalar *y; 402 const PetscScalar *x; 403 const MatScalar *v1,*v2,*v3,*v4,*v5; 404 PetscErrorCode ierr; 405 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0; 406 407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 409 #endif 410 411 PetscFunctionBegin; 412 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 413 node_max = a->inode.node_count; 414 ns = a->inode.size; /* Node Size array */ 415 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 416 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 417 idx = a->j; 418 v1 = a->a; 419 ii = a->i; 420 421 for (i = 0,row = 0; i< node_max; ++i){ 422 nsz = ns[i]; 423 n = ii[1] - ii[0]; 424 nonzerorow += (n>0)*nsz; 425 ii += nsz; 426 PetscPrefetchBlock(idx+nsz*n,n,0,0); /* Prefetch the indices for the block row after the current one */ 427 PetscPrefetchBlock(v1+nsz*n,nsz*n,0,0); /* Prefetch the values for the block row after the current one */ 428 sz = n; /* No of non zeros in this row */ 429 /* Switch on the size of Node */ 430 switch (nsz){ /* Each loop in 'case' is unrolled */ 431 case 1 : 432 sum1 = 0.; 433 434 for(n = 0; n< sz-1; n+=2) { 435 i1 = idx[0]; /* The instructions are ordered to */ 436 i2 = idx[1]; /* make the compiler's job easy */ 437 idx += 2; 438 tmp0 = x[i1]; 439 tmp1 = x[i2]; 440 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 441 } 442 443 if (n == sz-1){ /* Take care of the last nonzero */ 444 tmp0 = x[*idx++]; 445 sum1 += *v1++ * tmp0; 446 } 447 y[row++]=sum1; 448 break; 449 case 2: 450 sum1 = 0.; 451 sum2 = 0.; 452 v2 = v1 + n; 453 454 for (n = 0; n< sz-1; n+=2) { 455 i1 = idx[0]; 456 i2 = idx[1]; 457 idx += 2; 458 tmp0 = x[i1]; 459 tmp1 = x[i2]; 460 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 461 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 462 } 463 if (n == sz-1){ 464 tmp0 = x[*idx++]; 465 sum1 += *v1++ * tmp0; 466 sum2 += *v2++ * tmp0; 467 } 468 y[row++]=sum1; 469 y[row++]=sum2; 470 v1 =v2; /* Since the next block to be processed starts there*/ 471 idx +=sz; 472 break; 473 case 3: 474 sum1 = 0.; 475 sum2 = 0.; 476 sum3 = 0.; 477 v2 = v1 + n; 478 v3 = v2 + n; 479 480 for (n = 0; n< sz-1; n+=2) { 481 i1 = idx[0]; 482 i2 = idx[1]; 483 idx += 2; 484 tmp0 = x[i1]; 485 tmp1 = x[i2]; 486 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 487 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 488 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 489 } 490 if (n == sz-1){ 491 tmp0 = x[*idx++]; 492 sum1 += *v1++ * tmp0; 493 sum2 += *v2++ * tmp0; 494 sum3 += *v3++ * tmp0; 495 } 496 y[row++]=sum1; 497 y[row++]=sum2; 498 y[row++]=sum3; 499 v1 =v3; /* Since the next block to be processed starts there*/ 500 idx +=2*sz; 501 break; 502 case 4: 503 sum1 = 0.; 504 sum2 = 0.; 505 sum3 = 0.; 506 sum4 = 0.; 507 v2 = v1 + n; 508 v3 = v2 + n; 509 v4 = v3 + n; 510 511 for (n = 0; n< sz-1; n+=2) { 512 i1 = idx[0]; 513 i2 = idx[1]; 514 idx += 2; 515 tmp0 = x[i1]; 516 tmp1 = x[i2]; 517 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 518 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 519 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 520 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 521 } 522 if (n == sz-1){ 523 tmp0 = x[*idx++]; 524 sum1 += *v1++ * tmp0; 525 sum2 += *v2++ * tmp0; 526 sum3 += *v3++ * tmp0; 527 sum4 += *v4++ * tmp0; 528 } 529 y[row++]=sum1; 530 y[row++]=sum2; 531 y[row++]=sum3; 532 y[row++]=sum4; 533 v1 =v4; /* Since the next block to be processed starts there*/ 534 idx +=3*sz; 535 break; 536 case 5: 537 sum1 = 0.; 538 sum2 = 0.; 539 sum3 = 0.; 540 sum4 = 0.; 541 sum5 = 0.; 542 v2 = v1 + n; 543 v3 = v2 + n; 544 v4 = v3 + n; 545 v5 = v4 + n; 546 547 for (n = 0; n<sz-1; n+=2) { 548 i1 = idx[0]; 549 i2 = idx[1]; 550 idx += 2; 551 tmp0 = x[i1]; 552 tmp1 = x[i2]; 553 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 554 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 555 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 556 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 557 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 558 } 559 if (n == sz-1){ 560 tmp0 = x[*idx++]; 561 sum1 += *v1++ * tmp0; 562 sum2 += *v2++ * tmp0; 563 sum3 += *v3++ * tmp0; 564 sum4 += *v4++ * tmp0; 565 sum5 += *v5++ * tmp0; 566 } 567 y[row++]=sum1; 568 y[row++]=sum2; 569 y[row++]=sum3; 570 y[row++]=sum4; 571 y[row++]=sum5; 572 v1 =v5; /* Since the next block to be processed starts there */ 573 idx +=4*sz; 574 break; 575 default : 576 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 577 } 578 } 579 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 580 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 581 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 582 PetscFunctionReturn(0); 583 } 584 /* ----------------------------------------------------------- */ 585 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 586 #undef __FUNCT__ 587 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode" 588 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 589 { 590 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 591 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 592 MatScalar *v1,*v2,*v3,*v4,*v5; 593 PetscScalar *x,*y,*z,*zt; 594 PetscErrorCode ierr; 595 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; 596 597 PetscFunctionBegin; 598 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 599 node_max = a->inode.node_count; 600 ns = a->inode.size; /* Node Size array */ 601 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 602 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 603 if (zz != yy) { 604 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 605 } else { 606 z = y; 607 } 608 zt = z; 609 610 idx = a->j; 611 v1 = a->a; 612 ii = a->i; 613 614 for (i = 0,row = 0; i< node_max; ++i){ 615 nsz = ns[i]; 616 n = ii[1] - ii[0]; 617 ii += nsz; 618 sz = n; /* No of non zeros in this row */ 619 /* Switch on the size of Node */ 620 switch (nsz){ /* Each loop in 'case' is unrolled */ 621 case 1 : 622 sum1 = *zt++; 623 624 for(n = 0; n< sz-1; n+=2) { 625 i1 = idx[0]; /* The instructions are ordered to */ 626 i2 = idx[1]; /* make the compiler's job easy */ 627 idx += 2; 628 tmp0 = x[i1]; 629 tmp1 = x[i2]; 630 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 631 } 632 633 if(n == sz-1){ /* Take care of the last nonzero */ 634 tmp0 = x[*idx++]; 635 sum1 += *v1++ * tmp0; 636 } 637 y[row++]=sum1; 638 break; 639 case 2: 640 sum1 = *zt++; 641 sum2 = *zt++; 642 v2 = v1 + n; 643 644 for(n = 0; n< sz-1; n+=2) { 645 i1 = idx[0]; 646 i2 = idx[1]; 647 idx += 2; 648 tmp0 = x[i1]; 649 tmp1 = x[i2]; 650 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 651 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 652 } 653 if(n == sz-1){ 654 tmp0 = x[*idx++]; 655 sum1 += *v1++ * tmp0; 656 sum2 += *v2++ * tmp0; 657 } 658 y[row++]=sum1; 659 y[row++]=sum2; 660 v1 =v2; /* Since the next block to be processed starts there*/ 661 idx +=sz; 662 break; 663 case 3: 664 sum1 = *zt++; 665 sum2 = *zt++; 666 sum3 = *zt++; 667 v2 = v1 + n; 668 v3 = v2 + n; 669 670 for (n = 0; n< sz-1; n+=2) { 671 i1 = idx[0]; 672 i2 = idx[1]; 673 idx += 2; 674 tmp0 = x[i1]; 675 tmp1 = x[i2]; 676 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 677 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 678 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 679 } 680 if (n == sz-1){ 681 tmp0 = x[*idx++]; 682 sum1 += *v1++ * tmp0; 683 sum2 += *v2++ * tmp0; 684 sum3 += *v3++ * tmp0; 685 } 686 y[row++]=sum1; 687 y[row++]=sum2; 688 y[row++]=sum3; 689 v1 =v3; /* Since the next block to be processed starts there*/ 690 idx +=2*sz; 691 break; 692 case 4: 693 sum1 = *zt++; 694 sum2 = *zt++; 695 sum3 = *zt++; 696 sum4 = *zt++; 697 v2 = v1 + n; 698 v3 = v2 + n; 699 v4 = v3 + n; 700 701 for (n = 0; n< sz-1; n+=2) { 702 i1 = idx[0]; 703 i2 = idx[1]; 704 idx += 2; 705 tmp0 = x[i1]; 706 tmp1 = x[i2]; 707 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 708 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 709 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 710 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 711 } 712 if (n == sz-1){ 713 tmp0 = x[*idx++]; 714 sum1 += *v1++ * tmp0; 715 sum2 += *v2++ * tmp0; 716 sum3 += *v3++ * tmp0; 717 sum4 += *v4++ * tmp0; 718 } 719 y[row++]=sum1; 720 y[row++]=sum2; 721 y[row++]=sum3; 722 y[row++]=sum4; 723 v1 =v4; /* Since the next block to be processed starts there*/ 724 idx +=3*sz; 725 break; 726 case 5: 727 sum1 = *zt++; 728 sum2 = *zt++; 729 sum3 = *zt++; 730 sum4 = *zt++; 731 sum5 = *zt++; 732 v2 = v1 + n; 733 v3 = v2 + n; 734 v4 = v3 + n; 735 v5 = v4 + n; 736 737 for (n = 0; n<sz-1; n+=2) { 738 i1 = idx[0]; 739 i2 = idx[1]; 740 idx += 2; 741 tmp0 = x[i1]; 742 tmp1 = x[i2]; 743 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 744 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 745 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 746 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 747 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 748 } 749 if(n == sz-1){ 750 tmp0 = x[*idx++]; 751 sum1 += *v1++ * tmp0; 752 sum2 += *v2++ * tmp0; 753 sum3 += *v3++ * tmp0; 754 sum4 += *v4++ * tmp0; 755 sum5 += *v5++ * tmp0; 756 } 757 y[row++]=sum1; 758 y[row++]=sum2; 759 y[row++]=sum3; 760 y[row++]=sum4; 761 y[row++]=sum5; 762 v1 =v5; /* Since the next block to be processed starts there */ 763 idx +=4*sz; 764 break; 765 default : 766 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 767 } 768 } 769 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 770 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 771 if (zz != yy) { 772 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 773 } 774 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 775 PetscFunctionReturn(0); 776 } 777 778 /* ----------------------------------------------------------- */ 779 #undef __FUNCT__ 780 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace" 781 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx) 782 { 783 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 784 IS iscol = a->col,isrow = a->row; 785 PetscErrorCode ierr; 786 const PetscInt *r,*c,*rout,*cout; 787 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 788 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 789 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 790 PetscScalar sum1,sum2,sum3,sum4,sum5; 791 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 792 const PetscScalar *b; 793 794 PetscFunctionBegin; 795 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 796 node_max = a->inode.node_count; 797 ns = a->inode.size; /* Node Size array */ 798 799 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 800 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 801 tmp = a->solve_work; 802 803 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 804 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 805 806 /* forward solve the lower triangular */ 807 tmps = tmp ; 808 aa = a_a ; 809 aj = a_j ; 810 ad = a->diag; 811 812 for (i = 0,row = 0; i< node_max; ++i){ 813 nsz = ns[i]; 814 aii = ai[row]; 815 v1 = aa + aii; 816 vi = aj + aii; 817 nz = ad[row]- aii; 818 if (i < node_max-1) { 819 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 820 * but our indexing to determine it's size could. */ 821 PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,0); /* indices */ 822 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 823 PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,0); 824 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 825 } 826 827 switch (nsz){ /* Each loop in 'case' is unrolled */ 828 case 1 : 829 sum1 = b[*r++]; 830 for(j=0; j<nz-1; j+=2){ 831 i0 = vi[0]; 832 i1 = vi[1]; 833 vi +=2; 834 tmp0 = tmps[i0]; 835 tmp1 = tmps[i1]; 836 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 837 } 838 if(j == nz-1){ 839 tmp0 = tmps[*vi++]; 840 sum1 -= *v1++ *tmp0; 841 } 842 tmp[row ++]=sum1; 843 break; 844 case 2: 845 sum1 = b[*r++]; 846 sum2 = b[*r++]; 847 v2 = aa + ai[row+1]; 848 849 for(j=0; j<nz-1; j+=2){ 850 i0 = vi[0]; 851 i1 = vi[1]; 852 vi +=2; 853 tmp0 = tmps[i0]; 854 tmp1 = tmps[i1]; 855 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 856 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 857 } 858 if(j == nz-1){ 859 tmp0 = tmps[*vi++]; 860 sum1 -= *v1++ *tmp0; 861 sum2 -= *v2++ *tmp0; 862 } 863 sum2 -= *v2++ * sum1; 864 tmp[row ++]=sum1; 865 tmp[row ++]=sum2; 866 break; 867 case 3: 868 sum1 = b[*r++]; 869 sum2 = b[*r++]; 870 sum3 = b[*r++]; 871 v2 = aa + ai[row+1]; 872 v3 = aa + ai[row+2]; 873 874 for (j=0; j<nz-1; j+=2){ 875 i0 = vi[0]; 876 i1 = vi[1]; 877 vi +=2; 878 tmp0 = tmps[i0]; 879 tmp1 = tmps[i1]; 880 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 881 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 882 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 883 } 884 if (j == nz-1){ 885 tmp0 = tmps[*vi++]; 886 sum1 -= *v1++ *tmp0; 887 sum2 -= *v2++ *tmp0; 888 sum3 -= *v3++ *tmp0; 889 } 890 sum2 -= *v2++ * sum1; 891 sum3 -= *v3++ * sum1; 892 sum3 -= *v3++ * sum2; 893 tmp[row ++]=sum1; 894 tmp[row ++]=sum2; 895 tmp[row ++]=sum3; 896 break; 897 898 case 4: 899 sum1 = b[*r++]; 900 sum2 = b[*r++]; 901 sum3 = b[*r++]; 902 sum4 = b[*r++]; 903 v2 = aa + ai[row+1]; 904 v3 = aa + ai[row+2]; 905 v4 = aa + ai[row+3]; 906 907 for (j=0; j<nz-1; j+=2){ 908 i0 = vi[0]; 909 i1 = vi[1]; 910 vi +=2; 911 tmp0 = tmps[i0]; 912 tmp1 = tmps[i1]; 913 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 914 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 915 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 916 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 917 } 918 if (j == nz-1){ 919 tmp0 = tmps[*vi++]; 920 sum1 -= *v1++ *tmp0; 921 sum2 -= *v2++ *tmp0; 922 sum3 -= *v3++ *tmp0; 923 sum4 -= *v4++ *tmp0; 924 } 925 sum2 -= *v2++ * sum1; 926 sum3 -= *v3++ * sum1; 927 sum4 -= *v4++ * sum1; 928 sum3 -= *v3++ * sum2; 929 sum4 -= *v4++ * sum2; 930 sum4 -= *v4++ * sum3; 931 932 tmp[row ++]=sum1; 933 tmp[row ++]=sum2; 934 tmp[row ++]=sum3; 935 tmp[row ++]=sum4; 936 break; 937 case 5: 938 sum1 = b[*r++]; 939 sum2 = b[*r++]; 940 sum3 = b[*r++]; 941 sum4 = b[*r++]; 942 sum5 = b[*r++]; 943 v2 = aa + ai[row+1]; 944 v3 = aa + ai[row+2]; 945 v4 = aa + ai[row+3]; 946 v5 = aa + ai[row+4]; 947 948 for (j=0; j<nz-1; j+=2){ 949 i0 = vi[0]; 950 i1 = vi[1]; 951 vi +=2; 952 tmp0 = tmps[i0]; 953 tmp1 = tmps[i1]; 954 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 955 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 956 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 957 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 958 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 959 } 960 if (j == nz-1){ 961 tmp0 = tmps[*vi++]; 962 sum1 -= *v1++ *tmp0; 963 sum2 -= *v2++ *tmp0; 964 sum3 -= *v3++ *tmp0; 965 sum4 -= *v4++ *tmp0; 966 sum5 -= *v5++ *tmp0; 967 } 968 969 sum2 -= *v2++ * sum1; 970 sum3 -= *v3++ * sum1; 971 sum4 -= *v4++ * sum1; 972 sum5 -= *v5++ * sum1; 973 sum3 -= *v3++ * sum2; 974 sum4 -= *v4++ * sum2; 975 sum5 -= *v5++ * sum2; 976 sum4 -= *v4++ * sum3; 977 sum5 -= *v5++ * sum3; 978 sum5 -= *v5++ * sum4; 979 980 tmp[row ++]=sum1; 981 tmp[row ++]=sum2; 982 tmp[row ++]=sum3; 983 tmp[row ++]=sum4; 984 tmp[row ++]=sum5; 985 break; 986 default: 987 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 988 } 989 } 990 /* backward solve the upper triangular */ 991 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 992 nsz = ns[i]; 993 aii = ai[row+1] -1; 994 v1 = aa + aii; 995 vi = aj + aii; 996 nz = aii- ad[row]; 997 switch (nsz){ /* Each loop in 'case' is unrolled */ 998 case 1 : 999 sum1 = tmp[row]; 1000 1001 for(j=nz ; j>1; j-=2){ 1002 vi -=2; 1003 i0 = vi[2]; 1004 i1 = vi[1]; 1005 tmp0 = tmps[i0]; 1006 tmp1 = tmps[i1]; 1007 v1 -= 2; 1008 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1009 } 1010 if (j==1){ 1011 tmp0 = tmps[*vi--]; 1012 sum1 -= *v1-- * tmp0; 1013 } 1014 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1015 break; 1016 case 2 : 1017 sum1 = tmp[row]; 1018 sum2 = tmp[row -1]; 1019 v2 = aa + ai[row]-1; 1020 for (j=nz ; j>1; j-=2){ 1021 vi -=2; 1022 i0 = vi[2]; 1023 i1 = vi[1]; 1024 tmp0 = tmps[i0]; 1025 tmp1 = tmps[i1]; 1026 v1 -= 2; 1027 v2 -= 2; 1028 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1029 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1030 } 1031 if (j==1){ 1032 tmp0 = tmps[*vi--]; 1033 sum1 -= *v1-- * tmp0; 1034 sum2 -= *v2-- * tmp0; 1035 } 1036 1037 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1038 sum2 -= *v2-- * tmp0; 1039 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1040 break; 1041 case 3 : 1042 sum1 = tmp[row]; 1043 sum2 = tmp[row -1]; 1044 sum3 = tmp[row -2]; 1045 v2 = aa + ai[row]-1; 1046 v3 = aa + ai[row -1]-1; 1047 for (j=nz ; j>1; j-=2){ 1048 vi -=2; 1049 i0 = vi[2]; 1050 i1 = vi[1]; 1051 tmp0 = tmps[i0]; 1052 tmp1 = tmps[i1]; 1053 v1 -= 2; 1054 v2 -= 2; 1055 v3 -= 2; 1056 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1057 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1058 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1059 } 1060 if (j==1){ 1061 tmp0 = tmps[*vi--]; 1062 sum1 -= *v1-- * tmp0; 1063 sum2 -= *v2-- * tmp0; 1064 sum3 -= *v3-- * tmp0; 1065 } 1066 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1067 sum2 -= *v2-- * tmp0; 1068 sum3 -= *v3-- * tmp0; 1069 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1070 sum3 -= *v3-- * tmp0; 1071 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1072 1073 break; 1074 case 4 : 1075 sum1 = tmp[row]; 1076 sum2 = tmp[row -1]; 1077 sum3 = tmp[row -2]; 1078 sum4 = tmp[row -3]; 1079 v2 = aa + ai[row]-1; 1080 v3 = aa + ai[row -1]-1; 1081 v4 = aa + ai[row -2]-1; 1082 1083 for (j=nz ; j>1; j-=2){ 1084 vi -=2; 1085 i0 = vi[2]; 1086 i1 = vi[1]; 1087 tmp0 = tmps[i0]; 1088 tmp1 = tmps[i1]; 1089 v1 -= 2; 1090 v2 -= 2; 1091 v3 -= 2; 1092 v4 -= 2; 1093 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1094 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1095 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1096 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1097 } 1098 if (j==1){ 1099 tmp0 = tmps[*vi--]; 1100 sum1 -= *v1-- * tmp0; 1101 sum2 -= *v2-- * tmp0; 1102 sum3 -= *v3-- * tmp0; 1103 sum4 -= *v4-- * tmp0; 1104 } 1105 1106 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1107 sum2 -= *v2-- * tmp0; 1108 sum3 -= *v3-- * tmp0; 1109 sum4 -= *v4-- * tmp0; 1110 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1111 sum3 -= *v3-- * tmp0; 1112 sum4 -= *v4-- * tmp0; 1113 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1114 sum4 -= *v4-- * tmp0; 1115 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1116 break; 1117 case 5 : 1118 sum1 = tmp[row]; 1119 sum2 = tmp[row -1]; 1120 sum3 = tmp[row -2]; 1121 sum4 = tmp[row -3]; 1122 sum5 = tmp[row -4]; 1123 v2 = aa + ai[row]-1; 1124 v3 = aa + ai[row -1]-1; 1125 v4 = aa + ai[row -2]-1; 1126 v5 = aa + ai[row -3]-1; 1127 for (j=nz ; j>1; j-=2){ 1128 vi -= 2; 1129 i0 = vi[2]; 1130 i1 = vi[1]; 1131 tmp0 = tmps[i0]; 1132 tmp1 = tmps[i1]; 1133 v1 -= 2; 1134 v2 -= 2; 1135 v3 -= 2; 1136 v4 -= 2; 1137 v5 -= 2; 1138 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1139 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1140 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1141 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1142 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1143 } 1144 if (j==1){ 1145 tmp0 = tmps[*vi--]; 1146 sum1 -= *v1-- * tmp0; 1147 sum2 -= *v2-- * tmp0; 1148 sum3 -= *v3-- * tmp0; 1149 sum4 -= *v4-- * tmp0; 1150 sum5 -= *v5-- * tmp0; 1151 } 1152 1153 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1154 sum2 -= *v2-- * tmp0; 1155 sum3 -= *v3-- * tmp0; 1156 sum4 -= *v4-- * tmp0; 1157 sum5 -= *v5-- * tmp0; 1158 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1159 sum3 -= *v3-- * tmp0; 1160 sum4 -= *v4-- * tmp0; 1161 sum5 -= *v5-- * tmp0; 1162 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1163 sum4 -= *v4-- * tmp0; 1164 sum5 -= *v5-- * tmp0; 1165 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1166 sum5 -= *v5-- * tmp0; 1167 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1168 break; 1169 default: 1170 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1171 } 1172 } 1173 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1174 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1175 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1176 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1177 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1178 PetscFunctionReturn(0); 1179 } 1180 1181 #undef __FUNCT__ 1182 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode" 1183 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info) 1184 { 1185 Mat C=B; 1186 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ *)C->data; 1187 IS isrow = b->row,isicol = b->icol; 1188 PetscErrorCode ierr; 1189 const PetscInt *r,*ic,*ics; 1190 const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag; 1191 PetscInt i,j,k,nz,nzL,row,*pj; 1192 const PetscInt *ajtmp,*bjtmp; 1193 MatScalar *pc,*pc1,*pc2,*pc3,mul1,mul2,mul3,*pv,*rtmp1,*rtmp2,*rtmp3; 1194 const MatScalar *aa=a->a,*v,*v1,*v2,*v3; 1195 FactorShiftCtx sctx; 1196 PetscInt *ddiag; 1197 PetscReal rs; 1198 MatScalar d; 1199 PetscInt inod,nodesz,node_max,*ns,col; 1200 PetscInt *tmp_vec1,*tmp_vec2,*nsmap; 1201 1202 PetscFunctionBegin; 1203 /* MatPivotSetUp(): initialize shift context sctx */ 1204 ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr); 1205 1206 if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1207 ddiag = a->diag; 1208 sctx.shift_top = info->zeropivot; 1209 for (i=0; i<n; i++) { 1210 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1211 d = (aa)[ddiag[i]]; 1212 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1213 v = aa+ai[i]; 1214 nz = ai[i+1] - ai[i]; 1215 for (j=0; j<nz; j++) 1216 rs += PetscAbsScalar(v[j]); 1217 if (rs>sctx.shift_top) sctx.shift_top = rs; 1218 } 1219 sctx.shift_top *= 1.1; 1220 sctx.nshift_max = 5; 1221 sctx.shift_lo = 0.; 1222 sctx.shift_hi = 1.; 1223 } 1224 1225 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1226 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1227 1228 ierr = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr); 1229 ierr = PetscMemzero(rtmp1,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1230 rtmp2 = rtmp1 + n; 1231 rtmp3 = rtmp2 + n; 1232 ics = ic; 1233 1234 node_max = a->inode.node_count; 1235 ns = a->inode.size; 1236 if (!ns){ 1237 SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information"); 1238 } 1239 1240 /* If max inode size > 3, split it into two inodes.*/ 1241 /* also map the inode sizes according to the ordering */ 1242 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1243 for (i=0,j=0; i<node_max; ++i,++j){ 1244 if (ns[i]>3) { 1245 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1246 ++j; 1247 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1248 } else { 1249 tmp_vec1[j] = ns[i]; 1250 } 1251 } 1252 /* Use the correct node_max */ 1253 node_max = j; 1254 1255 /* Now reorder the inode info based on mat re-ordering info */ 1256 /* First create a row -> inode_size_array_index map */ 1257 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1258 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1259 for (i=0,row=0; i<node_max; i++) { 1260 nodesz = tmp_vec1[i]; 1261 for (j=0; j<nodesz; j++,row++) { 1262 nsmap[row] = i; 1263 } 1264 } 1265 /* Using nsmap, create a reordered ns structure */ 1266 for (i=0,j=0; i< node_max; i++) { 1267 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1268 tmp_vec2[i] = nodesz; 1269 j += nodesz; 1270 } 1271 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1272 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1273 1274 /* Now use the correct ns */ 1275 ns = tmp_vec2; 1276 1277 do { 1278 sctx.useshift = PETSC_FALSE; 1279 /* Now loop over each block-row, and do the factorization */ 1280 for (inod=0,i=0; inod<node_max; inod++){ /* i: row index; inod: inode index */ 1281 nodesz = ns[inod]; 1282 1283 switch (nodesz){ 1284 case 1: 1285 /*----------*/ 1286 /* zero rtmp1 */ 1287 /* L part */ 1288 nz = bi[i+1] - bi[i]; 1289 bjtmp = bj + bi[i]; 1290 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1291 1292 /* U part */ 1293 nz = bdiag[i]-bdiag[i+1]; 1294 bjtmp = bj + bdiag[i+1]+1; 1295 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1296 1297 /* load in initial (unfactored row) */ 1298 nz = ai[r[i]+1] - ai[r[i]]; 1299 ajtmp = aj + ai[r[i]]; 1300 v = aa + ai[r[i]]; 1301 for (j=0; j<nz; j++) { 1302 rtmp1[ics[ajtmp[j]]] = v[j]; 1303 } 1304 /* ZeropivotApply() */ 1305 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1306 1307 /* elimination */ 1308 bjtmp = bj + bi[i]; 1309 row = *bjtmp++; 1310 nzL = bi[i+1] - bi[i]; 1311 for(k=0; k < nzL;k++) { 1312 pc = rtmp1 + row; 1313 if (*pc != 0.0) { 1314 pv = b->a + bdiag[row]; 1315 mul1 = *pc * (*pv); 1316 *pc = mul1; 1317 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1318 pv = b->a + bdiag[row+1]+1; 1319 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1320 for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1321 ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr); 1322 } 1323 row = *bjtmp++; 1324 } 1325 1326 /* finished row so stick it into b->a */ 1327 rs = 0.0; 1328 /* L part */ 1329 pv = b->a + bi[i] ; 1330 pj = b->j + bi[i] ; 1331 nz = bi[i+1] - bi[i]; 1332 for (j=0; j<nz; j++) { 1333 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1334 } 1335 1336 /* U part */ 1337 pv = b->a + bdiag[i+1]+1; 1338 pj = b->j + bdiag[i+1]+1; 1339 nz = bdiag[i] - bdiag[i+1]-1; 1340 for (j=0; j<nz; j++) { 1341 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1342 } 1343 1344 /* Check zero pivot */ 1345 sctx.rs = rs; 1346 sctx.pv = rtmp1[i]; 1347 ierr = MatPivotCheck(info,sctx,i);CHKERRQ(ierr); 1348 1349 /* Mark diagonal and invert diagonal for simplier triangular solves */ 1350 pv = b->a + bdiag[i]; 1351 *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1352 break; 1353 1354 case 2: 1355 /*----------*/ 1356 /* zero rtmp1 and rtmp2 */ 1357 /* L part */ 1358 nz = bi[i+1] - bi[i]; 1359 bjtmp = bj + bi[i]; 1360 for (j=0; j<nz; j++) { 1361 col = bjtmp[j]; 1362 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1363 } 1364 1365 /* U part */ 1366 nz = bdiag[i]-bdiag[i+1]; 1367 bjtmp = bj + bdiag[i+1]+1; 1368 for (j=0; j<nz; j++) { 1369 col = bjtmp[j]; 1370 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1371 } 1372 1373 /* load in initial (unfactored row) */ 1374 nz = ai[r[i]+1] - ai[r[i]]; 1375 ajtmp = aj + ai[r[i]]; 1376 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; 1377 for (j=0; j<nz; j++) { 1378 col = ics[ajtmp[j]]; 1379 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; 1380 } 1381 /* ZeropivotApply(): shift the diagonal of the matrix */ 1382 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; 1383 1384 /* elimination */ 1385 bjtmp = bj + bi[i]; 1386 row = *bjtmp++; /* pivot row */ 1387 nzL = bi[i+1] - bi[i]; 1388 for(k=0; k < nzL;k++) { 1389 pc1 = rtmp1 + row; 1390 pc2 = rtmp2 + row; 1391 if (*pc1 != 0.0 || *pc2 != 0.0) { 1392 pv = b->a + bdiag[row]; 1393 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); 1394 *pc1 = mul1; *pc2 = mul2; 1395 1396 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1397 pv = b->a + bdiag[row+1]+1; 1398 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1399 for (j=0; j<nz; j++){ 1400 col = pj[j]; 1401 rtmp1[col] -= mul1 * pv[j]; 1402 rtmp2[col] -= mul2 * pv[j]; 1403 } 1404 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1405 } 1406 row = *bjtmp++; 1407 } 1408 1409 /* finished row i; check zero pivot, then stick row i into b->a */ 1410 rs = 0.0; 1411 /* L part */ 1412 k = 0; 1413 pc1 = b->a + bi[i]; 1414 pj = b->j + bi[i] ; 1415 nz = bi[i+1] - bi[i]; 1416 for (j=0; j<nz; j++) { 1417 col = pj[j]; 1418 if (col < i){pc1[k] = rtmp1[col]; k++; rs += PetscAbsScalar(pc1[k]);} 1419 } 1420 /* U part */ 1421 pc1 = b->a + bdiag[i+1]+1; 1422 pj = b->j + bdiag[i+1]+1; 1423 nz = bdiag[i] - bdiag[i+1]; /* include diagonal */ 1424 k = 0; 1425 for (j=0; j<nz; j++) { 1426 col = pj[j]; 1427 if (col > i) {pc1[k] = rtmp1[col]; k++; rs += PetscAbsScalar(pc1[k]);} 1428 } 1429 1430 sctx.rs = rs; 1431 sctx.pv = rtmp1[i]; 1432 ierr = MatPivotCheck(info,sctx,i);CHKERRQ(ierr); 1433 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1434 *pc1 = 1.0/sctx.pv; 1435 1436 /* Now take care of diagonal 2x2 block. */ 1437 pc2 = rtmp2 + i; 1438 if (*pc2 != 0.0){ 1439 mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */ 1440 *pc2 = mul1; /* insert L entry */ 1441 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1442 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1443 for (j=0; j<nz; j++) { 1444 col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col]; 1445 } 1446 ierr = PetscLogFlops(2*nz);CHKERRQ(ierr); 1447 } 1448 1449 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1450 rs = 0.0; 1451 /* L part */ 1452 pc2 = b->a + bi[i+1]; 1453 pj = b->j + bi[i+1] ; 1454 nz = bi[i+2] - bi[i+1]; 1455 k = 0; 1456 for (j=0; j<nz; j++) { 1457 col = pj[j]; 1458 if (col < i+1){pc2[k] = rtmp2[col]; k++; rs += PetscAbsScalar(pc2[k]);} 1459 } 1460 /* U part */ 1461 pc2 = b->a + bdiag[i+2]+1; 1462 pj = b->j + bdiag[i+1]+1; 1463 nz = bdiag[i] - bdiag[i+1]; /* inlcude diagonal */ 1464 k =0; 1465 for (j=0; j<nz; j++) { 1466 col = pj[j]; 1467 if (col > i+1){pc2[k] = rtmp2[col]; k++; rs += PetscAbsScalar(pc2[k]);} 1468 } 1469 1470 sctx.rs = rs; 1471 sctx.pv = rtmp2[i+1]; 1472 ierr = MatPivotCheck(info,sctx,i+1);CHKERRQ(ierr); 1473 pc2 = b->a + bdiag[i+1]; 1474 *pc2 = 1.0/sctx.pv; 1475 break; 1476 1477 case 3: 1478 /*----------*/ 1479 /* zero rtmp */ 1480 /* L part */ 1481 nz = bi[i+1] - bi[i]; 1482 bjtmp = bj + bi[i]; 1483 for (j=0; j<nz; j++) { 1484 col = bjtmp[j]; 1485 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1486 } 1487 1488 /* U part */ 1489 nz = bdiag[i]-bdiag[i+1]; 1490 bjtmp = bj + bdiag[i+1]+1; 1491 for (j=0; j<nz; j++) { 1492 col = bjtmp[j]; 1493 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1494 } 1495 1496 /* load in initial (unfactored row) */ 1497 nz = ai[r[i]+1] - ai[r[i]]; 1498 ajtmp = aj + ai[r[i]]; 1499 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; 1500 for (j=0; j<nz; j++) { 1501 col = ics[ajtmp[j]]; 1502 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; 1503 } 1504 /* ZeropivotApply(): shift the diagonal of the matrix */ 1505 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; 1506 1507 /* elimination */ 1508 bjtmp = bj + bi[i]; 1509 row = *bjtmp++; /* pivot row */ 1510 nzL = bi[i+1] - bi[i]; 1511 for(k=0; k < nzL;k++) { 1512 pc1 = rtmp1 + row; 1513 pc2 = rtmp2 + row; 1514 pc3 = rtmp3 + row; 1515 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1516 pv = b->a + bdiag[row]; 1517 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); 1518 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; 1519 1520 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1521 pv = b->a + bdiag[row+1]+1; 1522 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1523 for (j=0; j<nz; j++){ 1524 col = pj[j]; 1525 rtmp1[col] -= mul1 * pv[j]; 1526 rtmp2[col] -= mul2 * pv[j]; 1527 rtmp3[col] -= mul3 * pv[j]; 1528 } 1529 ierr = PetscLogFlops(6*nz);CHKERRQ(ierr); 1530 } 1531 row = *bjtmp++; 1532 } 1533 1534 /* finished row i; check zero pivot, then stick row i into b->a */ 1535 rs = 0.0; 1536 /* L part */ 1537 pc1 = b->a + bi[i]; 1538 pj = b->j + bi[i] ; 1539 nz = bi[i+1] - bi[i]; 1540 k = 0; 1541 for (j=0; j<nz; j++) { 1542 col = pj[j]; 1543 if (col < i){pc1[k] = rtmp1[col]; k++; rs += PetscAbsScalar(pc1[k]);} 1544 } 1545 /* U part */ 1546 pc1 = b->a + bdiag[i+1]+1; 1547 pj = b->j + bdiag[i+1]+1; 1548 nz = bdiag[i] - bdiag[i+1]; /* include diagonal */ 1549 k = 0; 1550 for (j=0; j<nz; j++) { 1551 col = pj[j]; 1552 if (col > i){pc1[k] = rtmp1[col]; k++; rs += PetscAbsScalar(pc1[k]);} 1553 } 1554 1555 sctx.rs = rs; 1556 sctx.pv = rtmp1[i]; 1557 ierr = MatPivotCheck(info,sctx,i);CHKERRQ(ierr); 1558 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1559 *pc1 = 1.0/sctx.pv; 1560 1561 /* Now take care of 1st column of diagonal 3x3 block. */ 1562 pc2 = rtmp2 + i; 1563 pc3 = rtmp3 + i; 1564 if (*pc2 != 0.0 || *pc3 != 0.0){ 1565 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1566 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1567 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1568 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1569 for (j=0; j<nz; j++) { 1570 col = pj[j]; 1571 rtmp2[col] -= mul2 * rtmp1[col]; 1572 rtmp3[col] -= mul3 * rtmp1[col]; 1573 } 1574 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1575 } 1576 1577 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1578 rs = 0.0; 1579 /* L part */ 1580 pc2 = b->a + bi[i+1]; 1581 pj = b->j + bi[i+1] ; 1582 nz = bi[i+2] - bi[i+1]; 1583 k = 0; 1584 for (j=0; j<nz; j++) { 1585 col = pj[j]; 1586 if (col < i+1){pc2[k] = rtmp2[col]; k++; rs += PetscAbsScalar(pc2[k]);} 1587 } 1588 /* U part */ 1589 pc2 = b->a + bdiag[i+2]+1; 1590 pj = b->j + bdiag[i+1]+1; 1591 nz = bdiag[i] - bdiag[i+1]; /* inlcude diagonal */ 1592 k = 0; 1593 for (j=0; j<nz; j++) { 1594 col = pj[j]; 1595 if (col > i+1){pc2[k] = rtmp2[col]; k++; rs += PetscAbsScalar(pc2[k]);} 1596 } 1597 1598 sctx.rs = rs; 1599 sctx.pv = rtmp2[i+1]; 1600 ierr = MatPivotCheck(info,sctx,i+1);CHKERRQ(ierr); 1601 pc2 = b->a + bdiag[i+1]; 1602 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1603 1604 /* Now take care of 2nd column of diagonal 3x3 block. */ 1605 pc3 = rtmp3 + i+1; 1606 if (*pc3 != 0.0){ 1607 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1608 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1609 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1610 for (j=0; j<nz; j++) { 1611 col = pj[j]; 1612 rtmp3[col] -= mul3 * rtmp2[col]; 1613 } 1614 ierr = PetscLogFlops(2*nz);CHKERRQ(ierr); 1615 } 1616 1617 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1618 rs = 0.0; 1619 /* L part */ 1620 pc3 = b->a + bi[i+2]; 1621 pj = b->j + bi[i+2] ; 1622 nz = bi[i+3] - bi[i+2]; 1623 k = 0; 1624 for (j=0; j<nz; j++) { 1625 col = pj[j]; 1626 if (col < i+2){pc3[k] = rtmp3[col]; k++; rs += PetscAbsScalar(pc3[k]);} 1627 } 1628 /* U part */ 1629 pc3 = b->a + bdiag[i+3]+1; 1630 pj = b->j + bdiag[i+1]+1; 1631 nz = bdiag[i] - bdiag[i+1]; /* inlcude diagonal */ 1632 k = 0; 1633 for (j=0; j<nz; j++) { 1634 col = pj[j]; 1635 if (col > i+2){pc3[k] = rtmp3[col]; k++; rs += PetscAbsScalar(pc3[k]);} 1636 } 1637 1638 sctx.rs = rs; 1639 sctx.pv = rtmp3[i+2]; 1640 ierr = MatPivotCheck(info,sctx,i+2);CHKERRQ(ierr); 1641 pc3 = b->a + bdiag[i+2]; 1642 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1643 break; 1644 1645 default: 1646 SETERRQ(PETSC_ERR_SUP,"Node size not yet supported \n"); 1647 } 1648 i += nodesz; /* Update the row */ 1649 } 1650 1651 /* MatPivotRefine() */ 1652 if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE && !sctx.useshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max){ 1653 /* 1654 * if no shift in this attempt & shifting & started shifting & can refine, 1655 * then try lower shift 1656 */ 1657 sctx.shift_hi = sctx.shift_fraction; 1658 sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.; 1659 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1660 sctx.useshift = PETSC_TRUE; 1661 sctx.nshift++; 1662 } 1663 } while (sctx.useshift); 1664 1665 ierr = PetscFree(rtmp1);CHKERRQ(ierr); 1666 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1667 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1668 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1669 1670 C->ops->solve = MatSolve_SeqAIJ_Inode; 1671 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1672 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1673 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1674 C->ops->matsolve = MatMatSolve_SeqAIJ; 1675 C->assembled = PETSC_TRUE; 1676 C->preallocated = PETSC_TRUE; 1677 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 1678 1679 /* MatShiftView(A,info,&sctx) */ 1680 if (sctx.nshift){ 1681 if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) { 1682 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 1683 } else if (info->shifttype == MAT_SHIFT_NONZERO) { 1684 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 1685 } else if (info->shifttype == MAT_SHIFT_INBLOCKS){ 1686 ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr); 1687 } 1688 } 1689 PetscFunctionReturn(0); 1690 } 1691 1692 #undef __FUNCT__ 1693 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace" 1694 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info) 1695 { 1696 Mat C = B; 1697 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1698 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1699 PetscErrorCode ierr; 1700 const PetscInt *r,*ic,*c,*ics; 1701 PetscInt n = A->rmap->n,*bi = b->i; 1702 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1703 PetscInt i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz; 1704 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1705 PetscScalar mul1,mul2,mul3,tmp; 1706 MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; 1707 const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; 1708 PetscReal rs=0.0; 1709 FactorShiftCtx sctx; 1710 PetscInt newshift; 1711 1712 PetscFunctionBegin; 1713 sctx.shift_top = 0; 1714 sctx.nshift_max = 0; 1715 sctx.shift_lo = 0; 1716 sctx.shift_hi = 0; 1717 sctx.shift_fraction = 0; 1718 1719 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1720 if (info->shifttype==MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1721 sctx.shift_top = 0; 1722 for (i=0; i<n; i++) { 1723 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1724 rs = 0.0; 1725 ajtmp = aj + ai[i]; 1726 rtmp1 = aa + ai[i]; 1727 nz = ai[i+1] - ai[i]; 1728 for (j=0; j<nz; j++){ 1729 if (*ajtmp != i){ 1730 rs += PetscAbsScalar(*rtmp1++); 1731 } else { 1732 rs -= PetscRealPart(*rtmp1++); 1733 } 1734 ajtmp++; 1735 } 1736 if (rs>sctx.shift_top) sctx.shift_top = rs; 1737 } 1738 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1739 sctx.shift_top *= 1.1; 1740 sctx.nshift_max = 5; 1741 sctx.shift_lo = 0.; 1742 sctx.shift_hi = 1.; 1743 } 1744 sctx.shift_amount = 0; 1745 sctx.nshift = 0; 1746 1747 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1748 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1749 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1750 ierr = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr); 1751 ierr = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1752 ics = ic ; 1753 rtmp22 = rtmp11 + n; 1754 rtmp33 = rtmp22 + n; 1755 1756 node_max = a->inode.node_count; 1757 ns = a->inode.size; 1758 if (!ns){ 1759 SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information"); 1760 } 1761 1762 /* If max inode size > 3, split it into two inodes.*/ 1763 /* also map the inode sizes according to the ordering */ 1764 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1765 for (i=0,j=0; i<node_max; ++i,++j){ 1766 if (ns[i]>3) { 1767 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1768 ++j; 1769 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1770 } else { 1771 tmp_vec1[j] = ns[i]; 1772 } 1773 } 1774 /* Use the correct node_max */ 1775 node_max = j; 1776 1777 /* Now reorder the inode info based on mat re-ordering info */ 1778 /* First create a row -> inode_size_array_index map */ 1779 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1780 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1781 for (i=0,row=0; i<node_max; i++) { 1782 nodesz = tmp_vec1[i]; 1783 for (j=0; j<nodesz; j++,row++) { 1784 nsmap[row] = i; 1785 } 1786 } 1787 /* Using nsmap, create a reordered ns structure */ 1788 for (i=0,j=0; i< node_max; i++) { 1789 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1790 tmp_vec2[i] = nodesz; 1791 j += nodesz; 1792 } 1793 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1794 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1795 /* Now use the correct ns */ 1796 ns = tmp_vec2; 1797 1798 do { 1799 sctx.useshift = PETSC_FALSE; 1800 /* Now loop over each block-row, and do the factorization */ 1801 for (i=0,row=0; i<node_max; i++) { 1802 nodesz = ns[i]; 1803 nz = bi[row+1] - bi[row]; 1804 bjtmp = bj + bi[row]; 1805 1806 switch (nodesz){ 1807 case 1: 1808 for (j=0; j<nz; j++){ 1809 idx = bjtmp[j]; 1810 rtmp11[idx] = 0.0; 1811 } 1812 1813 /* load in initial (unfactored row) */ 1814 idx = r[row]; 1815 nz_tmp = ai[idx+1] - ai[idx]; 1816 ajtmp = aj + ai[idx]; 1817 v1 = aa + ai[idx]; 1818 1819 for (j=0; j<nz_tmp; j++) { 1820 idx = ics[ajtmp[j]]; 1821 rtmp11[idx] = v1[j]; 1822 } 1823 rtmp11[ics[r[row]]] += sctx.shift_amount; 1824 1825 prow = *bjtmp++ ; 1826 while (prow < row) { 1827 pc1 = rtmp11 + prow; 1828 if (*pc1 != 0.0){ 1829 pv = ba + bd[prow]; 1830 pj = nbj + bd[prow]; 1831 mul1 = *pc1 * *pv++; 1832 *pc1 = mul1; 1833 nz_tmp = bi[prow+1] - bd[prow] - 1; 1834 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 1835 for (j=0; j<nz_tmp; j++) { 1836 tmp = pv[j]; 1837 idx = pj[j]; 1838 rtmp11[idx] -= mul1 * tmp; 1839 } 1840 } 1841 prow = *bjtmp++ ; 1842 } 1843 pj = bj + bi[row]; 1844 pc1 = ba + bi[row]; 1845 1846 sctx.pv = rtmp11[row]; 1847 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 1848 rs = 0.0; 1849 for (j=0; j<nz; j++) { 1850 idx = pj[j]; 1851 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 1852 if (idx != row) rs += PetscAbsScalar(pc1[j]); 1853 } 1854 sctx.rs = rs; 1855 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1856 if (newshift == 1) goto endofwhile; 1857 break; 1858 1859 case 2: 1860 for (j=0; j<nz; j++) { 1861 idx = bjtmp[j]; 1862 rtmp11[idx] = 0.0; 1863 rtmp22[idx] = 0.0; 1864 } 1865 1866 /* load in initial (unfactored row) */ 1867 idx = r[row]; 1868 nz_tmp = ai[idx+1] - ai[idx]; 1869 ajtmp = aj + ai[idx]; 1870 v1 = aa + ai[idx]; 1871 v2 = aa + ai[idx+1]; 1872 for (j=0; j<nz_tmp; j++) { 1873 idx = ics[ajtmp[j]]; 1874 rtmp11[idx] = v1[j]; 1875 rtmp22[idx] = v2[j]; 1876 } 1877 rtmp11[ics[r[row]]] += sctx.shift_amount; 1878 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 1879 1880 prow = *bjtmp++ ; 1881 while (prow < row) { 1882 pc1 = rtmp11 + prow; 1883 pc2 = rtmp22 + prow; 1884 if (*pc1 != 0.0 || *pc2 != 0.0){ 1885 pv = ba + bd[prow]; 1886 pj = nbj + bd[prow]; 1887 mul1 = *pc1 * *pv; 1888 mul2 = *pc2 * *pv; 1889 ++pv; 1890 *pc1 = mul1; 1891 *pc2 = mul2; 1892 1893 nz_tmp = bi[prow+1] - bd[prow] - 1; 1894 for (j=0; j<nz_tmp; j++) { 1895 tmp = pv[j]; 1896 idx = pj[j]; 1897 rtmp11[idx] -= mul1 * tmp; 1898 rtmp22[idx] -= mul2 * tmp; 1899 } 1900 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 1901 } 1902 prow = *bjtmp++ ; 1903 } 1904 1905 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 1906 pc1 = rtmp11 + prow; 1907 pc2 = rtmp22 + prow; 1908 1909 sctx.pv = *pc1; 1910 pj = bj + bi[prow]; 1911 rs = 0.0; 1912 for (j=0; j<nz; j++){ 1913 idx = pj[j]; 1914 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 1915 } 1916 sctx.rs = rs; 1917 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1918 if (newshift == 1) goto endofwhile; 1919 1920 if (*pc2 != 0.0){ 1921 pj = nbj + bd[prow]; 1922 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 1923 *pc2 = mul2; 1924 nz_tmp = bi[prow+1] - bd[prow] - 1; 1925 for (j=0; j<nz_tmp; j++) { 1926 idx = pj[j] ; 1927 tmp = rtmp11[idx]; 1928 rtmp22[idx] -= mul2 * tmp; 1929 } 1930 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 1931 } 1932 1933 pj = bj + bi[row]; 1934 pc1 = ba + bi[row]; 1935 pc2 = ba + bi[row+1]; 1936 1937 sctx.pv = rtmp22[row+1]; 1938 rs = 0.0; 1939 rtmp11[row] = 1.0/rtmp11[row]; 1940 rtmp22[row+1] = 1.0/rtmp22[row+1]; 1941 /* copy row entries from dense representation to sparse */ 1942 for (j=0; j<nz; j++) { 1943 idx = pj[j]; 1944 pc1[j] = rtmp11[idx]; 1945 pc2[j] = rtmp22[idx]; 1946 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 1947 } 1948 sctx.rs = rs; 1949 ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr); 1950 if (newshift == 1) goto endofwhile; 1951 break; 1952 1953 case 3: 1954 for (j=0; j<nz; j++) { 1955 idx = bjtmp[j]; 1956 rtmp11[idx] = 0.0; 1957 rtmp22[idx] = 0.0; 1958 rtmp33[idx] = 0.0; 1959 } 1960 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 1961 idx = r[row]; 1962 nz_tmp = ai[idx+1] - ai[idx]; 1963 ajtmp = aj + ai[idx]; 1964 v1 = aa + ai[idx]; 1965 v2 = aa + ai[idx+1]; 1966 v3 = aa + ai[idx+2]; 1967 for (j=0; j<nz_tmp; j++) { 1968 idx = ics[ajtmp[j]]; 1969 rtmp11[idx] = v1[j]; 1970 rtmp22[idx] = v2[j]; 1971 rtmp33[idx] = v3[j]; 1972 } 1973 rtmp11[ics[r[row]]] += sctx.shift_amount; 1974 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 1975 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 1976 1977 /* loop over all pivot row blocks above this row block */ 1978 prow = *bjtmp++ ; 1979 while (prow < row) { 1980 pc1 = rtmp11 + prow; 1981 pc2 = rtmp22 + prow; 1982 pc3 = rtmp33 + prow; 1983 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){ 1984 pv = ba + bd[prow]; 1985 pj = nbj + bd[prow]; 1986 mul1 = *pc1 * *pv; 1987 mul2 = *pc2 * *pv; 1988 mul3 = *pc3 * *pv; 1989 ++pv; 1990 *pc1 = mul1; 1991 *pc2 = mul2; 1992 *pc3 = mul3; 1993 1994 nz_tmp = bi[prow+1] - bd[prow] - 1; 1995 /* update this row based on pivot row */ 1996 for (j=0; j<nz_tmp; j++) { 1997 tmp = pv[j]; 1998 idx = pj[j]; 1999 rtmp11[idx] -= mul1 * tmp; 2000 rtmp22[idx] -= mul2 * tmp; 2001 rtmp33[idx] -= mul3 * tmp; 2002 } 2003 ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr); 2004 } 2005 prow = *bjtmp++ ; 2006 } 2007 2008 /* Now take care of diagonal 3x3 block in this set of rows */ 2009 /* note: prow = row here */ 2010 pc1 = rtmp11 + prow; 2011 pc2 = rtmp22 + prow; 2012 pc3 = rtmp33 + prow; 2013 2014 sctx.pv = *pc1; 2015 pj = bj + bi[prow]; 2016 rs = 0.0; 2017 for (j=0; j<nz; j++){ 2018 idx = pj[j]; 2019 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2020 } 2021 sctx.rs = rs; 2022 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 2023 if (newshift == 1) goto endofwhile; 2024 2025 if (*pc2 != 0.0 || *pc3 != 0.0){ 2026 mul2 = (*pc2)/(*pc1); 2027 mul3 = (*pc3)/(*pc1); 2028 *pc2 = mul2; 2029 *pc3 = mul3; 2030 nz_tmp = bi[prow+1] - bd[prow] - 1; 2031 pj = nbj + bd[prow]; 2032 for (j=0; j<nz_tmp; j++) { 2033 idx = pj[j] ; 2034 tmp = rtmp11[idx]; 2035 rtmp22[idx] -= mul2 * tmp; 2036 rtmp33[idx] -= mul3 * tmp; 2037 } 2038 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 2039 } 2040 ++prow; 2041 2042 pc2 = rtmp22 + prow; 2043 pc3 = rtmp33 + prow; 2044 sctx.pv = *pc2; 2045 pj = bj + bi[prow]; 2046 rs = 0.0; 2047 for (j=0; j<nz; j++){ 2048 idx = pj[j]; 2049 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2050 } 2051 sctx.rs = rs; 2052 ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr); 2053 if (newshift == 1) goto endofwhile; 2054 2055 if (*pc3 != 0.0){ 2056 mul3 = (*pc3)/(*pc2); 2057 *pc3 = mul3; 2058 pj = nbj + bd[prow]; 2059 nz_tmp = bi[prow+1] - bd[prow] - 1; 2060 for (j=0; j<nz_tmp; j++) { 2061 idx = pj[j] ; 2062 tmp = rtmp22[idx]; 2063 rtmp33[idx] -= mul3 * tmp; 2064 } 2065 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 2066 } 2067 2068 pj = bj + bi[row]; 2069 pc1 = ba + bi[row]; 2070 pc2 = ba + bi[row+1]; 2071 pc3 = ba + bi[row+2]; 2072 2073 sctx.pv = rtmp33[row+2]; 2074 rs = 0.0; 2075 rtmp11[row] = 1.0/rtmp11[row]; 2076 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2077 rtmp33[row+2] = 1.0/rtmp33[row+2]; 2078 /* copy row entries from dense representation to sparse */ 2079 for (j=0; j<nz; j++) { 2080 idx = pj[j]; 2081 pc1[j] = rtmp11[idx]; 2082 pc2[j] = rtmp22[idx]; 2083 pc3[j] = rtmp33[idx]; 2084 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 2085 } 2086 2087 sctx.rs = rs; 2088 ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr); 2089 if (newshift == 1) goto endofwhile; 2090 break; 2091 2092 default: 2093 SETERRQ(PETSC_ERR_SUP,"Node size not yet supported \n"); 2094 } 2095 row += nodesz; /* Update the row */ 2096 } 2097 endofwhile:; 2098 } while (sctx.useshift); 2099 ierr = PetscFree(rtmp11);CHKERRQ(ierr); 2100 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 2101 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2102 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2103 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 2104 (B)->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 2105 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2106 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2107 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2108 C->assembled = PETSC_TRUE; 2109 C->preallocated = PETSC_TRUE; 2110 if (sctx.nshift) { 2111 if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) { 2112 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 2113 } else if (info->shifttype == MAT_SHIFT_NONZERO) { 2114 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 2115 } 2116 } 2117 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 2118 PetscFunctionReturn(0); 2119 } 2120 2121 2122 /* ----------------------------------------------------------- */ 2123 #undef __FUNCT__ 2124 #define __FUNCT__ "MatSolve_SeqAIJ_Inode" 2125 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 2126 { 2127 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2128 IS iscol = a->col,isrow = a->row; 2129 PetscErrorCode ierr; 2130 const PetscInt *r,*c,*rout,*cout; 2131 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 2132 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 2133 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 2134 PetscScalar sum1,sum2,sum3,sum4,sum5; 2135 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 2136 const PetscScalar *b; 2137 2138 PetscFunctionBegin; 2139 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 2140 node_max = a->inode.node_count; 2141 ns = a->inode.size; /* Node Size array */ 2142 2143 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2144 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2145 tmp = a->solve_work; 2146 2147 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2148 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2149 2150 /* forward solve the lower triangular */ 2151 tmps = tmp ; 2152 aa = a_a ; 2153 aj = a_j ; 2154 ad = a->diag; 2155 2156 for (i = 0,row = 0; i< node_max; ++i){ 2157 nsz = ns[i]; 2158 aii = ai[row]; 2159 v1 = aa + aii; 2160 vi = aj + aii; 2161 nz = ai[row+1]- ai[row]; 2162 2163 if (i < node_max-1) { 2164 /* Prefetch the indices for the next block */ 2165 PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,0); /* indices */ 2166 /* Prefetch the data for the next block */ 2167 PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,0); 2168 } 2169 2170 switch (nsz){ /* Each loop in 'case' is unrolled */ 2171 case 1 : 2172 sum1 = b[r[row]]; 2173 for(j=0; j<nz-1; j+=2){ 2174 i0 = vi[j]; 2175 i1 = vi[j+1]; 2176 tmp0 = tmps[i0]; 2177 tmp1 = tmps[i1]; 2178 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1; 2179 } 2180 if(j == nz-1){ 2181 tmp0 = tmps[vi[j]]; 2182 sum1 -= v1[j]*tmp0; 2183 } 2184 tmp[row++]=sum1; 2185 break; 2186 case 2: 2187 sum1 = b[r[row]]; 2188 sum2 = b[r[row+1]]; 2189 v2 = aa + ai[row+1]; 2190 2191 for(j=0; j<nz-1; j+=2){ 2192 i0 = vi[j]; 2193 i1 = vi[j+1]; 2194 tmp0 = tmps[i0]; 2195 tmp1 = tmps[i1]; 2196 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2197 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2198 } 2199 if(j == nz-1){ 2200 tmp0 = tmps[vi[j]]; 2201 sum1 -= v1[j] *tmp0; 2202 sum2 -= v2[j] *tmp0; 2203 } 2204 sum2 -= v2[nz] * sum1; 2205 tmp[row ++]=sum1; 2206 tmp[row ++]=sum2; 2207 break; 2208 case 3: 2209 sum1 = b[r[row]]; 2210 sum2 = b[r[row+1]]; 2211 sum3 = b[r[row+2]]; 2212 v2 = aa + ai[row+1]; 2213 v3 = aa + ai[row+2]; 2214 2215 for (j=0; j<nz-1; j+=2){ 2216 i0 = vi[j]; 2217 i1 = vi[j+1]; 2218 tmp0 = tmps[i0]; 2219 tmp1 = tmps[i1]; 2220 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2221 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2222 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2223 } 2224 if (j == nz-1){ 2225 tmp0 = tmps[vi[j]]; 2226 sum1 -= v1[j] *tmp0; 2227 sum2 -= v2[j] *tmp0; 2228 sum3 -= v3[j] *tmp0; 2229 } 2230 sum2 -= v2[nz] * sum1; 2231 sum3 -= v3[nz] * sum1; 2232 sum3 -= v3[nz+1] * sum2; 2233 tmp[row ++]=sum1; 2234 tmp[row ++]=sum2; 2235 tmp[row ++]=sum3; 2236 break; 2237 2238 case 4: 2239 sum1 = b[r[row]]; 2240 sum2 = b[r[row+1]]; 2241 sum3 = b[r[row+2]]; 2242 sum4 = b[r[row+3]]; 2243 v2 = aa + ai[row+1]; 2244 v3 = aa + ai[row+2]; 2245 v4 = aa + ai[row+3]; 2246 2247 for (j=0; j<nz-1; j+=2){ 2248 i0 = vi[j]; 2249 i1 = vi[j+1]; 2250 tmp0 = tmps[i0]; 2251 tmp1 = tmps[i1]; 2252 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2253 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2254 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2255 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2256 } 2257 if (j == nz-1){ 2258 tmp0 = tmps[vi[j]]; 2259 sum1 -= v1[j] *tmp0; 2260 sum2 -= v2[j] *tmp0; 2261 sum3 -= v3[j] *tmp0; 2262 sum4 -= v4[j] *tmp0; 2263 } 2264 sum2 -= v2[nz] * sum1; 2265 sum3 -= v3[nz] * sum1; 2266 sum4 -= v4[nz] * sum1; 2267 sum3 -= v3[nz+1] * sum2; 2268 sum4 -= v4[nz+1] * sum2; 2269 sum4 -= v4[nz+2] * sum3; 2270 2271 tmp[row ++]=sum1; 2272 tmp[row ++]=sum2; 2273 tmp[row ++]=sum3; 2274 tmp[row ++]=sum4; 2275 break; 2276 case 5: 2277 sum1 = b[r[row]]; 2278 sum2 = b[r[row+1]]; 2279 sum3 = b[r[row+2]]; 2280 sum4 = b[r[row+3]]; 2281 sum5 = b[r[row+4]]; 2282 v2 = aa + ai[row+1]; 2283 v3 = aa + ai[row+2]; 2284 v4 = aa + ai[row+3]; 2285 v5 = aa + ai[row+4]; 2286 2287 for (j=0; j<nz-1; j+=2){ 2288 i0 = vi[j]; 2289 i1 = vi[j+1]; 2290 tmp0 = tmps[i0]; 2291 tmp1 = tmps[i1]; 2292 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2293 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2294 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2295 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2296 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1; 2297 } 2298 if (j == nz-1){ 2299 tmp0 = tmps[vi[j]]; 2300 sum1 -= v1[j] *tmp0; 2301 sum2 -= v2[j] *tmp0; 2302 sum3 -= v3[j] *tmp0; 2303 sum4 -= v4[j] *tmp0; 2304 sum5 -= v5[j] *tmp0; 2305 } 2306 2307 sum2 -= v2[nz] * sum1; 2308 sum3 -= v3[nz] * sum1; 2309 sum4 -= v4[nz] * sum1; 2310 sum5 -= v5[nz] * sum1; 2311 sum3 -= v3[nz+1] * sum2; 2312 sum4 -= v4[nz+1] * sum2; 2313 sum5 -= v5[nz+1] * sum2; 2314 sum4 -= v4[nz+2] * sum3; 2315 sum5 -= v5[nz+2] * sum3; 2316 sum5 -= v5[nz+3] * sum4; 2317 2318 tmp[row ++]=sum1; 2319 tmp[row ++]=sum2; 2320 tmp[row ++]=sum3; 2321 tmp[row ++]=sum4; 2322 tmp[row ++]=sum5; 2323 break; 2324 default: 2325 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 2326 } 2327 } 2328 /* backward solve the upper triangular */ 2329 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 2330 nsz = ns[i]; 2331 aii = ad[row+1] + 1; 2332 v1 = aa + aii; 2333 vi = aj + aii; 2334 nz = ad[row]- ad[row+1] - 1; 2335 2336 if (i > 0) { 2337 /* Prefetch the indices for the next block */ 2338 PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,0); /* indices */ 2339 /* Prefetch the data for the next block */ 2340 PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,0); 2341 } 2342 2343 switch (nsz){ /* Each loop in 'case' is unrolled */ 2344 case 1 : 2345 sum1 = tmp[row]; 2346 2347 for(j=0 ; j<nz-1; j+=2){ 2348 i0 = vi[j]; 2349 i1 = vi[j+1]; 2350 tmp0 = tmps[i0]; 2351 tmp1 = tmps[i1]; 2352 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2353 } 2354 if (j == nz-1){ 2355 tmp0 = tmps[vi[j]]; 2356 sum1 -= v1[j]*tmp0; 2357 } 2358 x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2359 break; 2360 case 2 : 2361 sum1 = tmp[row]; 2362 sum2 = tmp[row-1]; 2363 v2 = aa + ad[row] + 1; 2364 for (j=0 ; j<nz-1; j+=2){ 2365 i0 = vi[j]; 2366 i1 = vi[j+1]; 2367 tmp0 = tmps[i0]; 2368 tmp1 = tmps[i1]; 2369 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2370 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2371 } 2372 if (j == nz-1){ 2373 tmp0 = tmps[vi[j]]; 2374 sum1 -= v1[j]* tmp0; 2375 sum2 -= v2[j+1]* tmp0; 2376 } 2377 2378 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2379 sum2 -= v2[0] * tmp0; 2380 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2381 break; 2382 case 3 : 2383 sum1 = tmp[row]; 2384 sum2 = tmp[row -1]; 2385 sum3 = tmp[row -2]; 2386 v2 = aa + ad[row] + 1; 2387 v3 = aa + ad[row -1] + 1; 2388 for (j=0 ; j<nz-1; j+=2){ 2389 i0 = vi[j]; 2390 i1 = vi[j+1]; 2391 tmp0 = tmps[i0]; 2392 tmp1 = tmps[i1]; 2393 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2394 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2395 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2396 } 2397 if (j== nz-1){ 2398 tmp0 = tmps[vi[j]]; 2399 sum1 -= v1[j] * tmp0; 2400 sum2 -= v2[j+1] * tmp0; 2401 sum3 -= v3[j+2] * tmp0; 2402 } 2403 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2404 sum2 -= v2[0]* tmp0; 2405 sum3 -= v3[1] * tmp0; 2406 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2407 sum3 -= v3[0]* tmp0; 2408 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2409 2410 break; 2411 case 4 : 2412 sum1 = tmp[row]; 2413 sum2 = tmp[row -1]; 2414 sum3 = tmp[row -2]; 2415 sum4 = tmp[row -3]; 2416 v2 = aa + ad[row]+1; 2417 v3 = aa + ad[row -1]+1; 2418 v4 = aa + ad[row -2]+1; 2419 2420 for (j=0 ; j<nz-1; j+=2){ 2421 i0 = vi[j]; 2422 i1 = vi[j+1]; 2423 tmp0 = tmps[i0]; 2424 tmp1 = tmps[i1]; 2425 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2426 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2427 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2428 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2429 } 2430 if (j== nz-1){ 2431 tmp0 = tmps[vi[j]]; 2432 sum1 -= v1[j] * tmp0; 2433 sum2 -= v2[j+1] * tmp0; 2434 sum3 -= v3[j+2] * tmp0; 2435 sum4 -= v4[j+3] * tmp0; 2436 } 2437 2438 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2439 sum2 -= v2[0] * tmp0; 2440 sum3 -= v3[1] * tmp0; 2441 sum4 -= v4[2] * tmp0; 2442 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2443 sum3 -= v3[0] * tmp0; 2444 sum4 -= v4[1] * tmp0; 2445 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2446 sum4 -= v4[0] * tmp0; 2447 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2448 break; 2449 case 5 : 2450 sum1 = tmp[row]; 2451 sum2 = tmp[row -1]; 2452 sum3 = tmp[row -2]; 2453 sum4 = tmp[row -3]; 2454 sum5 = tmp[row -4]; 2455 v2 = aa + ad[row]+1; 2456 v3 = aa + ad[row -1]+1; 2457 v4 = aa + ad[row -2]+1; 2458 v5 = aa + ad[row -3]+1; 2459 for (j=0 ; j<nz-1; j+=2){ 2460 i0 = vi[j]; 2461 i1 = vi[j+1]; 2462 tmp0 = tmps[i0]; 2463 tmp1 = tmps[i1]; 2464 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2465 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2466 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2467 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2468 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1; 2469 } 2470 if (j==nz-1){ 2471 tmp0 = tmps[vi[j]]; 2472 sum1 -= v1[j] * tmp0; 2473 sum2 -= v2[j+1] * tmp0; 2474 sum3 -= v3[j+2] * tmp0; 2475 sum4 -= v4[j+3] * tmp0; 2476 sum5 -= v5[j+4] * tmp0; 2477 } 2478 2479 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2480 sum2 -= v2[0] * tmp0; 2481 sum3 -= v3[1] * tmp0; 2482 sum4 -= v4[2] * tmp0; 2483 sum5 -= v5[3] * tmp0; 2484 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2485 sum3 -= v3[0] * tmp0; 2486 sum4 -= v4[1] * tmp0; 2487 sum5 -= v5[2] * tmp0; 2488 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2489 sum4 -= v4[0] * tmp0; 2490 sum5 -= v5[1] * tmp0; 2491 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2492 sum5 -= v5[0] * tmp0; 2493 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--; 2494 break; 2495 default: 2496 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 2497 } 2498 } 2499 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2500 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2501 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2502 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2503 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 2504 PetscFunctionReturn(0); 2505 } 2506 2507 2508 /* 2509 Makes a longer coloring[] array and calls the usual code with that 2510 */ 2511 #undef __FUNCT__ 2512 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode" 2513 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 2514 { 2515 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 2516 PetscErrorCode ierr; 2517 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 2518 PetscInt *colorused,i; 2519 ISColoringValue *newcolor; 2520 2521 PetscFunctionBegin; 2522 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr); 2523 /* loop over inodes, marking a color for each column*/ 2524 row = 0; 2525 for (i=0; i<m; i++){ 2526 for (j=0; j<ns[i]; j++) { 2527 newcolor[row++] = coloring[i] + j*ncolors; 2528 } 2529 } 2530 2531 /* eliminate unneeded colors */ 2532 ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr); 2533 ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr); 2534 for (i=0; i<n; i++) { 2535 colorused[newcolor[i]] = 1; 2536 } 2537 2538 for (i=1; i<5*ncolors; i++) { 2539 colorused[i] += colorused[i-1]; 2540 } 2541 ncolors = colorused[5*ncolors-1]; 2542 for (i=0; i<n; i++) { 2543 newcolor[i] = colorused[newcolor[i]]-1; 2544 } 2545 ierr = PetscFree(colorused);CHKERRQ(ierr); 2546 ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); 2547 ierr = PetscFree(coloring);CHKERRQ(ierr); 2548 PetscFunctionReturn(0); 2549 } 2550 2551 #include "../src/mat/blockinvert.h" 2552 2553 #undef __FUNCT__ 2554 #define __FUNCT__ "MatSOR_SeqAIJ_Inode" 2555 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 2556 { 2557 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2558 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3; 2559 MatScalar *ibdiag,*bdiag,work[25]; 2560 PetscScalar *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5; 2561 const MatScalar *v = a->a,*v1,*v2,*v3,*v4,*v5; 2562 PetscReal zeropivot = 1.0e-15, shift = 0.0; 2563 PetscErrorCode ierr; 2564 PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2; 2565 PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5]; 2566 2567 PetscFunctionBegin; 2568 if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 2569 if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 2570 if (its > 1) { 2571 /* switch to non-inode version */ 2572 ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr); 2573 PetscFunctionReturn(0); 2574 } 2575 2576 if (!a->inode.ibdiagvalid) { 2577 if (!a->inode.ibdiag) { 2578 /* calculate space needed for diagonal blocks */ 2579 for (i=0; i<m; i++) { 2580 cnt += sizes[i]*sizes[i]; 2581 } 2582 a->inode.bdiagsize = cnt; 2583 ierr = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr); 2584 } 2585 2586 /* copy over the diagonal blocks and invert them */ 2587 ibdiag = a->inode.ibdiag; 2588 bdiag = a->inode.bdiag; 2589 cnt = 0; 2590 for (i=0, row = 0; i<m; i++) { 2591 for (j=0; j<sizes[i]; j++) { 2592 for (k=0; k<sizes[i]; k++) { 2593 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 2594 } 2595 } 2596 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 2597 2598 switch(sizes[i]) { 2599 case 1: 2600 /* Create matrix data structure */ 2601 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 2602 ibdiag[cnt] = 1.0/ibdiag[cnt]; 2603 break; 2604 case 2: 2605 ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr); 2606 break; 2607 case 3: 2608 ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr); 2609 break; 2610 case 4: 2611 ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr); 2612 break; 2613 case 5: 2614 ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr); 2615 break; 2616 default: 2617 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2618 } 2619 cnt += sizes[i]*sizes[i]; 2620 row += sizes[i]; 2621 } 2622 a->inode.ibdiagvalid = PETSC_TRUE; 2623 } 2624 ibdiag = a->inode.ibdiag; 2625 bdiag = a->inode.bdiag; 2626 2627 2628 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2629 if (flag & SOR_ZERO_INITIAL_GUESS) { 2630 PetscScalar *b; 2631 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2632 if (xx != bb) { 2633 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2634 } else { 2635 b = x; 2636 } 2637 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 2638 2639 for (i=0, row=0; i<m; i++) { 2640 sz = diag[row] - ii[row]; 2641 v1 = a->a + ii[row]; 2642 idx = a->j + ii[row]; 2643 2644 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2645 switch (sizes[i]){ 2646 case 1: 2647 2648 sum1 = b[row]; 2649 for(n = 0; n<sz-1; n+=2) { 2650 i1 = idx[0]; 2651 i2 = idx[1]; 2652 idx += 2; 2653 tmp0 = x[i1]; 2654 tmp1 = x[i2]; 2655 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2656 } 2657 2658 if (n == sz-1){ 2659 tmp0 = x[*idx]; 2660 sum1 -= *v1 * tmp0; 2661 } 2662 x[row++] = sum1*(*ibdiag++); 2663 break; 2664 case 2: 2665 v2 = a->a + ii[row+1]; 2666 sum1 = b[row]; 2667 sum2 = b[row+1]; 2668 for(n = 0; n<sz-1; n+=2) { 2669 i1 = idx[0]; 2670 i2 = idx[1]; 2671 idx += 2; 2672 tmp0 = x[i1]; 2673 tmp1 = x[i2]; 2674 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2675 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2676 } 2677 2678 if (n == sz-1){ 2679 tmp0 = x[*idx]; 2680 sum1 -= v1[0] * tmp0; 2681 sum2 -= v2[0] * tmp0; 2682 } 2683 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2684 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2685 ibdiag += 4; 2686 break; 2687 case 3: 2688 v2 = a->a + ii[row+1]; 2689 v3 = a->a + ii[row+2]; 2690 sum1 = b[row]; 2691 sum2 = b[row+1]; 2692 sum3 = b[row+2]; 2693 for(n = 0; n<sz-1; n+=2) { 2694 i1 = idx[0]; 2695 i2 = idx[1]; 2696 idx += 2; 2697 tmp0 = x[i1]; 2698 tmp1 = x[i2]; 2699 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2700 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2701 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2702 } 2703 2704 if (n == sz-1){ 2705 tmp0 = x[*idx]; 2706 sum1 -= v1[0] * tmp0; 2707 sum2 -= v2[0] * tmp0; 2708 sum3 -= v3[0] * tmp0; 2709 } 2710 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2711 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2712 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2713 ibdiag += 9; 2714 break; 2715 case 4: 2716 v2 = a->a + ii[row+1]; 2717 v3 = a->a + ii[row+2]; 2718 v4 = a->a + ii[row+3]; 2719 sum1 = b[row]; 2720 sum2 = b[row+1]; 2721 sum3 = b[row+2]; 2722 sum4 = b[row+3]; 2723 for(n = 0; n<sz-1; n+=2) { 2724 i1 = idx[0]; 2725 i2 = idx[1]; 2726 idx += 2; 2727 tmp0 = x[i1]; 2728 tmp1 = x[i2]; 2729 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2730 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2731 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2732 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2733 } 2734 2735 if (n == sz-1){ 2736 tmp0 = x[*idx]; 2737 sum1 -= v1[0] * tmp0; 2738 sum2 -= v2[0] * tmp0; 2739 sum3 -= v3[0] * tmp0; 2740 sum4 -= v4[0] * tmp0; 2741 } 2742 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2743 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2744 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2745 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2746 ibdiag += 16; 2747 break; 2748 case 5: 2749 v2 = a->a + ii[row+1]; 2750 v3 = a->a + ii[row+2]; 2751 v4 = a->a + ii[row+3]; 2752 v5 = a->a + ii[row+4]; 2753 sum1 = b[row]; 2754 sum2 = b[row+1]; 2755 sum3 = b[row+2]; 2756 sum4 = b[row+3]; 2757 sum5 = b[row+4]; 2758 for(n = 0; n<sz-1; n+=2) { 2759 i1 = idx[0]; 2760 i2 = idx[1]; 2761 idx += 2; 2762 tmp0 = x[i1]; 2763 tmp1 = x[i2]; 2764 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2765 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2766 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2767 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2768 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2769 } 2770 2771 if (n == sz-1){ 2772 tmp0 = x[*idx]; 2773 sum1 -= v1[0] * tmp0; 2774 sum2 -= v2[0] * tmp0; 2775 sum3 -= v3[0] * tmp0; 2776 sum4 -= v4[0] * tmp0; 2777 sum5 -= v5[0] * tmp0; 2778 } 2779 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2780 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2781 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2782 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2783 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2784 ibdiag += 25; 2785 break; 2786 default: 2787 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2788 } 2789 } 2790 2791 xb = x; 2792 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2793 } else xb = b; 2794 if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) && 2795 (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) { 2796 cnt = 0; 2797 for (i=0, row=0; i<m; i++) { 2798 2799 switch (sizes[i]){ 2800 case 1: 2801 x[row++] *= bdiag[cnt++]; 2802 break; 2803 case 2: 2804 x1 = x[row]; x2 = x[row+1]; 2805 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 2806 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 2807 x[row++] = tmp1; 2808 x[row++] = tmp2; 2809 cnt += 4; 2810 break; 2811 case 3: 2812 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 2813 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 2814 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 2815 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 2816 x[row++] = tmp1; 2817 x[row++] = tmp2; 2818 x[row++] = tmp3; 2819 cnt += 9; 2820 break; 2821 case 4: 2822 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 2823 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 2824 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 2825 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 2826 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 2827 x[row++] = tmp1; 2828 x[row++] = tmp2; 2829 x[row++] = tmp3; 2830 x[row++] = tmp4; 2831 cnt += 16; 2832 break; 2833 case 5: 2834 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 2835 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 2836 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 2837 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 2838 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 2839 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 2840 x[row++] = tmp1; 2841 x[row++] = tmp2; 2842 x[row++] = tmp3; 2843 x[row++] = tmp4; 2844 x[row++] = tmp5; 2845 cnt += 25; 2846 break; 2847 default: 2848 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2849 } 2850 } 2851 ierr = PetscLogFlops(m);CHKERRQ(ierr); 2852 } 2853 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 2854 2855 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 2856 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 2857 ibdiag -= sizes[i]*sizes[i]; 2858 sz = ii[row+1] - diag[row] - 1; 2859 v1 = a->a + diag[row] + 1; 2860 idx = a->j + diag[row] + 1; 2861 2862 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2863 switch (sizes[i]){ 2864 case 1: 2865 2866 sum1 = xb[row]; 2867 for(n = 0; n<sz-1; n+=2) { 2868 i1 = idx[0]; 2869 i2 = idx[1]; 2870 idx += 2; 2871 tmp0 = x[i1]; 2872 tmp1 = x[i2]; 2873 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2874 } 2875 2876 if (n == sz-1){ 2877 tmp0 = x[*idx]; 2878 sum1 -= *v1*tmp0; 2879 } 2880 x[row--] = sum1*(*ibdiag); 2881 break; 2882 2883 case 2: 2884 2885 sum1 = xb[row]; 2886 sum2 = xb[row-1]; 2887 /* note that sum1 is associated with the second of the two rows */ 2888 v2 = a->a + diag[row-1] + 2; 2889 for(n = 0; n<sz-1; n+=2) { 2890 i1 = idx[0]; 2891 i2 = idx[1]; 2892 idx += 2; 2893 tmp0 = x[i1]; 2894 tmp1 = x[i2]; 2895 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2896 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2897 } 2898 2899 if (n == sz-1){ 2900 tmp0 = x[*idx]; 2901 sum1 -= *v1*tmp0; 2902 sum2 -= *v2*tmp0; 2903 } 2904 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 2905 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 2906 break; 2907 case 3: 2908 2909 sum1 = xb[row]; 2910 sum2 = xb[row-1]; 2911 sum3 = xb[row-2]; 2912 v2 = a->a + diag[row-1] + 2; 2913 v3 = a->a + diag[row-2] + 3; 2914 for(n = 0; n<sz-1; n+=2) { 2915 i1 = idx[0]; 2916 i2 = idx[1]; 2917 idx += 2; 2918 tmp0 = x[i1]; 2919 tmp1 = x[i2]; 2920 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2921 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2922 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2923 } 2924 2925 if (n == sz-1){ 2926 tmp0 = x[*idx]; 2927 sum1 -= *v1*tmp0; 2928 sum2 -= *v2*tmp0; 2929 sum3 -= *v3*tmp0; 2930 } 2931 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 2932 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 2933 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 2934 break; 2935 case 4: 2936 2937 sum1 = xb[row]; 2938 sum2 = xb[row-1]; 2939 sum3 = xb[row-2]; 2940 sum4 = xb[row-3]; 2941 v2 = a->a + diag[row-1] + 2; 2942 v3 = a->a + diag[row-2] + 3; 2943 v4 = a->a + diag[row-3] + 4; 2944 for(n = 0; n<sz-1; n+=2) { 2945 i1 = idx[0]; 2946 i2 = idx[1]; 2947 idx += 2; 2948 tmp0 = x[i1]; 2949 tmp1 = x[i2]; 2950 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2951 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2952 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2953 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2954 } 2955 2956 if (n == sz-1){ 2957 tmp0 = x[*idx]; 2958 sum1 -= *v1*tmp0; 2959 sum2 -= *v2*tmp0; 2960 sum3 -= *v3*tmp0; 2961 sum4 -= *v4*tmp0; 2962 } 2963 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 2964 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 2965 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 2966 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 2967 break; 2968 case 5: 2969 2970 sum1 = xb[row]; 2971 sum2 = xb[row-1]; 2972 sum3 = xb[row-2]; 2973 sum4 = xb[row-3]; 2974 sum5 = xb[row-4]; 2975 v2 = a->a + diag[row-1] + 2; 2976 v3 = a->a + diag[row-2] + 3; 2977 v4 = a->a + diag[row-3] + 4; 2978 v5 = a->a + diag[row-4] + 5; 2979 for(n = 0; n<sz-1; n+=2) { 2980 i1 = idx[0]; 2981 i2 = idx[1]; 2982 idx += 2; 2983 tmp0 = x[i1]; 2984 tmp1 = x[i2]; 2985 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2986 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2987 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2988 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2989 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2990 } 2991 2992 if (n == sz-1){ 2993 tmp0 = x[*idx]; 2994 sum1 -= *v1*tmp0; 2995 sum2 -= *v2*tmp0; 2996 sum3 -= *v3*tmp0; 2997 sum4 -= *v4*tmp0; 2998 sum5 -= *v5*tmp0; 2999 } 3000 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3001 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3002 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3003 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3004 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3005 break; 3006 default: 3007 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3008 } 3009 } 3010 3011 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3012 } 3013 its--; 3014 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3015 if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);} 3016 } 3017 if (flag & SOR_EISENSTAT) { 3018 const PetscScalar *b; 3019 MatScalar *t = a->inode.ssor_work; 3020 3021 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3022 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3023 /* 3024 Apply (U + D)^-1 where D is now the block diagonal 3025 */ 3026 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3027 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3028 ibdiag -= sizes[i]*sizes[i]; 3029 sz = ii[row+1] - diag[row] - 1; 3030 v1 = a->a + diag[row] + 1; 3031 idx = a->j + diag[row] + 1; 3032 CHKMEMQ; 3033 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3034 switch (sizes[i]){ 3035 case 1: 3036 3037 sum1 = b[row]; 3038 for(n = 0; n<sz-1; n+=2) { 3039 i1 = idx[0]; 3040 i2 = idx[1]; 3041 idx += 2; 3042 tmp0 = x[i1]; 3043 tmp1 = x[i2]; 3044 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3045 } 3046 3047 if (n == sz-1){ 3048 tmp0 = x[*idx]; 3049 sum1 -= *v1*tmp0; 3050 } 3051 x[row] = sum1*(*ibdiag);row--; 3052 break; 3053 3054 case 2: 3055 3056 sum1 = b[row]; 3057 sum2 = b[row-1]; 3058 /* note that sum1 is associated with the second of the two rows */ 3059 v2 = a->a + diag[row-1] + 2; 3060 for(n = 0; n<sz-1; n+=2) { 3061 i1 = idx[0]; 3062 i2 = idx[1]; 3063 idx += 2; 3064 tmp0 = x[i1]; 3065 tmp1 = x[i2]; 3066 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3067 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3068 } 3069 3070 if (n == sz-1){ 3071 tmp0 = x[*idx]; 3072 sum1 -= *v1*tmp0; 3073 sum2 -= *v2*tmp0; 3074 } 3075 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3076 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3077 row -= 2; 3078 break; 3079 case 3: 3080 3081 sum1 = b[row]; 3082 sum2 = b[row-1]; 3083 sum3 = b[row-2]; 3084 v2 = a->a + diag[row-1] + 2; 3085 v3 = a->a + diag[row-2] + 3; 3086 for(n = 0; n<sz-1; n+=2) { 3087 i1 = idx[0]; 3088 i2 = idx[1]; 3089 idx += 2; 3090 tmp0 = x[i1]; 3091 tmp1 = x[i2]; 3092 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3093 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3094 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3095 } 3096 3097 if (n == sz-1){ 3098 tmp0 = x[*idx]; 3099 sum1 -= *v1*tmp0; 3100 sum2 -= *v2*tmp0; 3101 sum3 -= *v3*tmp0; 3102 } 3103 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3104 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3105 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3106 row -= 3; 3107 break; 3108 case 4: 3109 3110 sum1 = b[row]; 3111 sum2 = b[row-1]; 3112 sum3 = b[row-2]; 3113 sum4 = b[row-3]; 3114 v2 = a->a + diag[row-1] + 2; 3115 v3 = a->a + diag[row-2] + 3; 3116 v4 = a->a + diag[row-3] + 4; 3117 for(n = 0; n<sz-1; n+=2) { 3118 i1 = idx[0]; 3119 i2 = idx[1]; 3120 idx += 2; 3121 tmp0 = x[i1]; 3122 tmp1 = x[i2]; 3123 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3124 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3125 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3126 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3127 } 3128 3129 if (n == sz-1){ 3130 tmp0 = x[*idx]; 3131 sum1 -= *v1*tmp0; 3132 sum2 -= *v2*tmp0; 3133 sum3 -= *v3*tmp0; 3134 sum4 -= *v4*tmp0; 3135 } 3136 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3137 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3138 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3139 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3140 row -= 4; 3141 break; 3142 case 5: 3143 3144 sum1 = b[row]; 3145 sum2 = b[row-1]; 3146 sum3 = b[row-2]; 3147 sum4 = b[row-3]; 3148 sum5 = b[row-4]; 3149 v2 = a->a + diag[row-1] + 2; 3150 v3 = a->a + diag[row-2] + 3; 3151 v4 = a->a + diag[row-3] + 4; 3152 v5 = a->a + diag[row-4] + 5; 3153 for(n = 0; n<sz-1; n+=2) { 3154 i1 = idx[0]; 3155 i2 = idx[1]; 3156 idx += 2; 3157 tmp0 = x[i1]; 3158 tmp1 = x[i2]; 3159 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3160 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3161 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3162 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3163 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3164 } 3165 3166 if (n == sz-1){ 3167 tmp0 = x[*idx]; 3168 sum1 -= *v1*tmp0; 3169 sum2 -= *v2*tmp0; 3170 sum3 -= *v3*tmp0; 3171 sum4 -= *v4*tmp0; 3172 sum5 -= *v5*tmp0; 3173 } 3174 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3175 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3176 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3177 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3178 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3179 row -= 5; 3180 break; 3181 default: 3182 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3183 } 3184 CHKMEMQ; 3185 } 3186 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3187 3188 /* 3189 t = b - D x where D is the block diagonal 3190 */ 3191 cnt = 0; 3192 for (i=0, row=0; i<m; i++) { 3193 CHKMEMQ; 3194 switch (sizes[i]){ 3195 case 1: 3196 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 3197 break; 3198 case 2: 3199 x1 = x[row]; x2 = x[row+1]; 3200 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3201 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3202 t[row] = b[row] - tmp1; 3203 t[row+1] = b[row+1] - tmp2; row += 2; 3204 cnt += 4; 3205 break; 3206 case 3: 3207 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3208 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3209 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3210 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3211 t[row] = b[row] - tmp1; 3212 t[row+1] = b[row+1] - tmp2; 3213 t[row+2] = b[row+2] - tmp3; row += 3; 3214 cnt += 9; 3215 break; 3216 case 4: 3217 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3218 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3219 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3220 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3221 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3222 t[row] = b[row] - tmp1; 3223 t[row+1] = b[row+1] - tmp2; 3224 t[row+2] = b[row+2] - tmp3; 3225 t[row+3] = b[row+3] - tmp4; row += 4; 3226 cnt += 16; 3227 break; 3228 case 5: 3229 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3230 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3231 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3232 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3233 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3234 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3235 t[row] = b[row] - tmp1; 3236 t[row+1] = b[row+1] - tmp2; 3237 t[row+2] = b[row+2] - tmp3; 3238 t[row+3] = b[row+3] - tmp4; 3239 t[row+4] = b[row+4] - tmp5;row += 5; 3240 cnt += 25; 3241 break; 3242 default: 3243 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3244 } 3245 CHKMEMQ; 3246 } 3247 ierr = PetscLogFlops(m);CHKERRQ(ierr); 3248 3249 3250 3251 /* 3252 Apply (L + D)^-1 where D is the block diagonal 3253 */ 3254 for (i=0, row=0; i<m; i++) { 3255 sz = diag[row] - ii[row]; 3256 v1 = a->a + ii[row]; 3257 idx = a->j + ii[row]; 3258 CHKMEMQ; 3259 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3260 switch (sizes[i]){ 3261 case 1: 3262 3263 sum1 = t[row]; 3264 for(n = 0; n<sz-1; n+=2) { 3265 i1 = idx[0]; 3266 i2 = idx[1]; 3267 idx += 2; 3268 tmp0 = t[i1]; 3269 tmp1 = t[i2]; 3270 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3271 } 3272 3273 if (n == sz-1){ 3274 tmp0 = t[*idx]; 3275 sum1 -= *v1 * tmp0; 3276 } 3277 x[row] += t[row] = sum1*(*ibdiag++); row++; 3278 break; 3279 case 2: 3280 v2 = a->a + ii[row+1]; 3281 sum1 = t[row]; 3282 sum2 = t[row+1]; 3283 for(n = 0; n<sz-1; n+=2) { 3284 i1 = idx[0]; 3285 i2 = idx[1]; 3286 idx += 2; 3287 tmp0 = t[i1]; 3288 tmp1 = t[i2]; 3289 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3290 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3291 } 3292 3293 if (n == sz-1){ 3294 tmp0 = t[*idx]; 3295 sum1 -= v1[0] * tmp0; 3296 sum2 -= v2[0] * tmp0; 3297 } 3298 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3299 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3300 ibdiag += 4; row += 2; 3301 break; 3302 case 3: 3303 v2 = a->a + ii[row+1]; 3304 v3 = a->a + ii[row+2]; 3305 sum1 = t[row]; 3306 sum2 = t[row+1]; 3307 sum3 = t[row+2]; 3308 for(n = 0; n<sz-1; n+=2) { 3309 i1 = idx[0]; 3310 i2 = idx[1]; 3311 idx += 2; 3312 tmp0 = t[i1]; 3313 tmp1 = t[i2]; 3314 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3315 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3316 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3317 } 3318 3319 if (n == sz-1){ 3320 tmp0 = t[*idx]; 3321 sum1 -= v1[0] * tmp0; 3322 sum2 -= v2[0] * tmp0; 3323 sum3 -= v3[0] * tmp0; 3324 } 3325 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3326 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3327 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3328 ibdiag += 9; row += 3; 3329 break; 3330 case 4: 3331 v2 = a->a + ii[row+1]; 3332 v3 = a->a + ii[row+2]; 3333 v4 = a->a + ii[row+3]; 3334 sum1 = t[row]; 3335 sum2 = t[row+1]; 3336 sum3 = t[row+2]; 3337 sum4 = t[row+3]; 3338 for(n = 0; n<sz-1; n+=2) { 3339 i1 = idx[0]; 3340 i2 = idx[1]; 3341 idx += 2; 3342 tmp0 = t[i1]; 3343 tmp1 = t[i2]; 3344 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3345 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3346 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3347 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3348 } 3349 3350 if (n == sz-1){ 3351 tmp0 = t[*idx]; 3352 sum1 -= v1[0] * tmp0; 3353 sum2 -= v2[0] * tmp0; 3354 sum3 -= v3[0] * tmp0; 3355 sum4 -= v4[0] * tmp0; 3356 } 3357 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3358 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3359 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3360 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3361 ibdiag += 16; row += 4; 3362 break; 3363 case 5: 3364 v2 = a->a + ii[row+1]; 3365 v3 = a->a + ii[row+2]; 3366 v4 = a->a + ii[row+3]; 3367 v5 = a->a + ii[row+4]; 3368 sum1 = t[row]; 3369 sum2 = t[row+1]; 3370 sum3 = t[row+2]; 3371 sum4 = t[row+3]; 3372 sum5 = t[row+4]; 3373 for(n = 0; n<sz-1; n+=2) { 3374 i1 = idx[0]; 3375 i2 = idx[1]; 3376 idx += 2; 3377 tmp0 = t[i1]; 3378 tmp1 = t[i2]; 3379 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3380 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3381 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3382 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3383 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3384 } 3385 3386 if (n == sz-1){ 3387 tmp0 = t[*idx]; 3388 sum1 -= v1[0] * tmp0; 3389 sum2 -= v2[0] * tmp0; 3390 sum3 -= v3[0] * tmp0; 3391 sum4 -= v4[0] * tmp0; 3392 sum5 -= v5[0] * tmp0; 3393 } 3394 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3395 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3396 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3397 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3398 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3399 ibdiag += 25; row += 5; 3400 break; 3401 default: 3402 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3403 } 3404 CHKMEMQ; 3405 } 3406 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3407 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3408 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3409 } 3410 PetscFunctionReturn(0); 3411 } 3412 3413 #undef __FUNCT__ 3414 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode" 3415 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 3416 { 3417 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3418 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 3419 const MatScalar *bdiag = a->inode.bdiag; 3420 const PetscScalar *b; 3421 PetscErrorCode ierr; 3422 PetscInt m = a->inode.node_count,cnt = 0,i,row; 3423 const PetscInt *sizes = a->inode.size; 3424 3425 PetscFunctionBegin; 3426 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3427 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3428 cnt = 0; 3429 for (i=0, row=0; i<m; i++) { 3430 switch (sizes[i]){ 3431 case 1: 3432 x[row] = b[row]*bdiag[cnt++];row++; 3433 break; 3434 case 2: 3435 x1 = b[row]; x2 = b[row+1]; 3436 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3437 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3438 x[row++] = tmp1; 3439 x[row++] = tmp2; 3440 cnt += 4; 3441 break; 3442 case 3: 3443 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 3444 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3445 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3446 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3447 x[row++] = tmp1; 3448 x[row++] = tmp2; 3449 x[row++] = tmp3; 3450 cnt += 9; 3451 break; 3452 case 4: 3453 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 3454 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3455 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3456 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3457 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3458 x[row++] = tmp1; 3459 x[row++] = tmp2; 3460 x[row++] = tmp3; 3461 x[row++] = tmp4; 3462 cnt += 16; 3463 break; 3464 case 5: 3465 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 3466 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3467 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3468 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3469 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3470 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3471 x[row++] = tmp1; 3472 x[row++] = tmp2; 3473 x[row++] = tmp3; 3474 x[row++] = tmp4; 3475 x[row++] = tmp5; 3476 cnt += 25; 3477 break; 3478 default: 3479 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3480 } 3481 } 3482 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 3483 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3484 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3485 PetscFunctionReturn(0); 3486 } 3487 3488 /* 3489 samestructure indicates that the matrix has not changed its nonzero structure so we 3490 do not need to recompute the inodes 3491 */ 3492 #undef __FUNCT__ 3493 #define __FUNCT__ "Mat_CheckInode" 3494 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure) 3495 { 3496 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3497 PetscErrorCode ierr; 3498 PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size; 3499 PetscTruth flag; 3500 3501 PetscFunctionBegin; 3502 if (!a->inode.use) PetscFunctionReturn(0); 3503 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 3504 3505 3506 m = A->rmap->n; 3507 if (a->inode.size) {ns = a->inode.size;} 3508 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 3509 3510 i = 0; 3511 node_count = 0; 3512 idx = a->j; 3513 ii = a->i; 3514 while (i < m){ /* For each row */ 3515 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 3516 /* Limits the number of elements in a node to 'a->inode.limit' */ 3517 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 3518 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 3519 if (nzy != nzx) break; 3520 idy += nzx; /* Same nonzero pattern */ 3521 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 3522 if (!flag) break; 3523 } 3524 ns[node_count++] = blk_size; 3525 idx += blk_size*nzx; 3526 i = j; 3527 } 3528 /* If not enough inodes found,, do not use inode version of the routines */ 3529 if (!a->inode.size && m && node_count > .9*m) { 3530 ierr = PetscFree(ns);CHKERRQ(ierr); 3531 a->inode.node_count = 0; 3532 a->inode.size = PETSC_NULL; 3533 a->inode.use = PETSC_FALSE; 3534 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 3535 } else { 3536 A->ops->mult = MatMult_SeqAIJ_Inode; 3537 A->ops->sor = MatSOR_SeqAIJ_Inode; 3538 A->ops->multadd = MatMultAdd_SeqAIJ_Inode; 3539 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 3540 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 3541 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 3542 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 3543 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 3544 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 3545 A->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJ_Inode_inplace; 3546 a->inode.node_count = node_count; 3547 a->inode.size = ns; 3548 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 3549 } 3550 PetscFunctionReturn(0); 3551 } 3552 3553 #define MatGetRow_FactoredLU(cols,nzl,nzu,nz,ai,aj,adiag,row) { \ 3554 PetscInt __k, *__vi; \ 3555 __vi = aj + ai[row]; \ 3556 for(__k=0;__k<nzl;__k++) cols[__k] = __vi[__k]; \ 3557 __vi = aj + adiag[row]; \ 3558 cols[nzl] = __vi[0];\ 3559 __vi = aj + adiag[row+1]+1;\ 3560 for(__k=0;__k<nzu;__k++) cols[nzl+1+__k] = __vi[__k];} 3561 3562 3563 /* 3564 Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix. 3565 Modified from Mat_CheckInode(). 3566 3567 Input Parameters: 3568 + Mat A - ILU or LU matrix factor 3569 - samestructure - TURE indicates that the matrix has not changed its nonzero structure so we 3570 do not need to recompute the inodes 3571 */ 3572 #undef __FUNCT__ 3573 #define __FUNCT__ "Mat_CheckInode_FactorLU" 3574 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscTruth samestructure) 3575 { 3576 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3577 PetscErrorCode ierr; 3578 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; 3579 PetscInt *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag; 3580 PetscTruth flag; 3581 3582 PetscFunctionBegin; 3583 if (!a->inode.use) PetscFunctionReturn(0); 3584 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 3585 3586 m = A->rmap->n; 3587 if (a->inode.size) {ns = a->inode.size;} 3588 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 3589 3590 i = 0; 3591 node_count = 0; 3592 while (i < m){ /* For each row */ 3593 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ 3594 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ 3595 nzx = nzl1 + nzu1 + 1; 3596 ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr); 3597 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); 3598 3599 /* Limits the number of elements in a node to 'a->inode.limit' */ 3600 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 3601 nzl2 = ai[j+1] - ai[j]; 3602 nzu2 = adiag[j] - adiag[j+1] - 1; 3603 nzy = nzl2 + nzu2 + 1; 3604 if( nzy != nzx) break; 3605 ierr = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr); 3606 MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j); 3607 ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 3608 if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;} 3609 ierr = PetscFree(cols2);CHKERRQ(ierr); 3610 } 3611 ns[node_count++] = blk_size; 3612 ierr = PetscFree(cols1);CHKERRQ(ierr); 3613 i = j; 3614 } 3615 /* If not enough inodes found,, do not use inode version of the routines */ 3616 if (!a->inode.size && m && node_count > .9*m) { 3617 ierr = PetscFree(ns);CHKERRQ(ierr); 3618 a->inode.node_count = 0; 3619 a->inode.size = PETSC_NULL; 3620 a->inode.use = PETSC_FALSE; 3621 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 3622 } else { 3623 A->ops->mult = 0; 3624 A->ops->sor = 0; 3625 A->ops->multadd = 0; 3626 A->ops->getrowij = 0; 3627 A->ops->restorerowij = 0; 3628 A->ops->getcolumnij = 0; 3629 A->ops->restorecolumnij = 0; 3630 A->ops->coloringpatch = 0; 3631 A->ops->multdiagonalblock = 0; 3632 A->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJ_Inode; /* not done yet */ 3633 a->inode.node_count = node_count; 3634 a->inode.size = ns; 3635 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 3636 } 3637 PetscFunctionReturn(0); 3638 } 3639 3640 /* 3641 This is really ugly. if inodes are used this replaces the 3642 permutations with ones that correspond to rows/cols of the matrix 3643 rather then inode blocks 3644 */ 3645 #undef __FUNCT__ 3646 #define __FUNCT__ "MatInodeAdjustForInodes" 3647 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 3648 { 3649 PetscErrorCode ierr,(*f)(Mat,IS*,IS*); 3650 3651 PetscFunctionBegin; 3652 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr); 3653 if (f) { 3654 ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr); 3655 } 3656 PetscFunctionReturn(0); 3657 } 3658 3659 EXTERN_C_BEGIN 3660 #undef __FUNCT__ 3661 #define __FUNCT__ "MatAdjustForInodes_SeqAIJ_Inode" 3662 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 3663 { 3664 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 3665 PetscErrorCode ierr; 3666 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 3667 const PetscInt *ridx,*cidx; 3668 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 3669 PetscInt nslim_col,*ns_col; 3670 IS ris = *rperm,cis = *cperm; 3671 3672 PetscFunctionBegin; 3673 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 3674 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 3675 3676 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 3677 ierr = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 3678 ierr = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr); 3679 3680 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 3681 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 3682 3683 /* Form the inode structure for the rows of permuted matric using inv perm*/ 3684 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 3685 3686 /* Construct the permutations for rows*/ 3687 for (i=0,row = 0; i<nslim_row; ++i){ 3688 indx = ridx[i]; 3689 start_val = tns[indx]; 3690 end_val = tns[indx + 1]; 3691 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 3692 } 3693 3694 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 3695 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 3696 3697 /* Construct permutations for columns */ 3698 for (i=0,col=0; i<nslim_col; ++i){ 3699 indx = cidx[i]; 3700 start_val = tns[indx]; 3701 end_val = tns[indx + 1]; 3702 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 3703 } 3704 3705 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr); 3706 ierr = ISSetPermutation(*rperm);CHKERRQ(ierr); 3707 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr); 3708 ierr = ISSetPermutation(*cperm);CHKERRQ(ierr); 3709 3710 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 3711 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 3712 3713 ierr = PetscFree(ns_col);CHKERRQ(ierr); 3714 ierr = PetscFree2(permr,permc);CHKERRQ(ierr); 3715 ierr = ISDestroy(cis);CHKERRQ(ierr); 3716 ierr = ISDestroy(ris);CHKERRQ(ierr); 3717 ierr = PetscFree(tns);CHKERRQ(ierr); 3718 PetscFunctionReturn(0); 3719 } 3720 EXTERN_C_END 3721 3722 #undef __FUNCT__ 3723 #define __FUNCT__ "MatInodeGetInodeSizes" 3724 /*@C 3725 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 3726 3727 Collective on Mat 3728 3729 Input Parameter: 3730 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 3731 3732 Output Parameter: 3733 + node_count - no of inodes present in the matrix. 3734 . sizes - an array of size node_count,with sizes of each inode. 3735 - limit - the max size used to generate the inodes. 3736 3737 Level: advanced 3738 3739 Notes: This routine returns some internal storage information 3740 of the matrix, it is intended to be used by advanced users. 3741 It should be called after the matrix is assembled. 3742 The contents of the sizes[] array should not be changed. 3743 PETSC_NULL may be passed for information not requested. 3744 3745 .keywords: matrix, seqaij, get, inode 3746 3747 .seealso: MatGetInfo() 3748 @*/ 3749 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 3750 { 3751 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 3752 3753 PetscFunctionBegin; 3754 if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 3755 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr); 3756 if (f) { 3757 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 3758 } 3759 PetscFunctionReturn(0); 3760 } 3761 3762 EXTERN_C_BEGIN 3763 #undef __FUNCT__ 3764 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode" 3765 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 3766 { 3767 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3768 3769 PetscFunctionBegin; 3770 if (node_count) *node_count = a->inode.node_count; 3771 if (sizes) *sizes = a->inode.size; 3772 if (limit) *limit = a->inode.limit; 3773 PetscFunctionReturn(0); 3774 } 3775 EXTERN_C_END 3776