1 #define PETSCMAT_DLL 2 3 /* 4 This file provides high performance routines for the Inode format (compressed sparse row) 5 by taking advantage of rows with identical nonzero structure (I-nodes). 6 */ 7 #include "../src/mat/impls/aij/seq/aij.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "Mat_CreateColInode" 11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns) 12 { 13 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 16 17 PetscFunctionBegin; 18 n = A->cmap->n; 19 m = A->rmap->n; 20 ns_row = a->inode.size; 21 22 min_mn = (m < n) ? m : n; 23 if (!ns) { 24 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++); 25 for(; count+1 < n; count++,i++); 26 if (count < n) { 27 i++; 28 } 29 *size = i; 30 PetscFunctionReturn(0); 31 } 32 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr); 33 34 /* Use the same row structure wherever feasible. */ 35 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 36 ns_col[i] = ns_row[i]; 37 } 38 39 /* if m < n; pad up the remainder with inode_limit */ 40 for(; count+1 < n; count++,i++) { 41 ns_col[i] = 1; 42 } 43 /* The last node is the odd ball. padd it up with the remaining rows; */ 44 if (count < n) { 45 ns_col[i] = n - count; 46 i++; 47 } else if (count > n) { 48 /* Adjust for the over estimation */ 49 ns_col[i-1] += n - count; 50 } 51 *size = i; 52 *ns = ns_col; 53 PetscFunctionReturn(0); 54 } 55 56 57 /* 58 This builds symmetric version of nonzero structure, 59 */ 60 #undef __FUNCT__ 61 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric" 62 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 63 { 64 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 65 PetscErrorCode ierr; 66 PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n; 67 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j; 68 69 PetscFunctionBegin; 70 nslim_row = a->inode.node_count; 71 m = A->rmap->n; 72 n = A->cmap->n; 73 if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 74 75 /* Use the row_inode as column_inode */ 76 nslim_col = nslim_row; 77 ns_col = ns_row; 78 79 /* allocate space for reformated inode structure */ 80 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 81 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 82 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 83 84 for (i1=0,col=0; i1<nslim_col; ++i1){ 85 nsz = ns_col[i1]; 86 for (i2=0; i2<nsz; ++i2,++col) 87 tvc[col] = i1; 88 } 89 /* allocate space for row pointers */ 90 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 91 *iia = ia; 92 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 93 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 94 95 /* determine the number of columns in each row */ 96 ia[0] = oshift; 97 for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) { 98 99 j = aj + ai[row] + ishift; 100 jmax = aj + ai[row+1] + ishift; 101 i2 = 0; 102 col = *j++ + ishift; 103 i2 = tvc[col]; 104 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 105 ia[i1+1]++; 106 ia[i2+1]++; 107 i2++; /* Start col of next node */ 108 while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j; 109 i2 = tvc[col]; 110 } 111 if(i2 == i1) ia[i2+1]++; /* now the diagonal element */ 112 } 113 114 /* shift ia[i] to point to next row */ 115 for (i1=1; i1<nslim_row+1; i1++) { 116 row = ia[i1-1]; 117 ia[i1] += row; 118 work[i1-1] = row - oshift; 119 } 120 121 /* allocate space for column pointers */ 122 nz = ia[nslim_row] + (!ishift); 123 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 124 *jja = ja; 125 126 /* loop over lower triangular part putting into ja */ 127 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 128 j = aj + ai[row] + ishift; 129 jmax = aj + ai[row+1] + ishift; 130 i2 = 0; /* Col inode index */ 131 col = *j++ + ishift; 132 i2 = tvc[col]; 133 while (i2<i1 && j<jmax) { 134 ja[work[i2]++] = i1 + oshift; 135 ja[work[i1]++] = i2 + oshift; 136 ++i2; 137 while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */ 138 i2 = tvc[col]; 139 } 140 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 141 142 } 143 ierr = PetscFree(work);CHKERRQ(ierr); 144 ierr = PetscFree(tns);CHKERRQ(ierr); 145 ierr = PetscFree(tvc);CHKERRQ(ierr); 146 PetscFunctionReturn(0); 147 } 148 149 /* 150 This builds nonsymmetric version of nonzero structure, 151 */ 152 #undef __FUNCT__ 153 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric" 154 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 155 { 156 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 157 PetscErrorCode ierr; 158 PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col; 159 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 160 161 PetscFunctionBegin; 162 nslim_row = a->inode.node_count; 163 n = A->cmap->n; 164 165 /* Create The column_inode for this matrix */ 166 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 167 168 /* allocate space for reformated column_inode structure */ 169 ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 170 ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 171 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 172 173 for (i1=0,col=0; i1<nslim_col; ++i1){ 174 nsz = ns_col[i1]; 175 for (i2=0; i2<nsz; ++i2,++col) 176 tvc[col] = i1; 177 } 178 /* allocate space for row pointers */ 179 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 180 *iia = ia; 181 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 182 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 183 184 /* determine the number of columns in each row */ 185 ia[0] = oshift; 186 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 187 j = aj + ai[row] + ishift; 188 col = *j++ + ishift; 189 i2 = tvc[col]; 190 nz = ai[row+1] - ai[row]; 191 while (nz-- > 0) { /* off-diagonal elemets */ 192 ia[i1+1]++; 193 i2++; /* Start col of next node */ 194 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 195 if (nz > 0) i2 = tvc[col]; 196 } 197 } 198 199 /* shift ia[i] to point to next row */ 200 for (i1=1; i1<nslim_row+1; i1++) { 201 row = ia[i1-1]; 202 ia[i1] += row; 203 work[i1-1] = row - oshift; 204 } 205 206 /* allocate space for column pointers */ 207 nz = ia[nslim_row] + (!ishift); 208 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 209 *jja = ja; 210 211 /* loop over matrix putting into ja */ 212 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 213 j = aj + ai[row] + ishift; 214 i2 = 0; /* Col inode index */ 215 col = *j++ + ishift; 216 i2 = tvc[col]; 217 nz = ai[row+1] - ai[row]; 218 while (nz-- > 0) { 219 ja[work[i1]++] = i2 + oshift; 220 ++i2; 221 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 222 if (nz > 0) i2 = tvc[col]; 223 } 224 } 225 ierr = PetscFree(ns_col);CHKERRQ(ierr); 226 ierr = PetscFree(work);CHKERRQ(ierr); 227 ierr = PetscFree(tns);CHKERRQ(ierr); 228 ierr = PetscFree(tvc);CHKERRQ(ierr); 229 PetscFunctionReturn(0); 230 } 231 232 #undef __FUNCT__ 233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode" 234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 235 { 236 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 237 PetscErrorCode ierr; 238 239 PetscFunctionBegin; 240 *n = a->inode.node_count; 241 if (!ia) PetscFunctionReturn(0); 242 if (!blockcompressed) { 243 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 244 } else if (symmetric) { 245 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 246 } else { 247 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 248 } 249 PetscFunctionReturn(0); 250 } 251 252 #undef __FUNCT__ 253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode" 254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 255 { 256 PetscErrorCode ierr; 257 258 PetscFunctionBegin; 259 if (!ia) PetscFunctionReturn(0); 260 261 if (!blockcompressed) { 262 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 263 } else { 264 ierr = PetscFree(*ia);CHKERRQ(ierr); 265 ierr = PetscFree(*ja);CHKERRQ(ierr); 266 } 267 268 PetscFunctionReturn(0); 269 } 270 271 /* ----------------------------------------------------------- */ 272 273 #undef __FUNCT__ 274 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric" 275 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 276 { 277 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 278 PetscErrorCode ierr; 279 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 280 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 281 282 PetscFunctionBegin; 283 nslim_row = a->inode.node_count; 284 n = A->cmap->n; 285 286 /* Create The column_inode for this matrix */ 287 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 288 289 /* allocate space for reformated column_inode structure */ 290 ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 291 ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 292 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 293 294 for (i1=0,col=0; i1<nslim_col; ++i1){ 295 nsz = ns_col[i1]; 296 for (i2=0; i2<nsz; ++i2,++col) 297 tvc[col] = i1; 298 } 299 /* allocate space for column pointers */ 300 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 301 *iia = ia; 302 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 303 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 304 305 /* determine the number of columns in each row */ 306 ia[0] = oshift; 307 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 308 j = aj + ai[row] + ishift; 309 col = *j++ + ishift; 310 i2 = tvc[col]; 311 nz = ai[row+1] - ai[row]; 312 while (nz-- > 0) { /* off-diagonal elemets */ 313 /* ia[i1+1]++; */ 314 ia[i2+1]++; 315 i2++; 316 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 317 if (nz > 0) i2 = tvc[col]; 318 } 319 } 320 321 /* shift ia[i] to point to next col */ 322 for (i1=1; i1<nslim_col+1; i1++) { 323 col = ia[i1-1]; 324 ia[i1] += col; 325 work[i1-1] = col - oshift; 326 } 327 328 /* allocate space for column pointers */ 329 nz = ia[nslim_col] + (!ishift); 330 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 331 *jja = ja; 332 333 /* loop over matrix putting into ja */ 334 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 335 j = aj + ai[row] + ishift; 336 i2 = 0; /* Col inode index */ 337 col = *j++ + ishift; 338 i2 = tvc[col]; 339 nz = ai[row+1] - ai[row]; 340 while (nz-- > 0) { 341 /* ja[work[i1]++] = i2 + oshift; */ 342 ja[work[i2]++] = i1 + oshift; 343 i2++; 344 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 345 if (nz > 0) i2 = tvc[col]; 346 } 347 } 348 ierr = PetscFree(ns_col);CHKERRQ(ierr); 349 ierr = PetscFree(work);CHKERRQ(ierr); 350 ierr = PetscFree(tns);CHKERRQ(ierr); 351 ierr = PetscFree(tvc);CHKERRQ(ierr); 352 PetscFunctionReturn(0); 353 } 354 355 #undef __FUNCT__ 356 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode" 357 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 358 { 359 PetscErrorCode ierr; 360 361 PetscFunctionBegin; 362 ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr); 363 if (!ia) PetscFunctionReturn(0); 364 365 if (!blockcompressed) { 366 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 367 } else if (symmetric) { 368 /* Since the indices are symmetric it does'nt matter */ 369 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 370 } else { 371 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 372 } 373 PetscFunctionReturn(0); 374 } 375 376 #undef __FUNCT__ 377 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode" 378 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 379 { 380 PetscErrorCode ierr; 381 382 PetscFunctionBegin; 383 if (!ia) PetscFunctionReturn(0); 384 if (!blockcompressed) { 385 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 386 } else { 387 ierr = PetscFree(*ia);CHKERRQ(ierr); 388 ierr = PetscFree(*ja);CHKERRQ(ierr); 389 } 390 PetscFunctionReturn(0); 391 } 392 393 /* ----------------------------------------------------------- */ 394 395 #undef __FUNCT__ 396 #define __FUNCT__ "MatMult_SeqAIJ_Inode" 397 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 398 { 399 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 400 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 401 PetscScalar *y; 402 const PetscScalar *x; 403 const MatScalar *v1,*v2,*v3,*v4,*v5; 404 PetscErrorCode ierr; 405 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0; 406 407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 409 #endif 410 411 PetscFunctionBegin; 412 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 413 node_max = a->inode.node_count; 414 ns = a->inode.size; /* Node Size array */ 415 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 416 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 417 idx = a->j; 418 v1 = a->a; 419 ii = a->i; 420 421 for (i = 0,row = 0; i< node_max; ++i){ 422 nsz = ns[i]; 423 n = ii[1] - ii[0]; 424 nonzerorow += (n>0)*nsz; 425 ii += nsz; 426 PetscPrefetchBlock(idx+nsz*n,n,0,0); /* Prefetch the indices for the block row after the current one */ 427 PetscPrefetchBlock(v1+nsz*n,nsz*n,0,0); /* Prefetch the values for the block row after the current one */ 428 sz = n; /* No of non zeros in this row */ 429 /* Switch on the size of Node */ 430 switch (nsz){ /* Each loop in 'case' is unrolled */ 431 case 1 : 432 sum1 = 0.; 433 434 for(n = 0; n< sz-1; n+=2) { 435 i1 = idx[0]; /* The instructions are ordered to */ 436 i2 = idx[1]; /* make the compiler's job easy */ 437 idx += 2; 438 tmp0 = x[i1]; 439 tmp1 = x[i2]; 440 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 441 } 442 443 if (n == sz-1){ /* Take care of the last nonzero */ 444 tmp0 = x[*idx++]; 445 sum1 += *v1++ * tmp0; 446 } 447 y[row++]=sum1; 448 break; 449 case 2: 450 sum1 = 0.; 451 sum2 = 0.; 452 v2 = v1 + n; 453 454 for (n = 0; n< sz-1; n+=2) { 455 i1 = idx[0]; 456 i2 = idx[1]; 457 idx += 2; 458 tmp0 = x[i1]; 459 tmp1 = x[i2]; 460 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 461 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 462 } 463 if (n == sz-1){ 464 tmp0 = x[*idx++]; 465 sum1 += *v1++ * tmp0; 466 sum2 += *v2++ * tmp0; 467 } 468 y[row++]=sum1; 469 y[row++]=sum2; 470 v1 =v2; /* Since the next block to be processed starts there*/ 471 idx +=sz; 472 break; 473 case 3: 474 sum1 = 0.; 475 sum2 = 0.; 476 sum3 = 0.; 477 v2 = v1 + n; 478 v3 = v2 + n; 479 480 for (n = 0; n< sz-1; n+=2) { 481 i1 = idx[0]; 482 i2 = idx[1]; 483 idx += 2; 484 tmp0 = x[i1]; 485 tmp1 = x[i2]; 486 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 487 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 488 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 489 } 490 if (n == sz-1){ 491 tmp0 = x[*idx++]; 492 sum1 += *v1++ * tmp0; 493 sum2 += *v2++ * tmp0; 494 sum3 += *v3++ * tmp0; 495 } 496 y[row++]=sum1; 497 y[row++]=sum2; 498 y[row++]=sum3; 499 v1 =v3; /* Since the next block to be processed starts there*/ 500 idx +=2*sz; 501 break; 502 case 4: 503 sum1 = 0.; 504 sum2 = 0.; 505 sum3 = 0.; 506 sum4 = 0.; 507 v2 = v1 + n; 508 v3 = v2 + n; 509 v4 = v3 + n; 510 511 for (n = 0; n< sz-1; n+=2) { 512 i1 = idx[0]; 513 i2 = idx[1]; 514 idx += 2; 515 tmp0 = x[i1]; 516 tmp1 = x[i2]; 517 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 518 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 519 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 520 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 521 } 522 if (n == sz-1){ 523 tmp0 = x[*idx++]; 524 sum1 += *v1++ * tmp0; 525 sum2 += *v2++ * tmp0; 526 sum3 += *v3++ * tmp0; 527 sum4 += *v4++ * tmp0; 528 } 529 y[row++]=sum1; 530 y[row++]=sum2; 531 y[row++]=sum3; 532 y[row++]=sum4; 533 v1 =v4; /* Since the next block to be processed starts there*/ 534 idx +=3*sz; 535 break; 536 case 5: 537 sum1 = 0.; 538 sum2 = 0.; 539 sum3 = 0.; 540 sum4 = 0.; 541 sum5 = 0.; 542 v2 = v1 + n; 543 v3 = v2 + n; 544 v4 = v3 + n; 545 v5 = v4 + n; 546 547 for (n = 0; n<sz-1; n+=2) { 548 i1 = idx[0]; 549 i2 = idx[1]; 550 idx += 2; 551 tmp0 = x[i1]; 552 tmp1 = x[i2]; 553 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 554 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 555 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 556 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 557 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 558 } 559 if (n == sz-1){ 560 tmp0 = x[*idx++]; 561 sum1 += *v1++ * tmp0; 562 sum2 += *v2++ * tmp0; 563 sum3 += *v3++ * tmp0; 564 sum4 += *v4++ * tmp0; 565 sum5 += *v5++ * tmp0; 566 } 567 y[row++]=sum1; 568 y[row++]=sum2; 569 y[row++]=sum3; 570 y[row++]=sum4; 571 y[row++]=sum5; 572 v1 =v5; /* Since the next block to be processed starts there */ 573 idx +=4*sz; 574 break; 575 default : 576 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 577 } 578 } 579 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 580 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 581 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 582 PetscFunctionReturn(0); 583 } 584 /* ----------------------------------------------------------- */ 585 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 586 #undef __FUNCT__ 587 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode" 588 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 589 { 590 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 591 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 592 MatScalar *v1,*v2,*v3,*v4,*v5; 593 PetscScalar *x,*y,*z,*zt; 594 PetscErrorCode ierr; 595 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; 596 597 PetscFunctionBegin; 598 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 599 node_max = a->inode.node_count; 600 ns = a->inode.size; /* Node Size array */ 601 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 602 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 603 if (zz != yy) { 604 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 605 } else { 606 z = y; 607 } 608 zt = z; 609 610 idx = a->j; 611 v1 = a->a; 612 ii = a->i; 613 614 for (i = 0,row = 0; i< node_max; ++i){ 615 nsz = ns[i]; 616 n = ii[1] - ii[0]; 617 ii += nsz; 618 sz = n; /* No of non zeros in this row */ 619 /* Switch on the size of Node */ 620 switch (nsz){ /* Each loop in 'case' is unrolled */ 621 case 1 : 622 sum1 = *zt++; 623 624 for(n = 0; n< sz-1; n+=2) { 625 i1 = idx[0]; /* The instructions are ordered to */ 626 i2 = idx[1]; /* make the compiler's job easy */ 627 idx += 2; 628 tmp0 = x[i1]; 629 tmp1 = x[i2]; 630 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 631 } 632 633 if(n == sz-1){ /* Take care of the last nonzero */ 634 tmp0 = x[*idx++]; 635 sum1 += *v1++ * tmp0; 636 } 637 y[row++]=sum1; 638 break; 639 case 2: 640 sum1 = *zt++; 641 sum2 = *zt++; 642 v2 = v1 + n; 643 644 for(n = 0; n< sz-1; n+=2) { 645 i1 = idx[0]; 646 i2 = idx[1]; 647 idx += 2; 648 tmp0 = x[i1]; 649 tmp1 = x[i2]; 650 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 651 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 652 } 653 if(n == sz-1){ 654 tmp0 = x[*idx++]; 655 sum1 += *v1++ * tmp0; 656 sum2 += *v2++ * tmp0; 657 } 658 y[row++]=sum1; 659 y[row++]=sum2; 660 v1 =v2; /* Since the next block to be processed starts there*/ 661 idx +=sz; 662 break; 663 case 3: 664 sum1 = *zt++; 665 sum2 = *zt++; 666 sum3 = *zt++; 667 v2 = v1 + n; 668 v3 = v2 + n; 669 670 for (n = 0; n< sz-1; n+=2) { 671 i1 = idx[0]; 672 i2 = idx[1]; 673 idx += 2; 674 tmp0 = x[i1]; 675 tmp1 = x[i2]; 676 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 677 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 678 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 679 } 680 if (n == sz-1){ 681 tmp0 = x[*idx++]; 682 sum1 += *v1++ * tmp0; 683 sum2 += *v2++ * tmp0; 684 sum3 += *v3++ * tmp0; 685 } 686 y[row++]=sum1; 687 y[row++]=sum2; 688 y[row++]=sum3; 689 v1 =v3; /* Since the next block to be processed starts there*/ 690 idx +=2*sz; 691 break; 692 case 4: 693 sum1 = *zt++; 694 sum2 = *zt++; 695 sum3 = *zt++; 696 sum4 = *zt++; 697 v2 = v1 + n; 698 v3 = v2 + n; 699 v4 = v3 + n; 700 701 for (n = 0; n< sz-1; n+=2) { 702 i1 = idx[0]; 703 i2 = idx[1]; 704 idx += 2; 705 tmp0 = x[i1]; 706 tmp1 = x[i2]; 707 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 708 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 709 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 710 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 711 } 712 if (n == sz-1){ 713 tmp0 = x[*idx++]; 714 sum1 += *v1++ * tmp0; 715 sum2 += *v2++ * tmp0; 716 sum3 += *v3++ * tmp0; 717 sum4 += *v4++ * tmp0; 718 } 719 y[row++]=sum1; 720 y[row++]=sum2; 721 y[row++]=sum3; 722 y[row++]=sum4; 723 v1 =v4; /* Since the next block to be processed starts there*/ 724 idx +=3*sz; 725 break; 726 case 5: 727 sum1 = *zt++; 728 sum2 = *zt++; 729 sum3 = *zt++; 730 sum4 = *zt++; 731 sum5 = *zt++; 732 v2 = v1 + n; 733 v3 = v2 + n; 734 v4 = v3 + n; 735 v5 = v4 + n; 736 737 for (n = 0; n<sz-1; n+=2) { 738 i1 = idx[0]; 739 i2 = idx[1]; 740 idx += 2; 741 tmp0 = x[i1]; 742 tmp1 = x[i2]; 743 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 744 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 745 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 746 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 747 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 748 } 749 if(n == sz-1){ 750 tmp0 = x[*idx++]; 751 sum1 += *v1++ * tmp0; 752 sum2 += *v2++ * tmp0; 753 sum3 += *v3++ * tmp0; 754 sum4 += *v4++ * tmp0; 755 sum5 += *v5++ * tmp0; 756 } 757 y[row++]=sum1; 758 y[row++]=sum2; 759 y[row++]=sum3; 760 y[row++]=sum4; 761 y[row++]=sum5; 762 v1 =v5; /* Since the next block to be processed starts there */ 763 idx +=4*sz; 764 break; 765 default : 766 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 767 } 768 } 769 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 770 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 771 if (zz != yy) { 772 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 773 } 774 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 775 PetscFunctionReturn(0); 776 } 777 778 /* ----------------------------------------------------------- */ 779 #undef __FUNCT__ 780 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace" 781 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx) 782 { 783 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 784 IS iscol = a->col,isrow = a->row; 785 PetscErrorCode ierr; 786 const PetscInt *r,*c,*rout,*cout; 787 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 788 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 789 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 790 PetscScalar sum1,sum2,sum3,sum4,sum5; 791 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 792 const PetscScalar *b; 793 794 PetscFunctionBegin; 795 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 796 node_max = a->inode.node_count; 797 ns = a->inode.size; /* Node Size array */ 798 799 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 800 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 801 tmp = a->solve_work; 802 803 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 804 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 805 806 /* forward solve the lower triangular */ 807 tmps = tmp ; 808 aa = a_a ; 809 aj = a_j ; 810 ad = a->diag; 811 812 for (i = 0,row = 0; i< node_max; ++i){ 813 nsz = ns[i]; 814 aii = ai[row]; 815 v1 = aa + aii; 816 vi = aj + aii; 817 nz = ad[row]- aii; 818 if (i < node_max-1) { 819 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 820 * but our indexing to determine it's size could. */ 821 PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,0); /* indices */ 822 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 823 PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,0); 824 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 825 } 826 827 switch (nsz){ /* Each loop in 'case' is unrolled */ 828 case 1 : 829 sum1 = b[*r++]; 830 for(j=0; j<nz-1; j+=2){ 831 i0 = vi[0]; 832 i1 = vi[1]; 833 vi +=2; 834 tmp0 = tmps[i0]; 835 tmp1 = tmps[i1]; 836 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 837 } 838 if(j == nz-1){ 839 tmp0 = tmps[*vi++]; 840 sum1 -= *v1++ *tmp0; 841 } 842 tmp[row ++]=sum1; 843 break; 844 case 2: 845 sum1 = b[*r++]; 846 sum2 = b[*r++]; 847 v2 = aa + ai[row+1]; 848 849 for(j=0; j<nz-1; j+=2){ 850 i0 = vi[0]; 851 i1 = vi[1]; 852 vi +=2; 853 tmp0 = tmps[i0]; 854 tmp1 = tmps[i1]; 855 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 856 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 857 } 858 if(j == nz-1){ 859 tmp0 = tmps[*vi++]; 860 sum1 -= *v1++ *tmp0; 861 sum2 -= *v2++ *tmp0; 862 } 863 sum2 -= *v2++ * sum1; 864 tmp[row ++]=sum1; 865 tmp[row ++]=sum2; 866 break; 867 case 3: 868 sum1 = b[*r++]; 869 sum2 = b[*r++]; 870 sum3 = b[*r++]; 871 v2 = aa + ai[row+1]; 872 v3 = aa + ai[row+2]; 873 874 for (j=0; j<nz-1; j+=2){ 875 i0 = vi[0]; 876 i1 = vi[1]; 877 vi +=2; 878 tmp0 = tmps[i0]; 879 tmp1 = tmps[i1]; 880 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 881 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 882 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 883 } 884 if (j == nz-1){ 885 tmp0 = tmps[*vi++]; 886 sum1 -= *v1++ *tmp0; 887 sum2 -= *v2++ *tmp0; 888 sum3 -= *v3++ *tmp0; 889 } 890 sum2 -= *v2++ * sum1; 891 sum3 -= *v3++ * sum1; 892 sum3 -= *v3++ * sum2; 893 tmp[row ++]=sum1; 894 tmp[row ++]=sum2; 895 tmp[row ++]=sum3; 896 break; 897 898 case 4: 899 sum1 = b[*r++]; 900 sum2 = b[*r++]; 901 sum3 = b[*r++]; 902 sum4 = b[*r++]; 903 v2 = aa + ai[row+1]; 904 v3 = aa + ai[row+2]; 905 v4 = aa + ai[row+3]; 906 907 for (j=0; j<nz-1; j+=2){ 908 i0 = vi[0]; 909 i1 = vi[1]; 910 vi +=2; 911 tmp0 = tmps[i0]; 912 tmp1 = tmps[i1]; 913 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 914 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 915 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 916 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 917 } 918 if (j == nz-1){ 919 tmp0 = tmps[*vi++]; 920 sum1 -= *v1++ *tmp0; 921 sum2 -= *v2++ *tmp0; 922 sum3 -= *v3++ *tmp0; 923 sum4 -= *v4++ *tmp0; 924 } 925 sum2 -= *v2++ * sum1; 926 sum3 -= *v3++ * sum1; 927 sum4 -= *v4++ * sum1; 928 sum3 -= *v3++ * sum2; 929 sum4 -= *v4++ * sum2; 930 sum4 -= *v4++ * sum3; 931 932 tmp[row ++]=sum1; 933 tmp[row ++]=sum2; 934 tmp[row ++]=sum3; 935 tmp[row ++]=sum4; 936 break; 937 case 5: 938 sum1 = b[*r++]; 939 sum2 = b[*r++]; 940 sum3 = b[*r++]; 941 sum4 = b[*r++]; 942 sum5 = b[*r++]; 943 v2 = aa + ai[row+1]; 944 v3 = aa + ai[row+2]; 945 v4 = aa + ai[row+3]; 946 v5 = aa + ai[row+4]; 947 948 for (j=0; j<nz-1; j+=2){ 949 i0 = vi[0]; 950 i1 = vi[1]; 951 vi +=2; 952 tmp0 = tmps[i0]; 953 tmp1 = tmps[i1]; 954 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 955 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 956 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 957 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 958 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 959 } 960 if (j == nz-1){ 961 tmp0 = tmps[*vi++]; 962 sum1 -= *v1++ *tmp0; 963 sum2 -= *v2++ *tmp0; 964 sum3 -= *v3++ *tmp0; 965 sum4 -= *v4++ *tmp0; 966 sum5 -= *v5++ *tmp0; 967 } 968 969 sum2 -= *v2++ * sum1; 970 sum3 -= *v3++ * sum1; 971 sum4 -= *v4++ * sum1; 972 sum5 -= *v5++ * sum1; 973 sum3 -= *v3++ * sum2; 974 sum4 -= *v4++ * sum2; 975 sum5 -= *v5++ * sum2; 976 sum4 -= *v4++ * sum3; 977 sum5 -= *v5++ * sum3; 978 sum5 -= *v5++ * sum4; 979 980 tmp[row ++]=sum1; 981 tmp[row ++]=sum2; 982 tmp[row ++]=sum3; 983 tmp[row ++]=sum4; 984 tmp[row ++]=sum5; 985 break; 986 default: 987 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 988 } 989 } 990 /* backward solve the upper triangular */ 991 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 992 nsz = ns[i]; 993 aii = ai[row+1] -1; 994 v1 = aa + aii; 995 vi = aj + aii; 996 nz = aii- ad[row]; 997 switch (nsz){ /* Each loop in 'case' is unrolled */ 998 case 1 : 999 sum1 = tmp[row]; 1000 1001 for(j=nz ; j>1; j-=2){ 1002 vi -=2; 1003 i0 = vi[2]; 1004 i1 = vi[1]; 1005 tmp0 = tmps[i0]; 1006 tmp1 = tmps[i1]; 1007 v1 -= 2; 1008 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1009 } 1010 if (j==1){ 1011 tmp0 = tmps[*vi--]; 1012 sum1 -= *v1-- * tmp0; 1013 } 1014 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1015 break; 1016 case 2 : 1017 sum1 = tmp[row]; 1018 sum2 = tmp[row -1]; 1019 v2 = aa + ai[row]-1; 1020 for (j=nz ; j>1; j-=2){ 1021 vi -=2; 1022 i0 = vi[2]; 1023 i1 = vi[1]; 1024 tmp0 = tmps[i0]; 1025 tmp1 = tmps[i1]; 1026 v1 -= 2; 1027 v2 -= 2; 1028 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1029 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1030 } 1031 if (j==1){ 1032 tmp0 = tmps[*vi--]; 1033 sum1 -= *v1-- * tmp0; 1034 sum2 -= *v2-- * tmp0; 1035 } 1036 1037 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1038 sum2 -= *v2-- * tmp0; 1039 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1040 break; 1041 case 3 : 1042 sum1 = tmp[row]; 1043 sum2 = tmp[row -1]; 1044 sum3 = tmp[row -2]; 1045 v2 = aa + ai[row]-1; 1046 v3 = aa + ai[row -1]-1; 1047 for (j=nz ; j>1; j-=2){ 1048 vi -=2; 1049 i0 = vi[2]; 1050 i1 = vi[1]; 1051 tmp0 = tmps[i0]; 1052 tmp1 = tmps[i1]; 1053 v1 -= 2; 1054 v2 -= 2; 1055 v3 -= 2; 1056 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1057 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1058 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1059 } 1060 if (j==1){ 1061 tmp0 = tmps[*vi--]; 1062 sum1 -= *v1-- * tmp0; 1063 sum2 -= *v2-- * tmp0; 1064 sum3 -= *v3-- * tmp0; 1065 } 1066 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1067 sum2 -= *v2-- * tmp0; 1068 sum3 -= *v3-- * tmp0; 1069 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1070 sum3 -= *v3-- * tmp0; 1071 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1072 1073 break; 1074 case 4 : 1075 sum1 = tmp[row]; 1076 sum2 = tmp[row -1]; 1077 sum3 = tmp[row -2]; 1078 sum4 = tmp[row -3]; 1079 v2 = aa + ai[row]-1; 1080 v3 = aa + ai[row -1]-1; 1081 v4 = aa + ai[row -2]-1; 1082 1083 for (j=nz ; j>1; j-=2){ 1084 vi -=2; 1085 i0 = vi[2]; 1086 i1 = vi[1]; 1087 tmp0 = tmps[i0]; 1088 tmp1 = tmps[i1]; 1089 v1 -= 2; 1090 v2 -= 2; 1091 v3 -= 2; 1092 v4 -= 2; 1093 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1094 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1095 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1096 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1097 } 1098 if (j==1){ 1099 tmp0 = tmps[*vi--]; 1100 sum1 -= *v1-- * tmp0; 1101 sum2 -= *v2-- * tmp0; 1102 sum3 -= *v3-- * tmp0; 1103 sum4 -= *v4-- * tmp0; 1104 } 1105 1106 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1107 sum2 -= *v2-- * tmp0; 1108 sum3 -= *v3-- * tmp0; 1109 sum4 -= *v4-- * tmp0; 1110 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1111 sum3 -= *v3-- * tmp0; 1112 sum4 -= *v4-- * tmp0; 1113 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1114 sum4 -= *v4-- * tmp0; 1115 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1116 break; 1117 case 5 : 1118 sum1 = tmp[row]; 1119 sum2 = tmp[row -1]; 1120 sum3 = tmp[row -2]; 1121 sum4 = tmp[row -3]; 1122 sum5 = tmp[row -4]; 1123 v2 = aa + ai[row]-1; 1124 v3 = aa + ai[row -1]-1; 1125 v4 = aa + ai[row -2]-1; 1126 v5 = aa + ai[row -3]-1; 1127 for (j=nz ; j>1; j-=2){ 1128 vi -= 2; 1129 i0 = vi[2]; 1130 i1 = vi[1]; 1131 tmp0 = tmps[i0]; 1132 tmp1 = tmps[i1]; 1133 v1 -= 2; 1134 v2 -= 2; 1135 v3 -= 2; 1136 v4 -= 2; 1137 v5 -= 2; 1138 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1139 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1140 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1141 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1142 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1143 } 1144 if (j==1){ 1145 tmp0 = tmps[*vi--]; 1146 sum1 -= *v1-- * tmp0; 1147 sum2 -= *v2-- * tmp0; 1148 sum3 -= *v3-- * tmp0; 1149 sum4 -= *v4-- * tmp0; 1150 sum5 -= *v5-- * tmp0; 1151 } 1152 1153 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1154 sum2 -= *v2-- * tmp0; 1155 sum3 -= *v3-- * tmp0; 1156 sum4 -= *v4-- * tmp0; 1157 sum5 -= *v5-- * tmp0; 1158 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1159 sum3 -= *v3-- * tmp0; 1160 sum4 -= *v4-- * tmp0; 1161 sum5 -= *v5-- * tmp0; 1162 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1163 sum4 -= *v4-- * tmp0; 1164 sum5 -= *v5-- * tmp0; 1165 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1166 sum5 -= *v5-- * tmp0; 1167 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1168 break; 1169 default: 1170 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1171 } 1172 } 1173 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1174 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1175 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1176 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1177 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1178 PetscFunctionReturn(0); 1179 } 1180 1181 #undef __FUNCT__ 1182 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode" 1183 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info) 1184 { 1185 Mat C=B; 1186 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ *)C->data; 1187 IS isrow = b->row,isicol = b->icol; 1188 PetscErrorCode ierr; 1189 const PetscInt *r,*ic,*ics; 1190 const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag; 1191 PetscInt i,j,k,nz,nzL,row,*pj; 1192 const PetscInt *ajtmp,*bjtmp; 1193 MatScalar *pc,*pc1,*pc2,*pc3,mul1,mul2,mul3,*pv,*rtmp1,*rtmp2,*rtmp3; 1194 const MatScalar *aa=a->a,*v,*v1,*v2,*v3; 1195 FactorShiftCtx sctx; 1196 PetscInt *ddiag; 1197 PetscReal rs; 1198 MatScalar d; 1199 PetscInt inod,nodesz,node_max,*ns,col; 1200 PetscInt *tmp_vec1,*tmp_vec2,*nsmap; 1201 1202 PetscFunctionBegin; 1203 /* MatPivotSetUp(): initialize shift context sctx */ 1204 ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr); 1205 1206 if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1207 ddiag = a->diag; 1208 sctx.shift_top = info->zeropivot; 1209 for (i=0; i<n; i++) { 1210 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1211 d = (aa)[ddiag[i]]; 1212 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1213 v = aa+ai[i]; 1214 nz = ai[i+1] - ai[i]; 1215 for (j=0; j<nz; j++) 1216 rs += PetscAbsScalar(v[j]); 1217 if (rs>sctx.shift_top) sctx.shift_top = rs; 1218 } 1219 sctx.shift_top *= 1.1; 1220 sctx.nshift_max = 5; 1221 sctx.shift_lo = 0.; 1222 sctx.shift_hi = 1.; 1223 } 1224 1225 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1226 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1227 1228 ierr = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr); 1229 ierr = PetscMemzero(rtmp1,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1230 rtmp2 = rtmp1 + n; 1231 rtmp3 = rtmp2 + n; 1232 ics = ic; 1233 1234 node_max = a->inode.node_count; 1235 ns = a->inode.size; 1236 if (!ns){ 1237 SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information"); 1238 } 1239 1240 /* If max inode size > 3, split it into two inodes.*/ 1241 /* also map the inode sizes according to the ordering */ 1242 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1243 for (i=0,j=0; i<node_max; ++i,++j){ 1244 if (ns[i]>3) { 1245 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1246 ++j; 1247 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1248 } else { 1249 tmp_vec1[j] = ns[i]; 1250 } 1251 } 1252 /* Use the correct node_max */ 1253 node_max = j; 1254 1255 /* Now reorder the inode info based on mat re-ordering info */ 1256 /* First create a row -> inode_size_array_index map */ 1257 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1258 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1259 for (i=0,row=0; i<node_max; i++) { 1260 nodesz = tmp_vec1[i]; 1261 for (j=0; j<nodesz; j++,row++) { 1262 nsmap[row] = i; 1263 } 1264 } 1265 /* Using nsmap, create a reordered ns structure */ 1266 for (i=0,j=0; i< node_max; i++) { 1267 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1268 tmp_vec2[i] = nodesz; 1269 j += nodesz; 1270 } 1271 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1272 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1273 1274 /* Now use the correct ns */ 1275 ns = tmp_vec2; 1276 1277 do { 1278 sctx.useshift = PETSC_FALSE; 1279 /* Now loop over each block-row, and do the factorization */ 1280 for (inod=0,i=0; inod<node_max; inod++){ /* i: row index; inod: inode index */ 1281 nodesz = ns[inod]; 1282 1283 switch (nodesz){ 1284 case 1: 1285 /*----------*/ 1286 /* zero rtmp1 */ 1287 /* L part */ 1288 nz = bi[i+1] - bi[i]; 1289 bjtmp = bj + bi[i]; 1290 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1291 1292 /* U part */ 1293 nz = bdiag[i]-bdiag[i+1]; 1294 bjtmp = bj + bdiag[i+1]+1; 1295 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1296 1297 /* load in initial (unfactored row) */ 1298 nz = ai[r[i]+1] - ai[r[i]]; 1299 ajtmp = aj + ai[r[i]]; 1300 v = aa + ai[r[i]]; 1301 for (j=0; j<nz; j++) { 1302 rtmp1[ics[ajtmp[j]]] = v[j]; 1303 } 1304 /* ZeropivotApply() */ 1305 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1306 1307 /* elimination */ 1308 bjtmp = bj + bi[i]; 1309 row = *bjtmp++; 1310 nzL = bi[i+1] - bi[i]; 1311 for(k=0; k < nzL;k++) { 1312 pc = rtmp1 + row; 1313 if (*pc != 0.0) { 1314 pv = b->a + bdiag[row]; 1315 mul1 = *pc * (*pv); 1316 *pc = mul1; 1317 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1318 pv = b->a + bdiag[row+1]+1; 1319 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1320 for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1321 ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr); 1322 } 1323 row = *bjtmp++; 1324 } 1325 1326 /* finished row so stick it into b->a */ 1327 rs = 0.0; 1328 /* L part */ 1329 pv = b->a + bi[i] ; 1330 pj = b->j + bi[i] ; 1331 nz = bi[i+1] - bi[i]; 1332 for (j=0; j<nz; j++) { 1333 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1334 } 1335 1336 /* U part */ 1337 pv = b->a + bdiag[i+1]+1; 1338 pj = b->j + bdiag[i+1]+1; 1339 nz = bdiag[i] - bdiag[i+1]-1; 1340 for (j=0; j<nz; j++) { 1341 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1342 } 1343 1344 /* Check zero pivot */ 1345 sctx.rs = rs; 1346 sctx.pv = rtmp1[i]; 1347 ierr = MatPivotCheck(info,sctx,i);CHKERRQ(ierr); 1348 1349 /* Mark diagonal and invert diagonal for simplier triangular solves */ 1350 pv = b->a + bdiag[i]; 1351 *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1352 break; 1353 1354 case 2: 1355 /*----------*/ 1356 /* zero rtmp1 and rtmp2 */ 1357 /* L part */ 1358 nz = bi[i+1] - bi[i]; 1359 bjtmp = bj + bi[i]; 1360 for (j=0; j<nz; j++) { 1361 col = bjtmp[j]; 1362 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1363 } 1364 1365 /* U part */ 1366 nz = bdiag[i]-bdiag[i+1]; 1367 bjtmp = bj + bdiag[i+1]+1; 1368 for (j=0; j<nz; j++) { 1369 col = bjtmp[j]; 1370 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1371 } 1372 1373 /* load in initial (unfactored row) */ 1374 nz = ai[r[i]+1] - ai[r[i]]; 1375 ajtmp = aj + ai[r[i]]; 1376 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; 1377 for (j=0; j<nz; j++) { 1378 col = ics[ajtmp[j]]; 1379 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; 1380 } 1381 /* ZeropivotApply(): shift the diagonal of the matrix */ 1382 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; 1383 1384 /* elimination */ 1385 bjtmp = bj + bi[i]; 1386 row = *bjtmp++; /* pivot row */ 1387 nzL = bi[i+1] - bi[i]; 1388 for(k=0; k < nzL;k++) { 1389 pc1 = rtmp1 + row; 1390 pc2 = rtmp2 + row; 1391 if (*pc1 != 0.0 || *pc2 != 0.0) { 1392 pv = b->a + bdiag[row]; 1393 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); 1394 *pc1 = mul1; *pc2 = mul2; 1395 1396 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1397 pv = b->a + bdiag[row+1]+1; 1398 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1399 for (j=0; j<nz; j++){ 1400 col = pj[j]; 1401 rtmp1[col] -= mul1 * pv[j]; 1402 rtmp2[col] -= mul2 * pv[j]; 1403 } 1404 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1405 } 1406 row = *bjtmp++; 1407 } 1408 1409 /* finished row i; check zero pivot, then stick row i into b->a */ 1410 rs = 0.0; 1411 /* L part */ 1412 pc1 = b->a + bi[i]; 1413 pj = b->j + bi[i] ; 1414 nz = bi[i+1] - bi[i]; 1415 for (j=0; j<nz; j++) { 1416 col = pj[j]; 1417 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1418 } 1419 /* U part */ 1420 pc1 = b->a + bdiag[i+1]+1; 1421 pj = b->j + bdiag[i+1]+1; 1422 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1423 for (j=0; j<nz; j++) { 1424 col = pj[j]; 1425 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1426 } 1427 1428 sctx.rs = rs; 1429 sctx.pv = rtmp1[i]; 1430 ierr = MatPivotCheck(info,sctx,i);CHKERRQ(ierr); 1431 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1432 *pc1 = 1.0/sctx.pv; 1433 1434 /* Now take care of diagonal 2x2 block. */ 1435 pc2 = rtmp2 + i; 1436 if (*pc2 != 0.0){ 1437 mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */ 1438 *pc2 = mul1; /* insert L entry */ 1439 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1440 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1441 for (j=0; j<nz; j++) { 1442 col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col]; 1443 } 1444 ierr = PetscLogFlops(2*nz);CHKERRQ(ierr); 1445 } 1446 1447 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1448 rs = 0.0; 1449 /* L part */ 1450 pc2 = b->a + bi[i+1]; 1451 pj = b->j + bi[i+1] ; 1452 nz = bi[i+2] - bi[i+1]; 1453 for (j=0; j<nz; j++) { 1454 col = pj[j]; 1455 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1456 } 1457 /* U part */ 1458 pc2 = b->a + bdiag[i+2]+1; 1459 pj = b->j + bdiag[i+2]+1; 1460 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1461 for (j=0; j<nz; j++) { 1462 col = pj[j]; 1463 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1464 } 1465 1466 sctx.rs = rs; 1467 sctx.pv = rtmp2[i+1]; 1468 ierr = MatPivotCheck(info,sctx,i+1);CHKERRQ(ierr); 1469 pc2 = b->a + bdiag[i+1]; 1470 *pc2 = 1.0/sctx.pv; 1471 break; 1472 1473 case 3: 1474 /*----------*/ 1475 /* zero rtmp */ 1476 /* L part */ 1477 nz = bi[i+1] - bi[i]; 1478 bjtmp = bj + bi[i]; 1479 for (j=0; j<nz; j++) { 1480 col = bjtmp[j]; 1481 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1482 } 1483 1484 /* U part */ 1485 nz = bdiag[i]-bdiag[i+1]; 1486 bjtmp = bj + bdiag[i+1]+1; 1487 for (j=0; j<nz; j++) { 1488 col = bjtmp[j]; 1489 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1490 } 1491 1492 /* load in initial (unfactored row) */ 1493 nz = ai[r[i]+1] - ai[r[i]]; 1494 ajtmp = aj + ai[r[i]]; 1495 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; 1496 for (j=0; j<nz; j++) { 1497 col = ics[ajtmp[j]]; 1498 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; 1499 } 1500 /* ZeropivotApply(): shift the diagonal of the matrix */ 1501 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; 1502 1503 /* elimination */ 1504 bjtmp = bj + bi[i]; 1505 row = *bjtmp++; /* pivot row */ 1506 nzL = bi[i+1] - bi[i]; 1507 for(k=0; k < nzL;k++) { 1508 pc1 = rtmp1 + row; 1509 pc2 = rtmp2 + row; 1510 pc3 = rtmp3 + row; 1511 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1512 pv = b->a + bdiag[row]; 1513 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); 1514 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; 1515 1516 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1517 pv = b->a + bdiag[row+1]+1; 1518 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1519 for (j=0; j<nz; j++){ 1520 col = pj[j]; 1521 rtmp1[col] -= mul1 * pv[j]; 1522 rtmp2[col] -= mul2 * pv[j]; 1523 rtmp3[col] -= mul3 * pv[j]; 1524 } 1525 ierr = PetscLogFlops(6*nz);CHKERRQ(ierr); 1526 } 1527 row = *bjtmp++; 1528 } 1529 1530 /* finished row i; check zero pivot, then stick row i into b->a */ 1531 rs = 0.0; 1532 /* L part */ 1533 pc1 = b->a + bi[i]; 1534 pj = b->j + bi[i] ; 1535 nz = bi[i+1] - bi[i]; 1536 for (j=0; j<nz; j++) { 1537 col = pj[j]; 1538 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1539 } 1540 /* U part */ 1541 pc1 = b->a + bdiag[i+1]+1; 1542 pj = b->j + bdiag[i+1]+1; 1543 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1544 for (j=0; j<nz; j++) { 1545 col = pj[j]; 1546 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1547 } 1548 1549 sctx.rs = rs; 1550 sctx.pv = rtmp1[i]; 1551 ierr = MatPivotCheck(info,sctx,i);CHKERRQ(ierr); 1552 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1553 *pc1 = 1.0/sctx.pv; 1554 1555 /* Now take care of 1st column of diagonal 3x3 block. */ 1556 pc2 = rtmp2 + i; 1557 pc3 = rtmp3 + i; 1558 if (*pc2 != 0.0 || *pc3 != 0.0){ 1559 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1560 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1561 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1562 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1563 for (j=0; j<nz; j++) { 1564 col = pj[j]; 1565 rtmp2[col] -= mul2 * rtmp1[col]; 1566 rtmp3[col] -= mul3 * rtmp1[col]; 1567 } 1568 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1569 } 1570 1571 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1572 rs = 0.0; 1573 /* L part */ 1574 pc2 = b->a + bi[i+1]; 1575 pj = b->j + bi[i+1] ; 1576 nz = bi[i+2] - bi[i+1]; 1577 for (j=0; j<nz; j++) { 1578 col = pj[j]; 1579 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1580 } 1581 /* U part */ 1582 pc2 = b->a + bdiag[i+2]+1; 1583 pj = b->j + bdiag[i+2]+1; 1584 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1585 for (j=0; j<nz; j++) { 1586 col = pj[j]; 1587 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1588 } 1589 1590 sctx.rs = rs; 1591 sctx.pv = rtmp2[i+1]; 1592 ierr = MatPivotCheck(info,sctx,i+1);CHKERRQ(ierr); 1593 pc2 = b->a + bdiag[i+1]; 1594 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1595 1596 /* Now take care of 2nd column of diagonal 3x3 block. */ 1597 pc3 = rtmp3 + i+1; 1598 if (*pc3 != 0.0){ 1599 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1600 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1601 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1602 for (j=0; j<nz; j++) { 1603 col = pj[j]; 1604 rtmp3[col] -= mul3 * rtmp2[col]; 1605 } 1606 ierr = PetscLogFlops(2*nz);CHKERRQ(ierr); 1607 } 1608 1609 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1610 rs = 0.0; 1611 /* L part */ 1612 pc3 = b->a + bi[i+2]; 1613 pj = b->j + bi[i+2] ; 1614 nz = bi[i+3] - bi[i+2]; 1615 for (j=0; j<nz; j++) { 1616 col = pj[j]; 1617 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1618 } 1619 /* U part */ 1620 pc3 = b->a + bdiag[i+3]+1; 1621 pj = b->j + bdiag[i+3]+1; 1622 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1623 for (j=0; j<nz; j++) { 1624 col = pj[j]; 1625 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1626 } 1627 1628 sctx.rs = rs; 1629 sctx.pv = rtmp3[i+2]; 1630 ierr = MatPivotCheck(info,sctx,i+2);CHKERRQ(ierr); 1631 pc3 = b->a + bdiag[i+2]; 1632 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1633 break; 1634 1635 default: 1636 SETERRQ(PETSC_ERR_SUP,"Node size not yet supported \n"); 1637 } 1638 i += nodesz; /* Update the row */ 1639 } 1640 1641 /* MatPivotRefine() */ 1642 if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE && !sctx.useshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max){ 1643 /* 1644 * if no shift in this attempt & shifting & started shifting & can refine, 1645 * then try lower shift 1646 */ 1647 sctx.shift_hi = sctx.shift_fraction; 1648 sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.; 1649 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1650 sctx.useshift = PETSC_TRUE; 1651 sctx.nshift++; 1652 } 1653 } while (sctx.useshift); 1654 1655 ierr = PetscFree(rtmp1);CHKERRQ(ierr); 1656 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1657 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1658 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1659 1660 C->ops->solve = MatSolve_SeqAIJ_Inode; 1661 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1662 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1663 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1664 C->ops->matsolve = MatMatSolve_SeqAIJ; 1665 C->assembled = PETSC_TRUE; 1666 C->preallocated = PETSC_TRUE; 1667 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 1668 1669 /* MatShiftView(A,info,&sctx) */ 1670 if (sctx.nshift){ 1671 if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) { 1672 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 1673 } else if (info->shifttype == MAT_SHIFT_NONZERO) { 1674 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 1675 } else if (info->shifttype == MAT_SHIFT_INBLOCKS){ 1676 ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr); 1677 } 1678 } 1679 PetscFunctionReturn(0); 1680 } 1681 1682 #undef __FUNCT__ 1683 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace" 1684 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info) 1685 { 1686 Mat C = B; 1687 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1688 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1689 PetscErrorCode ierr; 1690 const PetscInt *r,*ic,*c,*ics; 1691 PetscInt n = A->rmap->n,*bi = b->i; 1692 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1693 PetscInt i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz; 1694 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1695 PetscScalar mul1,mul2,mul3,tmp; 1696 MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; 1697 const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; 1698 PetscReal rs=0.0; 1699 FactorShiftCtx sctx; 1700 PetscInt newshift; 1701 1702 PetscFunctionBegin; 1703 sctx.shift_top = 0; 1704 sctx.nshift_max = 0; 1705 sctx.shift_lo = 0; 1706 sctx.shift_hi = 0; 1707 sctx.shift_fraction = 0; 1708 1709 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1710 if (info->shifttype==MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1711 sctx.shift_top = 0; 1712 for (i=0; i<n; i++) { 1713 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1714 rs = 0.0; 1715 ajtmp = aj + ai[i]; 1716 rtmp1 = aa + ai[i]; 1717 nz = ai[i+1] - ai[i]; 1718 for (j=0; j<nz; j++){ 1719 if (*ajtmp != i){ 1720 rs += PetscAbsScalar(*rtmp1++); 1721 } else { 1722 rs -= PetscRealPart(*rtmp1++); 1723 } 1724 ajtmp++; 1725 } 1726 if (rs>sctx.shift_top) sctx.shift_top = rs; 1727 } 1728 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1729 sctx.shift_top *= 1.1; 1730 sctx.nshift_max = 5; 1731 sctx.shift_lo = 0.; 1732 sctx.shift_hi = 1.; 1733 } 1734 sctx.shift_amount = 0; 1735 sctx.nshift = 0; 1736 1737 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1738 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1739 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1740 ierr = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr); 1741 ierr = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1742 ics = ic ; 1743 rtmp22 = rtmp11 + n; 1744 rtmp33 = rtmp22 + n; 1745 1746 node_max = a->inode.node_count; 1747 ns = a->inode.size; 1748 if (!ns){ 1749 SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information"); 1750 } 1751 1752 /* If max inode size > 3, split it into two inodes.*/ 1753 /* also map the inode sizes according to the ordering */ 1754 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1755 for (i=0,j=0; i<node_max; ++i,++j){ 1756 if (ns[i]>3) { 1757 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1758 ++j; 1759 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1760 } else { 1761 tmp_vec1[j] = ns[i]; 1762 } 1763 } 1764 /* Use the correct node_max */ 1765 node_max = j; 1766 1767 /* Now reorder the inode info based on mat re-ordering info */ 1768 /* First create a row -> inode_size_array_index map */ 1769 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1770 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1771 for (i=0,row=0; i<node_max; i++) { 1772 nodesz = tmp_vec1[i]; 1773 for (j=0; j<nodesz; j++,row++) { 1774 nsmap[row] = i; 1775 } 1776 } 1777 /* Using nsmap, create a reordered ns structure */ 1778 for (i=0,j=0; i< node_max; i++) { 1779 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1780 tmp_vec2[i] = nodesz; 1781 j += nodesz; 1782 } 1783 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1784 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1785 /* Now use the correct ns */ 1786 ns = tmp_vec2; 1787 1788 do { 1789 sctx.useshift = PETSC_FALSE; 1790 /* Now loop over each block-row, and do the factorization */ 1791 for (i=0,row=0; i<node_max; i++) { 1792 nodesz = ns[i]; 1793 nz = bi[row+1] - bi[row]; 1794 bjtmp = bj + bi[row]; 1795 1796 switch (nodesz){ 1797 case 1: 1798 for (j=0; j<nz; j++){ 1799 idx = bjtmp[j]; 1800 rtmp11[idx] = 0.0; 1801 } 1802 1803 /* load in initial (unfactored row) */ 1804 idx = r[row]; 1805 nz_tmp = ai[idx+1] - ai[idx]; 1806 ajtmp = aj + ai[idx]; 1807 v1 = aa + ai[idx]; 1808 1809 for (j=0; j<nz_tmp; j++) { 1810 idx = ics[ajtmp[j]]; 1811 rtmp11[idx] = v1[j]; 1812 } 1813 rtmp11[ics[r[row]]] += sctx.shift_amount; 1814 1815 prow = *bjtmp++ ; 1816 while (prow < row) { 1817 pc1 = rtmp11 + prow; 1818 if (*pc1 != 0.0){ 1819 pv = ba + bd[prow]; 1820 pj = nbj + bd[prow]; 1821 mul1 = *pc1 * *pv++; 1822 *pc1 = mul1; 1823 nz_tmp = bi[prow+1] - bd[prow] - 1; 1824 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 1825 for (j=0; j<nz_tmp; j++) { 1826 tmp = pv[j]; 1827 idx = pj[j]; 1828 rtmp11[idx] -= mul1 * tmp; 1829 } 1830 } 1831 prow = *bjtmp++ ; 1832 } 1833 pj = bj + bi[row]; 1834 pc1 = ba + bi[row]; 1835 1836 sctx.pv = rtmp11[row]; 1837 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 1838 rs = 0.0; 1839 for (j=0; j<nz; j++) { 1840 idx = pj[j]; 1841 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 1842 if (idx != row) rs += PetscAbsScalar(pc1[j]); 1843 } 1844 sctx.rs = rs; 1845 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1846 if (newshift == 1) goto endofwhile; 1847 break; 1848 1849 case 2: 1850 for (j=0; j<nz; j++) { 1851 idx = bjtmp[j]; 1852 rtmp11[idx] = 0.0; 1853 rtmp22[idx] = 0.0; 1854 } 1855 1856 /* load in initial (unfactored row) */ 1857 idx = r[row]; 1858 nz_tmp = ai[idx+1] - ai[idx]; 1859 ajtmp = aj + ai[idx]; 1860 v1 = aa + ai[idx]; 1861 v2 = aa + ai[idx+1]; 1862 for (j=0; j<nz_tmp; j++) { 1863 idx = ics[ajtmp[j]]; 1864 rtmp11[idx] = v1[j]; 1865 rtmp22[idx] = v2[j]; 1866 } 1867 rtmp11[ics[r[row]]] += sctx.shift_amount; 1868 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 1869 1870 prow = *bjtmp++ ; 1871 while (prow < row) { 1872 pc1 = rtmp11 + prow; 1873 pc2 = rtmp22 + prow; 1874 if (*pc1 != 0.0 || *pc2 != 0.0){ 1875 pv = ba + bd[prow]; 1876 pj = nbj + bd[prow]; 1877 mul1 = *pc1 * *pv; 1878 mul2 = *pc2 * *pv; 1879 ++pv; 1880 *pc1 = mul1; 1881 *pc2 = mul2; 1882 1883 nz_tmp = bi[prow+1] - bd[prow] - 1; 1884 for (j=0; j<nz_tmp; j++) { 1885 tmp = pv[j]; 1886 idx = pj[j]; 1887 rtmp11[idx] -= mul1 * tmp; 1888 rtmp22[idx] -= mul2 * tmp; 1889 } 1890 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 1891 } 1892 prow = *bjtmp++ ; 1893 } 1894 1895 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 1896 pc1 = rtmp11 + prow; 1897 pc2 = rtmp22 + prow; 1898 1899 sctx.pv = *pc1; 1900 pj = bj + bi[prow]; 1901 rs = 0.0; 1902 for (j=0; j<nz; j++){ 1903 idx = pj[j]; 1904 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 1905 } 1906 sctx.rs = rs; 1907 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1908 if (newshift == 1) goto endofwhile; 1909 1910 if (*pc2 != 0.0){ 1911 pj = nbj + bd[prow]; 1912 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 1913 *pc2 = mul2; 1914 nz_tmp = bi[prow+1] - bd[prow] - 1; 1915 for (j=0; j<nz_tmp; j++) { 1916 idx = pj[j] ; 1917 tmp = rtmp11[idx]; 1918 rtmp22[idx] -= mul2 * tmp; 1919 } 1920 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 1921 } 1922 1923 pj = bj + bi[row]; 1924 pc1 = ba + bi[row]; 1925 pc2 = ba + bi[row+1]; 1926 1927 sctx.pv = rtmp22[row+1]; 1928 rs = 0.0; 1929 rtmp11[row] = 1.0/rtmp11[row]; 1930 rtmp22[row+1] = 1.0/rtmp22[row+1]; 1931 /* copy row entries from dense representation to sparse */ 1932 for (j=0; j<nz; j++) { 1933 idx = pj[j]; 1934 pc1[j] = rtmp11[idx]; 1935 pc2[j] = rtmp22[idx]; 1936 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 1937 } 1938 sctx.rs = rs; 1939 ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr); 1940 if (newshift == 1) goto endofwhile; 1941 break; 1942 1943 case 3: 1944 for (j=0; j<nz; j++) { 1945 idx = bjtmp[j]; 1946 rtmp11[idx] = 0.0; 1947 rtmp22[idx] = 0.0; 1948 rtmp33[idx] = 0.0; 1949 } 1950 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 1951 idx = r[row]; 1952 nz_tmp = ai[idx+1] - ai[idx]; 1953 ajtmp = aj + ai[idx]; 1954 v1 = aa + ai[idx]; 1955 v2 = aa + ai[idx+1]; 1956 v3 = aa + ai[idx+2]; 1957 for (j=0; j<nz_tmp; j++) { 1958 idx = ics[ajtmp[j]]; 1959 rtmp11[idx] = v1[j]; 1960 rtmp22[idx] = v2[j]; 1961 rtmp33[idx] = v3[j]; 1962 } 1963 rtmp11[ics[r[row]]] += sctx.shift_amount; 1964 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 1965 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 1966 1967 /* loop over all pivot row blocks above this row block */ 1968 prow = *bjtmp++ ; 1969 while (prow < row) { 1970 pc1 = rtmp11 + prow; 1971 pc2 = rtmp22 + prow; 1972 pc3 = rtmp33 + prow; 1973 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){ 1974 pv = ba + bd[prow]; 1975 pj = nbj + bd[prow]; 1976 mul1 = *pc1 * *pv; 1977 mul2 = *pc2 * *pv; 1978 mul3 = *pc3 * *pv; 1979 ++pv; 1980 *pc1 = mul1; 1981 *pc2 = mul2; 1982 *pc3 = mul3; 1983 1984 nz_tmp = bi[prow+1] - bd[prow] - 1; 1985 /* update this row based on pivot row */ 1986 for (j=0; j<nz_tmp; j++) { 1987 tmp = pv[j]; 1988 idx = pj[j]; 1989 rtmp11[idx] -= mul1 * tmp; 1990 rtmp22[idx] -= mul2 * tmp; 1991 rtmp33[idx] -= mul3 * tmp; 1992 } 1993 ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr); 1994 } 1995 prow = *bjtmp++ ; 1996 } 1997 1998 /* Now take care of diagonal 3x3 block in this set of rows */ 1999 /* note: prow = row here */ 2000 pc1 = rtmp11 + prow; 2001 pc2 = rtmp22 + prow; 2002 pc3 = rtmp33 + prow; 2003 2004 sctx.pv = *pc1; 2005 pj = bj + bi[prow]; 2006 rs = 0.0; 2007 for (j=0; j<nz; j++){ 2008 idx = pj[j]; 2009 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2010 } 2011 sctx.rs = rs; 2012 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 2013 if (newshift == 1) goto endofwhile; 2014 2015 if (*pc2 != 0.0 || *pc3 != 0.0){ 2016 mul2 = (*pc2)/(*pc1); 2017 mul3 = (*pc3)/(*pc1); 2018 *pc2 = mul2; 2019 *pc3 = mul3; 2020 nz_tmp = bi[prow+1] - bd[prow] - 1; 2021 pj = nbj + bd[prow]; 2022 for (j=0; j<nz_tmp; j++) { 2023 idx = pj[j] ; 2024 tmp = rtmp11[idx]; 2025 rtmp22[idx] -= mul2 * tmp; 2026 rtmp33[idx] -= mul3 * tmp; 2027 } 2028 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 2029 } 2030 ++prow; 2031 2032 pc2 = rtmp22 + prow; 2033 pc3 = rtmp33 + prow; 2034 sctx.pv = *pc2; 2035 pj = bj + bi[prow]; 2036 rs = 0.0; 2037 for (j=0; j<nz; j++){ 2038 idx = pj[j]; 2039 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2040 } 2041 sctx.rs = rs; 2042 ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr); 2043 if (newshift == 1) goto endofwhile; 2044 2045 if (*pc3 != 0.0){ 2046 mul3 = (*pc3)/(*pc2); 2047 *pc3 = mul3; 2048 pj = nbj + bd[prow]; 2049 nz_tmp = bi[prow+1] - bd[prow] - 1; 2050 for (j=0; j<nz_tmp; j++) { 2051 idx = pj[j] ; 2052 tmp = rtmp22[idx]; 2053 rtmp33[idx] -= mul3 * tmp; 2054 } 2055 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 2056 } 2057 2058 pj = bj + bi[row]; 2059 pc1 = ba + bi[row]; 2060 pc2 = ba + bi[row+1]; 2061 pc3 = ba + bi[row+2]; 2062 2063 sctx.pv = rtmp33[row+2]; 2064 rs = 0.0; 2065 rtmp11[row] = 1.0/rtmp11[row]; 2066 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2067 rtmp33[row+2] = 1.0/rtmp33[row+2]; 2068 /* copy row entries from dense representation to sparse */ 2069 for (j=0; j<nz; j++) { 2070 idx = pj[j]; 2071 pc1[j] = rtmp11[idx]; 2072 pc2[j] = rtmp22[idx]; 2073 pc3[j] = rtmp33[idx]; 2074 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 2075 } 2076 2077 sctx.rs = rs; 2078 ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr); 2079 if (newshift == 1) goto endofwhile; 2080 break; 2081 2082 default: 2083 SETERRQ(PETSC_ERR_SUP,"Node size not yet supported \n"); 2084 } 2085 row += nodesz; /* Update the row */ 2086 } 2087 endofwhile:; 2088 } while (sctx.useshift); 2089 ierr = PetscFree(rtmp11);CHKERRQ(ierr); 2090 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 2091 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2092 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2093 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 2094 (B)->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 2095 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2096 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2097 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2098 C->assembled = PETSC_TRUE; 2099 C->preallocated = PETSC_TRUE; 2100 if (sctx.nshift) { 2101 if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) { 2102 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 2103 } else if (info->shifttype == MAT_SHIFT_NONZERO) { 2104 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 2105 } 2106 } 2107 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 2108 PetscFunctionReturn(0); 2109 } 2110 2111 2112 /* ----------------------------------------------------------- */ 2113 #undef __FUNCT__ 2114 #define __FUNCT__ "MatSolve_SeqAIJ_Inode" 2115 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 2116 { 2117 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2118 IS iscol = a->col,isrow = a->row; 2119 PetscErrorCode ierr; 2120 const PetscInt *r,*c,*rout,*cout; 2121 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 2122 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 2123 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 2124 PetscScalar sum1,sum2,sum3,sum4,sum5; 2125 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 2126 const PetscScalar *b; 2127 2128 PetscFunctionBegin; 2129 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 2130 node_max = a->inode.node_count; 2131 ns = a->inode.size; /* Node Size array */ 2132 2133 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2134 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2135 tmp = a->solve_work; 2136 2137 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2138 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2139 2140 /* forward solve the lower triangular */ 2141 tmps = tmp ; 2142 aa = a_a ; 2143 aj = a_j ; 2144 ad = a->diag; 2145 2146 for (i = 0,row = 0; i< node_max; ++i){ 2147 nsz = ns[i]; 2148 aii = ai[row]; 2149 v1 = aa + aii; 2150 vi = aj + aii; 2151 nz = ai[row+1]- ai[row]; 2152 2153 if (i < node_max-1) { 2154 /* Prefetch the indices for the next block */ 2155 PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,0); /* indices */ 2156 /* Prefetch the data for the next block */ 2157 PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,0); 2158 } 2159 2160 switch (nsz){ /* Each loop in 'case' is unrolled */ 2161 case 1 : 2162 sum1 = b[r[row]]; 2163 for(j=0; j<nz-1; j+=2){ 2164 i0 = vi[j]; 2165 i1 = vi[j+1]; 2166 tmp0 = tmps[i0]; 2167 tmp1 = tmps[i1]; 2168 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1; 2169 } 2170 if(j == nz-1){ 2171 tmp0 = tmps[vi[j]]; 2172 sum1 -= v1[j]*tmp0; 2173 } 2174 tmp[row++]=sum1; 2175 break; 2176 case 2: 2177 sum1 = b[r[row]]; 2178 sum2 = b[r[row+1]]; 2179 v2 = aa + ai[row+1]; 2180 2181 for(j=0; j<nz-1; j+=2){ 2182 i0 = vi[j]; 2183 i1 = vi[j+1]; 2184 tmp0 = tmps[i0]; 2185 tmp1 = tmps[i1]; 2186 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2187 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2188 } 2189 if(j == nz-1){ 2190 tmp0 = tmps[vi[j]]; 2191 sum1 -= v1[j] *tmp0; 2192 sum2 -= v2[j] *tmp0; 2193 } 2194 sum2 -= v2[nz] * sum1; 2195 tmp[row ++]=sum1; 2196 tmp[row ++]=sum2; 2197 break; 2198 case 3: 2199 sum1 = b[r[row]]; 2200 sum2 = b[r[row+1]]; 2201 sum3 = b[r[row+2]]; 2202 v2 = aa + ai[row+1]; 2203 v3 = aa + ai[row+2]; 2204 2205 for (j=0; j<nz-1; j+=2){ 2206 i0 = vi[j]; 2207 i1 = vi[j+1]; 2208 tmp0 = tmps[i0]; 2209 tmp1 = tmps[i1]; 2210 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2211 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2212 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2213 } 2214 if (j == nz-1){ 2215 tmp0 = tmps[vi[j]]; 2216 sum1 -= v1[j] *tmp0; 2217 sum2 -= v2[j] *tmp0; 2218 sum3 -= v3[j] *tmp0; 2219 } 2220 sum2 -= v2[nz] * sum1; 2221 sum3 -= v3[nz] * sum1; 2222 sum3 -= v3[nz+1] * sum2; 2223 tmp[row ++]=sum1; 2224 tmp[row ++]=sum2; 2225 tmp[row ++]=sum3; 2226 break; 2227 2228 case 4: 2229 sum1 = b[r[row]]; 2230 sum2 = b[r[row+1]]; 2231 sum3 = b[r[row+2]]; 2232 sum4 = b[r[row+3]]; 2233 v2 = aa + ai[row+1]; 2234 v3 = aa + ai[row+2]; 2235 v4 = aa + ai[row+3]; 2236 2237 for (j=0; j<nz-1; j+=2){ 2238 i0 = vi[j]; 2239 i1 = vi[j+1]; 2240 tmp0 = tmps[i0]; 2241 tmp1 = tmps[i1]; 2242 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2243 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2244 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2245 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2246 } 2247 if (j == nz-1){ 2248 tmp0 = tmps[vi[j]]; 2249 sum1 -= v1[j] *tmp0; 2250 sum2 -= v2[j] *tmp0; 2251 sum3 -= v3[j] *tmp0; 2252 sum4 -= v4[j] *tmp0; 2253 } 2254 sum2 -= v2[nz] * sum1; 2255 sum3 -= v3[nz] * sum1; 2256 sum4 -= v4[nz] * sum1; 2257 sum3 -= v3[nz+1] * sum2; 2258 sum4 -= v4[nz+1] * sum2; 2259 sum4 -= v4[nz+2] * sum3; 2260 2261 tmp[row ++]=sum1; 2262 tmp[row ++]=sum2; 2263 tmp[row ++]=sum3; 2264 tmp[row ++]=sum4; 2265 break; 2266 case 5: 2267 sum1 = b[r[row]]; 2268 sum2 = b[r[row+1]]; 2269 sum3 = b[r[row+2]]; 2270 sum4 = b[r[row+3]]; 2271 sum5 = b[r[row+4]]; 2272 v2 = aa + ai[row+1]; 2273 v3 = aa + ai[row+2]; 2274 v4 = aa + ai[row+3]; 2275 v5 = aa + ai[row+4]; 2276 2277 for (j=0; j<nz-1; j+=2){ 2278 i0 = vi[j]; 2279 i1 = vi[j+1]; 2280 tmp0 = tmps[i0]; 2281 tmp1 = tmps[i1]; 2282 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2283 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2284 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2285 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2286 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1; 2287 } 2288 if (j == nz-1){ 2289 tmp0 = tmps[vi[j]]; 2290 sum1 -= v1[j] *tmp0; 2291 sum2 -= v2[j] *tmp0; 2292 sum3 -= v3[j] *tmp0; 2293 sum4 -= v4[j] *tmp0; 2294 sum5 -= v5[j] *tmp0; 2295 } 2296 2297 sum2 -= v2[nz] * sum1; 2298 sum3 -= v3[nz] * sum1; 2299 sum4 -= v4[nz] * sum1; 2300 sum5 -= v5[nz] * sum1; 2301 sum3 -= v3[nz+1] * sum2; 2302 sum4 -= v4[nz+1] * sum2; 2303 sum5 -= v5[nz+1] * sum2; 2304 sum4 -= v4[nz+2] * sum3; 2305 sum5 -= v5[nz+2] * sum3; 2306 sum5 -= v5[nz+3] * sum4; 2307 2308 tmp[row ++]=sum1; 2309 tmp[row ++]=sum2; 2310 tmp[row ++]=sum3; 2311 tmp[row ++]=sum4; 2312 tmp[row ++]=sum5; 2313 break; 2314 default: 2315 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 2316 } 2317 } 2318 /* backward solve the upper triangular */ 2319 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 2320 nsz = ns[i]; 2321 aii = ad[row+1] + 1; 2322 v1 = aa + aii; 2323 vi = aj + aii; 2324 nz = ad[row]- ad[row+1] - 1; 2325 2326 if (i > 0) { 2327 /* Prefetch the indices for the next block */ 2328 PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,0); /* indices */ 2329 /* Prefetch the data for the next block */ 2330 PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,0); 2331 } 2332 2333 switch (nsz){ /* Each loop in 'case' is unrolled */ 2334 case 1 : 2335 sum1 = tmp[row]; 2336 2337 for(j=0 ; j<nz-1; j+=2){ 2338 i0 = vi[j]; 2339 i1 = vi[j+1]; 2340 tmp0 = tmps[i0]; 2341 tmp1 = tmps[i1]; 2342 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2343 } 2344 if (j == nz-1){ 2345 tmp0 = tmps[vi[j]]; 2346 sum1 -= v1[j]*tmp0; 2347 } 2348 x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2349 break; 2350 case 2 : 2351 sum1 = tmp[row]; 2352 sum2 = tmp[row-1]; 2353 v2 = aa + ad[row] + 1; 2354 for (j=0 ; j<nz-1; j+=2){ 2355 i0 = vi[j]; 2356 i1 = vi[j+1]; 2357 tmp0 = tmps[i0]; 2358 tmp1 = tmps[i1]; 2359 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2360 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2361 } 2362 if (j == nz-1){ 2363 tmp0 = tmps[vi[j]]; 2364 sum1 -= v1[j]* tmp0; 2365 sum2 -= v2[j+1]* tmp0; 2366 } 2367 2368 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2369 sum2 -= v2[0] * tmp0; 2370 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2371 break; 2372 case 3 : 2373 sum1 = tmp[row]; 2374 sum2 = tmp[row -1]; 2375 sum3 = tmp[row -2]; 2376 v2 = aa + ad[row] + 1; 2377 v3 = aa + ad[row -1] + 1; 2378 for (j=0 ; j<nz-1; j+=2){ 2379 i0 = vi[j]; 2380 i1 = vi[j+1]; 2381 tmp0 = tmps[i0]; 2382 tmp1 = tmps[i1]; 2383 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2384 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2385 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2386 } 2387 if (j== nz-1){ 2388 tmp0 = tmps[vi[j]]; 2389 sum1 -= v1[j] * tmp0; 2390 sum2 -= v2[j+1] * tmp0; 2391 sum3 -= v3[j+2] * tmp0; 2392 } 2393 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2394 sum2 -= v2[0]* tmp0; 2395 sum3 -= v3[1] * tmp0; 2396 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2397 sum3 -= v3[0]* tmp0; 2398 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2399 2400 break; 2401 case 4 : 2402 sum1 = tmp[row]; 2403 sum2 = tmp[row -1]; 2404 sum3 = tmp[row -2]; 2405 sum4 = tmp[row -3]; 2406 v2 = aa + ad[row]+1; 2407 v3 = aa + ad[row -1]+1; 2408 v4 = aa + ad[row -2]+1; 2409 2410 for (j=0 ; j<nz-1; j+=2){ 2411 i0 = vi[j]; 2412 i1 = vi[j+1]; 2413 tmp0 = tmps[i0]; 2414 tmp1 = tmps[i1]; 2415 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2416 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2417 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2418 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2419 } 2420 if (j== nz-1){ 2421 tmp0 = tmps[vi[j]]; 2422 sum1 -= v1[j] * tmp0; 2423 sum2 -= v2[j+1] * tmp0; 2424 sum3 -= v3[j+2] * tmp0; 2425 sum4 -= v4[j+3] * tmp0; 2426 } 2427 2428 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2429 sum2 -= v2[0] * tmp0; 2430 sum3 -= v3[1] * tmp0; 2431 sum4 -= v4[2] * tmp0; 2432 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2433 sum3 -= v3[0] * tmp0; 2434 sum4 -= v4[1] * tmp0; 2435 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2436 sum4 -= v4[0] * tmp0; 2437 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2438 break; 2439 case 5 : 2440 sum1 = tmp[row]; 2441 sum2 = tmp[row -1]; 2442 sum3 = tmp[row -2]; 2443 sum4 = tmp[row -3]; 2444 sum5 = tmp[row -4]; 2445 v2 = aa + ad[row]+1; 2446 v3 = aa + ad[row -1]+1; 2447 v4 = aa + ad[row -2]+1; 2448 v5 = aa + ad[row -3]+1; 2449 for (j=0 ; j<nz-1; j+=2){ 2450 i0 = vi[j]; 2451 i1 = vi[j+1]; 2452 tmp0 = tmps[i0]; 2453 tmp1 = tmps[i1]; 2454 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2455 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2456 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2457 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2458 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1; 2459 } 2460 if (j==nz-1){ 2461 tmp0 = tmps[vi[j]]; 2462 sum1 -= v1[j] * tmp0; 2463 sum2 -= v2[j+1] * tmp0; 2464 sum3 -= v3[j+2] * tmp0; 2465 sum4 -= v4[j+3] * tmp0; 2466 sum5 -= v5[j+4] * tmp0; 2467 } 2468 2469 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2470 sum2 -= v2[0] * tmp0; 2471 sum3 -= v3[1] * tmp0; 2472 sum4 -= v4[2] * tmp0; 2473 sum5 -= v5[3] * tmp0; 2474 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2475 sum3 -= v3[0] * tmp0; 2476 sum4 -= v4[1] * tmp0; 2477 sum5 -= v5[2] * tmp0; 2478 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2479 sum4 -= v4[0] * tmp0; 2480 sum5 -= v5[1] * tmp0; 2481 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2482 sum5 -= v5[0] * tmp0; 2483 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--; 2484 break; 2485 default: 2486 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 2487 } 2488 } 2489 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2490 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2491 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2492 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2493 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 2494 PetscFunctionReturn(0); 2495 } 2496 2497 2498 /* 2499 Makes a longer coloring[] array and calls the usual code with that 2500 */ 2501 #undef __FUNCT__ 2502 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode" 2503 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 2504 { 2505 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 2506 PetscErrorCode ierr; 2507 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 2508 PetscInt *colorused,i; 2509 ISColoringValue *newcolor; 2510 2511 PetscFunctionBegin; 2512 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr); 2513 /* loop over inodes, marking a color for each column*/ 2514 row = 0; 2515 for (i=0; i<m; i++){ 2516 for (j=0; j<ns[i]; j++) { 2517 newcolor[row++] = coloring[i] + j*ncolors; 2518 } 2519 } 2520 2521 /* eliminate unneeded colors */ 2522 ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr); 2523 ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr); 2524 for (i=0; i<n; i++) { 2525 colorused[newcolor[i]] = 1; 2526 } 2527 2528 for (i=1; i<5*ncolors; i++) { 2529 colorused[i] += colorused[i-1]; 2530 } 2531 ncolors = colorused[5*ncolors-1]; 2532 for (i=0; i<n; i++) { 2533 newcolor[i] = colorused[newcolor[i]]-1; 2534 } 2535 ierr = PetscFree(colorused);CHKERRQ(ierr); 2536 ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); 2537 ierr = PetscFree(coloring);CHKERRQ(ierr); 2538 PetscFunctionReturn(0); 2539 } 2540 2541 #include "../src/mat/blockinvert.h" 2542 2543 #undef __FUNCT__ 2544 #define __FUNCT__ "MatSOR_SeqAIJ_Inode" 2545 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 2546 { 2547 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2548 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3; 2549 MatScalar *ibdiag,*bdiag,work[25]; 2550 PetscScalar *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5; 2551 const MatScalar *v = a->a,*v1,*v2,*v3,*v4,*v5; 2552 PetscReal zeropivot = 1.0e-15, shift = 0.0; 2553 PetscErrorCode ierr; 2554 PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2; 2555 PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5]; 2556 2557 PetscFunctionBegin; 2558 if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 2559 if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 2560 if (its > 1) { 2561 /* switch to non-inode version */ 2562 ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr); 2563 PetscFunctionReturn(0); 2564 } 2565 2566 if (!a->inode.ibdiagvalid) { 2567 if (!a->inode.ibdiag) { 2568 /* calculate space needed for diagonal blocks */ 2569 for (i=0; i<m; i++) { 2570 cnt += sizes[i]*sizes[i]; 2571 } 2572 a->inode.bdiagsize = cnt; 2573 ierr = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr); 2574 } 2575 2576 /* copy over the diagonal blocks and invert them */ 2577 ibdiag = a->inode.ibdiag; 2578 bdiag = a->inode.bdiag; 2579 cnt = 0; 2580 for (i=0, row = 0; i<m; i++) { 2581 for (j=0; j<sizes[i]; j++) { 2582 for (k=0; k<sizes[i]; k++) { 2583 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 2584 } 2585 } 2586 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 2587 2588 switch(sizes[i]) { 2589 case 1: 2590 /* Create matrix data structure */ 2591 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 2592 ibdiag[cnt] = 1.0/ibdiag[cnt]; 2593 break; 2594 case 2: 2595 ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr); 2596 break; 2597 case 3: 2598 ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr); 2599 break; 2600 case 4: 2601 ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr); 2602 break; 2603 case 5: 2604 ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr); 2605 break; 2606 default: 2607 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2608 } 2609 cnt += sizes[i]*sizes[i]; 2610 row += sizes[i]; 2611 } 2612 a->inode.ibdiagvalid = PETSC_TRUE; 2613 } 2614 ibdiag = a->inode.ibdiag; 2615 bdiag = a->inode.bdiag; 2616 2617 2618 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2619 if (flag & SOR_ZERO_INITIAL_GUESS) { 2620 PetscScalar *b; 2621 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2622 if (xx != bb) { 2623 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2624 } else { 2625 b = x; 2626 } 2627 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 2628 2629 for (i=0, row=0; i<m; i++) { 2630 sz = diag[row] - ii[row]; 2631 v1 = a->a + ii[row]; 2632 idx = a->j + ii[row]; 2633 2634 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2635 switch (sizes[i]){ 2636 case 1: 2637 2638 sum1 = b[row]; 2639 for(n = 0; n<sz-1; n+=2) { 2640 i1 = idx[0]; 2641 i2 = idx[1]; 2642 idx += 2; 2643 tmp0 = x[i1]; 2644 tmp1 = x[i2]; 2645 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2646 } 2647 2648 if (n == sz-1){ 2649 tmp0 = x[*idx]; 2650 sum1 -= *v1 * tmp0; 2651 } 2652 x[row++] = sum1*(*ibdiag++); 2653 break; 2654 case 2: 2655 v2 = a->a + ii[row+1]; 2656 sum1 = b[row]; 2657 sum2 = b[row+1]; 2658 for(n = 0; n<sz-1; n+=2) { 2659 i1 = idx[0]; 2660 i2 = idx[1]; 2661 idx += 2; 2662 tmp0 = x[i1]; 2663 tmp1 = x[i2]; 2664 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2665 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2666 } 2667 2668 if (n == sz-1){ 2669 tmp0 = x[*idx]; 2670 sum1 -= v1[0] * tmp0; 2671 sum2 -= v2[0] * tmp0; 2672 } 2673 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2674 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2675 ibdiag += 4; 2676 break; 2677 case 3: 2678 v2 = a->a + ii[row+1]; 2679 v3 = a->a + ii[row+2]; 2680 sum1 = b[row]; 2681 sum2 = b[row+1]; 2682 sum3 = b[row+2]; 2683 for(n = 0; n<sz-1; n+=2) { 2684 i1 = idx[0]; 2685 i2 = idx[1]; 2686 idx += 2; 2687 tmp0 = x[i1]; 2688 tmp1 = x[i2]; 2689 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2690 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2691 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2692 } 2693 2694 if (n == sz-1){ 2695 tmp0 = x[*idx]; 2696 sum1 -= v1[0] * tmp0; 2697 sum2 -= v2[0] * tmp0; 2698 sum3 -= v3[0] * tmp0; 2699 } 2700 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2701 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2702 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2703 ibdiag += 9; 2704 break; 2705 case 4: 2706 v2 = a->a + ii[row+1]; 2707 v3 = a->a + ii[row+2]; 2708 v4 = a->a + ii[row+3]; 2709 sum1 = b[row]; 2710 sum2 = b[row+1]; 2711 sum3 = b[row+2]; 2712 sum4 = b[row+3]; 2713 for(n = 0; n<sz-1; n+=2) { 2714 i1 = idx[0]; 2715 i2 = idx[1]; 2716 idx += 2; 2717 tmp0 = x[i1]; 2718 tmp1 = x[i2]; 2719 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2720 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2721 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2722 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2723 } 2724 2725 if (n == sz-1){ 2726 tmp0 = x[*idx]; 2727 sum1 -= v1[0] * tmp0; 2728 sum2 -= v2[0] * tmp0; 2729 sum3 -= v3[0] * tmp0; 2730 sum4 -= v4[0] * tmp0; 2731 } 2732 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2733 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2734 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2735 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2736 ibdiag += 16; 2737 break; 2738 case 5: 2739 v2 = a->a + ii[row+1]; 2740 v3 = a->a + ii[row+2]; 2741 v4 = a->a + ii[row+3]; 2742 v5 = a->a + ii[row+4]; 2743 sum1 = b[row]; 2744 sum2 = b[row+1]; 2745 sum3 = b[row+2]; 2746 sum4 = b[row+3]; 2747 sum5 = b[row+4]; 2748 for(n = 0; n<sz-1; n+=2) { 2749 i1 = idx[0]; 2750 i2 = idx[1]; 2751 idx += 2; 2752 tmp0 = x[i1]; 2753 tmp1 = x[i2]; 2754 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2755 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2756 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2757 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2758 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2759 } 2760 2761 if (n == sz-1){ 2762 tmp0 = x[*idx]; 2763 sum1 -= v1[0] * tmp0; 2764 sum2 -= v2[0] * tmp0; 2765 sum3 -= v3[0] * tmp0; 2766 sum4 -= v4[0] * tmp0; 2767 sum5 -= v5[0] * tmp0; 2768 } 2769 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2770 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2771 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2772 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2773 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2774 ibdiag += 25; 2775 break; 2776 default: 2777 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2778 } 2779 } 2780 2781 xb = x; 2782 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2783 } else xb = b; 2784 if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) && 2785 (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) { 2786 cnt = 0; 2787 for (i=0, row=0; i<m; i++) { 2788 2789 switch (sizes[i]){ 2790 case 1: 2791 x[row++] *= bdiag[cnt++]; 2792 break; 2793 case 2: 2794 x1 = x[row]; x2 = x[row+1]; 2795 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 2796 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 2797 x[row++] = tmp1; 2798 x[row++] = tmp2; 2799 cnt += 4; 2800 break; 2801 case 3: 2802 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 2803 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 2804 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 2805 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 2806 x[row++] = tmp1; 2807 x[row++] = tmp2; 2808 x[row++] = tmp3; 2809 cnt += 9; 2810 break; 2811 case 4: 2812 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 2813 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 2814 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 2815 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 2816 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 2817 x[row++] = tmp1; 2818 x[row++] = tmp2; 2819 x[row++] = tmp3; 2820 x[row++] = tmp4; 2821 cnt += 16; 2822 break; 2823 case 5: 2824 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 2825 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 2826 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 2827 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 2828 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 2829 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 2830 x[row++] = tmp1; 2831 x[row++] = tmp2; 2832 x[row++] = tmp3; 2833 x[row++] = tmp4; 2834 x[row++] = tmp5; 2835 cnt += 25; 2836 break; 2837 default: 2838 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2839 } 2840 } 2841 ierr = PetscLogFlops(m);CHKERRQ(ierr); 2842 } 2843 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 2844 2845 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 2846 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 2847 ibdiag -= sizes[i]*sizes[i]; 2848 sz = ii[row+1] - diag[row] - 1; 2849 v1 = a->a + diag[row] + 1; 2850 idx = a->j + diag[row] + 1; 2851 2852 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2853 switch (sizes[i]){ 2854 case 1: 2855 2856 sum1 = xb[row]; 2857 for(n = 0; n<sz-1; n+=2) { 2858 i1 = idx[0]; 2859 i2 = idx[1]; 2860 idx += 2; 2861 tmp0 = x[i1]; 2862 tmp1 = x[i2]; 2863 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2864 } 2865 2866 if (n == sz-1){ 2867 tmp0 = x[*idx]; 2868 sum1 -= *v1*tmp0; 2869 } 2870 x[row--] = sum1*(*ibdiag); 2871 break; 2872 2873 case 2: 2874 2875 sum1 = xb[row]; 2876 sum2 = xb[row-1]; 2877 /* note that sum1 is associated with the second of the two rows */ 2878 v2 = a->a + diag[row-1] + 2; 2879 for(n = 0; n<sz-1; n+=2) { 2880 i1 = idx[0]; 2881 i2 = idx[1]; 2882 idx += 2; 2883 tmp0 = x[i1]; 2884 tmp1 = x[i2]; 2885 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2886 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2887 } 2888 2889 if (n == sz-1){ 2890 tmp0 = x[*idx]; 2891 sum1 -= *v1*tmp0; 2892 sum2 -= *v2*tmp0; 2893 } 2894 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 2895 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 2896 break; 2897 case 3: 2898 2899 sum1 = xb[row]; 2900 sum2 = xb[row-1]; 2901 sum3 = xb[row-2]; 2902 v2 = a->a + diag[row-1] + 2; 2903 v3 = a->a + diag[row-2] + 3; 2904 for(n = 0; n<sz-1; n+=2) { 2905 i1 = idx[0]; 2906 i2 = idx[1]; 2907 idx += 2; 2908 tmp0 = x[i1]; 2909 tmp1 = x[i2]; 2910 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2911 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2912 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2913 } 2914 2915 if (n == sz-1){ 2916 tmp0 = x[*idx]; 2917 sum1 -= *v1*tmp0; 2918 sum2 -= *v2*tmp0; 2919 sum3 -= *v3*tmp0; 2920 } 2921 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 2922 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 2923 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 2924 break; 2925 case 4: 2926 2927 sum1 = xb[row]; 2928 sum2 = xb[row-1]; 2929 sum3 = xb[row-2]; 2930 sum4 = xb[row-3]; 2931 v2 = a->a + diag[row-1] + 2; 2932 v3 = a->a + diag[row-2] + 3; 2933 v4 = a->a + diag[row-3] + 4; 2934 for(n = 0; n<sz-1; n+=2) { 2935 i1 = idx[0]; 2936 i2 = idx[1]; 2937 idx += 2; 2938 tmp0 = x[i1]; 2939 tmp1 = x[i2]; 2940 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2941 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2942 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2943 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2944 } 2945 2946 if (n == sz-1){ 2947 tmp0 = x[*idx]; 2948 sum1 -= *v1*tmp0; 2949 sum2 -= *v2*tmp0; 2950 sum3 -= *v3*tmp0; 2951 sum4 -= *v4*tmp0; 2952 } 2953 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 2954 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 2955 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 2956 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 2957 break; 2958 case 5: 2959 2960 sum1 = xb[row]; 2961 sum2 = xb[row-1]; 2962 sum3 = xb[row-2]; 2963 sum4 = xb[row-3]; 2964 sum5 = xb[row-4]; 2965 v2 = a->a + diag[row-1] + 2; 2966 v3 = a->a + diag[row-2] + 3; 2967 v4 = a->a + diag[row-3] + 4; 2968 v5 = a->a + diag[row-4] + 5; 2969 for(n = 0; n<sz-1; n+=2) { 2970 i1 = idx[0]; 2971 i2 = idx[1]; 2972 idx += 2; 2973 tmp0 = x[i1]; 2974 tmp1 = x[i2]; 2975 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2976 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2977 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2978 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2979 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2980 } 2981 2982 if (n == sz-1){ 2983 tmp0 = x[*idx]; 2984 sum1 -= *v1*tmp0; 2985 sum2 -= *v2*tmp0; 2986 sum3 -= *v3*tmp0; 2987 sum4 -= *v4*tmp0; 2988 sum5 -= *v5*tmp0; 2989 } 2990 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 2991 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 2992 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 2993 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 2994 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 2995 break; 2996 default: 2997 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2998 } 2999 } 3000 3001 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3002 } 3003 its--; 3004 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3005 if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);} 3006 } 3007 if (flag & SOR_EISENSTAT) { 3008 const PetscScalar *b; 3009 MatScalar *t = a->inode.ssor_work; 3010 3011 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3012 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3013 /* 3014 Apply (U + D)^-1 where D is now the block diagonal 3015 */ 3016 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3017 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3018 ibdiag -= sizes[i]*sizes[i]; 3019 sz = ii[row+1] - diag[row] - 1; 3020 v1 = a->a + diag[row] + 1; 3021 idx = a->j + diag[row] + 1; 3022 CHKMEMQ; 3023 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3024 switch (sizes[i]){ 3025 case 1: 3026 3027 sum1 = b[row]; 3028 for(n = 0; n<sz-1; n+=2) { 3029 i1 = idx[0]; 3030 i2 = idx[1]; 3031 idx += 2; 3032 tmp0 = x[i1]; 3033 tmp1 = x[i2]; 3034 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3035 } 3036 3037 if (n == sz-1){ 3038 tmp0 = x[*idx]; 3039 sum1 -= *v1*tmp0; 3040 } 3041 x[row] = sum1*(*ibdiag);row--; 3042 break; 3043 3044 case 2: 3045 3046 sum1 = b[row]; 3047 sum2 = b[row-1]; 3048 /* note that sum1 is associated with the second of the two rows */ 3049 v2 = a->a + diag[row-1] + 2; 3050 for(n = 0; n<sz-1; n+=2) { 3051 i1 = idx[0]; 3052 i2 = idx[1]; 3053 idx += 2; 3054 tmp0 = x[i1]; 3055 tmp1 = x[i2]; 3056 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3057 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3058 } 3059 3060 if (n == sz-1){ 3061 tmp0 = x[*idx]; 3062 sum1 -= *v1*tmp0; 3063 sum2 -= *v2*tmp0; 3064 } 3065 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3066 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3067 row -= 2; 3068 break; 3069 case 3: 3070 3071 sum1 = b[row]; 3072 sum2 = b[row-1]; 3073 sum3 = b[row-2]; 3074 v2 = a->a + diag[row-1] + 2; 3075 v3 = a->a + diag[row-2] + 3; 3076 for(n = 0; n<sz-1; n+=2) { 3077 i1 = idx[0]; 3078 i2 = idx[1]; 3079 idx += 2; 3080 tmp0 = x[i1]; 3081 tmp1 = x[i2]; 3082 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3083 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3084 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3085 } 3086 3087 if (n == sz-1){ 3088 tmp0 = x[*idx]; 3089 sum1 -= *v1*tmp0; 3090 sum2 -= *v2*tmp0; 3091 sum3 -= *v3*tmp0; 3092 } 3093 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3094 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3095 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3096 row -= 3; 3097 break; 3098 case 4: 3099 3100 sum1 = b[row]; 3101 sum2 = b[row-1]; 3102 sum3 = b[row-2]; 3103 sum4 = b[row-3]; 3104 v2 = a->a + diag[row-1] + 2; 3105 v3 = a->a + diag[row-2] + 3; 3106 v4 = a->a + diag[row-3] + 4; 3107 for(n = 0; n<sz-1; n+=2) { 3108 i1 = idx[0]; 3109 i2 = idx[1]; 3110 idx += 2; 3111 tmp0 = x[i1]; 3112 tmp1 = x[i2]; 3113 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3114 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3115 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3116 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3117 } 3118 3119 if (n == sz-1){ 3120 tmp0 = x[*idx]; 3121 sum1 -= *v1*tmp0; 3122 sum2 -= *v2*tmp0; 3123 sum3 -= *v3*tmp0; 3124 sum4 -= *v4*tmp0; 3125 } 3126 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3127 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3128 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3129 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3130 row -= 4; 3131 break; 3132 case 5: 3133 3134 sum1 = b[row]; 3135 sum2 = b[row-1]; 3136 sum3 = b[row-2]; 3137 sum4 = b[row-3]; 3138 sum5 = b[row-4]; 3139 v2 = a->a + diag[row-1] + 2; 3140 v3 = a->a + diag[row-2] + 3; 3141 v4 = a->a + diag[row-3] + 4; 3142 v5 = a->a + diag[row-4] + 5; 3143 for(n = 0; n<sz-1; n+=2) { 3144 i1 = idx[0]; 3145 i2 = idx[1]; 3146 idx += 2; 3147 tmp0 = x[i1]; 3148 tmp1 = x[i2]; 3149 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3150 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3151 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3152 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3153 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3154 } 3155 3156 if (n == sz-1){ 3157 tmp0 = x[*idx]; 3158 sum1 -= *v1*tmp0; 3159 sum2 -= *v2*tmp0; 3160 sum3 -= *v3*tmp0; 3161 sum4 -= *v4*tmp0; 3162 sum5 -= *v5*tmp0; 3163 } 3164 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3165 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3166 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3167 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3168 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3169 row -= 5; 3170 break; 3171 default: 3172 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3173 } 3174 CHKMEMQ; 3175 } 3176 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3177 3178 /* 3179 t = b - D x where D is the block diagonal 3180 */ 3181 cnt = 0; 3182 for (i=0, row=0; i<m; i++) { 3183 CHKMEMQ; 3184 switch (sizes[i]){ 3185 case 1: 3186 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 3187 break; 3188 case 2: 3189 x1 = x[row]; x2 = x[row+1]; 3190 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3191 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3192 t[row] = b[row] - tmp1; 3193 t[row+1] = b[row+1] - tmp2; row += 2; 3194 cnt += 4; 3195 break; 3196 case 3: 3197 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3198 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3199 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3200 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3201 t[row] = b[row] - tmp1; 3202 t[row+1] = b[row+1] - tmp2; 3203 t[row+2] = b[row+2] - tmp3; row += 3; 3204 cnt += 9; 3205 break; 3206 case 4: 3207 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3208 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3209 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3210 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3211 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3212 t[row] = b[row] - tmp1; 3213 t[row+1] = b[row+1] - tmp2; 3214 t[row+2] = b[row+2] - tmp3; 3215 t[row+3] = b[row+3] - tmp4; row += 4; 3216 cnt += 16; 3217 break; 3218 case 5: 3219 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3220 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3221 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3222 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3223 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3224 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3225 t[row] = b[row] - tmp1; 3226 t[row+1] = b[row+1] - tmp2; 3227 t[row+2] = b[row+2] - tmp3; 3228 t[row+3] = b[row+3] - tmp4; 3229 t[row+4] = b[row+4] - tmp5;row += 5; 3230 cnt += 25; 3231 break; 3232 default: 3233 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3234 } 3235 CHKMEMQ; 3236 } 3237 ierr = PetscLogFlops(m);CHKERRQ(ierr); 3238 3239 3240 3241 /* 3242 Apply (L + D)^-1 where D is the block diagonal 3243 */ 3244 for (i=0, row=0; i<m; i++) { 3245 sz = diag[row] - ii[row]; 3246 v1 = a->a + ii[row]; 3247 idx = a->j + ii[row]; 3248 CHKMEMQ; 3249 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3250 switch (sizes[i]){ 3251 case 1: 3252 3253 sum1 = t[row]; 3254 for(n = 0; n<sz-1; n+=2) { 3255 i1 = idx[0]; 3256 i2 = idx[1]; 3257 idx += 2; 3258 tmp0 = t[i1]; 3259 tmp1 = t[i2]; 3260 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3261 } 3262 3263 if (n == sz-1){ 3264 tmp0 = t[*idx]; 3265 sum1 -= *v1 * tmp0; 3266 } 3267 x[row] += t[row] = sum1*(*ibdiag++); row++; 3268 break; 3269 case 2: 3270 v2 = a->a + ii[row+1]; 3271 sum1 = t[row]; 3272 sum2 = t[row+1]; 3273 for(n = 0; n<sz-1; n+=2) { 3274 i1 = idx[0]; 3275 i2 = idx[1]; 3276 idx += 2; 3277 tmp0 = t[i1]; 3278 tmp1 = t[i2]; 3279 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3280 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3281 } 3282 3283 if (n == sz-1){ 3284 tmp0 = t[*idx]; 3285 sum1 -= v1[0] * tmp0; 3286 sum2 -= v2[0] * tmp0; 3287 } 3288 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3289 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3290 ibdiag += 4; row += 2; 3291 break; 3292 case 3: 3293 v2 = a->a + ii[row+1]; 3294 v3 = a->a + ii[row+2]; 3295 sum1 = t[row]; 3296 sum2 = t[row+1]; 3297 sum3 = t[row+2]; 3298 for(n = 0; n<sz-1; n+=2) { 3299 i1 = idx[0]; 3300 i2 = idx[1]; 3301 idx += 2; 3302 tmp0 = t[i1]; 3303 tmp1 = t[i2]; 3304 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3305 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3306 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3307 } 3308 3309 if (n == sz-1){ 3310 tmp0 = t[*idx]; 3311 sum1 -= v1[0] * tmp0; 3312 sum2 -= v2[0] * tmp0; 3313 sum3 -= v3[0] * tmp0; 3314 } 3315 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3316 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3317 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3318 ibdiag += 9; row += 3; 3319 break; 3320 case 4: 3321 v2 = a->a + ii[row+1]; 3322 v3 = a->a + ii[row+2]; 3323 v4 = a->a + ii[row+3]; 3324 sum1 = t[row]; 3325 sum2 = t[row+1]; 3326 sum3 = t[row+2]; 3327 sum4 = t[row+3]; 3328 for(n = 0; n<sz-1; n+=2) { 3329 i1 = idx[0]; 3330 i2 = idx[1]; 3331 idx += 2; 3332 tmp0 = t[i1]; 3333 tmp1 = t[i2]; 3334 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3335 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3336 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3337 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3338 } 3339 3340 if (n == sz-1){ 3341 tmp0 = t[*idx]; 3342 sum1 -= v1[0] * tmp0; 3343 sum2 -= v2[0] * tmp0; 3344 sum3 -= v3[0] * tmp0; 3345 sum4 -= v4[0] * tmp0; 3346 } 3347 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3348 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3349 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3350 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3351 ibdiag += 16; row += 4; 3352 break; 3353 case 5: 3354 v2 = a->a + ii[row+1]; 3355 v3 = a->a + ii[row+2]; 3356 v4 = a->a + ii[row+3]; 3357 v5 = a->a + ii[row+4]; 3358 sum1 = t[row]; 3359 sum2 = t[row+1]; 3360 sum3 = t[row+2]; 3361 sum4 = t[row+3]; 3362 sum5 = t[row+4]; 3363 for(n = 0; n<sz-1; n+=2) { 3364 i1 = idx[0]; 3365 i2 = idx[1]; 3366 idx += 2; 3367 tmp0 = t[i1]; 3368 tmp1 = t[i2]; 3369 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3370 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3371 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3372 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3373 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3374 } 3375 3376 if (n == sz-1){ 3377 tmp0 = t[*idx]; 3378 sum1 -= v1[0] * tmp0; 3379 sum2 -= v2[0] * tmp0; 3380 sum3 -= v3[0] * tmp0; 3381 sum4 -= v4[0] * tmp0; 3382 sum5 -= v5[0] * tmp0; 3383 } 3384 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3385 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3386 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3387 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3388 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3389 ibdiag += 25; row += 5; 3390 break; 3391 default: 3392 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3393 } 3394 CHKMEMQ; 3395 } 3396 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3397 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3398 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3399 } 3400 PetscFunctionReturn(0); 3401 } 3402 3403 #undef __FUNCT__ 3404 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode" 3405 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 3406 { 3407 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3408 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 3409 const MatScalar *bdiag = a->inode.bdiag; 3410 const PetscScalar *b; 3411 PetscErrorCode ierr; 3412 PetscInt m = a->inode.node_count,cnt = 0,i,row; 3413 const PetscInt *sizes = a->inode.size; 3414 3415 PetscFunctionBegin; 3416 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3417 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3418 cnt = 0; 3419 for (i=0, row=0; i<m; i++) { 3420 switch (sizes[i]){ 3421 case 1: 3422 x[row] = b[row]*bdiag[cnt++];row++; 3423 break; 3424 case 2: 3425 x1 = b[row]; x2 = b[row+1]; 3426 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3427 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3428 x[row++] = tmp1; 3429 x[row++] = tmp2; 3430 cnt += 4; 3431 break; 3432 case 3: 3433 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 3434 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3435 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3436 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3437 x[row++] = tmp1; 3438 x[row++] = tmp2; 3439 x[row++] = tmp3; 3440 cnt += 9; 3441 break; 3442 case 4: 3443 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 3444 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3445 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3446 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3447 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3448 x[row++] = tmp1; 3449 x[row++] = tmp2; 3450 x[row++] = tmp3; 3451 x[row++] = tmp4; 3452 cnt += 16; 3453 break; 3454 case 5: 3455 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 3456 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3457 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3458 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3459 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3460 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3461 x[row++] = tmp1; 3462 x[row++] = tmp2; 3463 x[row++] = tmp3; 3464 x[row++] = tmp4; 3465 x[row++] = tmp5; 3466 cnt += 25; 3467 break; 3468 default: 3469 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3470 } 3471 } 3472 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 3473 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3474 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3475 PetscFunctionReturn(0); 3476 } 3477 3478 /* 3479 samestructure indicates that the matrix has not changed its nonzero structure so we 3480 do not need to recompute the inodes 3481 */ 3482 #undef __FUNCT__ 3483 #define __FUNCT__ "Mat_CheckInode" 3484 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure) 3485 { 3486 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3487 PetscErrorCode ierr; 3488 PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size; 3489 PetscTruth flag; 3490 3491 PetscFunctionBegin; 3492 if (!a->inode.use) PetscFunctionReturn(0); 3493 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 3494 3495 3496 m = A->rmap->n; 3497 if (a->inode.size) {ns = a->inode.size;} 3498 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 3499 3500 i = 0; 3501 node_count = 0; 3502 idx = a->j; 3503 ii = a->i; 3504 while (i < m){ /* For each row */ 3505 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 3506 /* Limits the number of elements in a node to 'a->inode.limit' */ 3507 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 3508 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 3509 if (nzy != nzx) break; 3510 idy += nzx; /* Same nonzero pattern */ 3511 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 3512 if (!flag) break; 3513 } 3514 ns[node_count++] = blk_size; 3515 idx += blk_size*nzx; 3516 i = j; 3517 } 3518 /* If not enough inodes found,, do not use inode version of the routines */ 3519 if (!a->inode.size && m && node_count > .9*m) { 3520 ierr = PetscFree(ns);CHKERRQ(ierr); 3521 a->inode.node_count = 0; 3522 a->inode.size = PETSC_NULL; 3523 a->inode.use = PETSC_FALSE; 3524 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 3525 } else { 3526 A->ops->mult = MatMult_SeqAIJ_Inode; 3527 A->ops->sor = MatSOR_SeqAIJ_Inode; 3528 A->ops->multadd = MatMultAdd_SeqAIJ_Inode; 3529 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 3530 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 3531 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 3532 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 3533 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 3534 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 3535 A->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJ_Inode_inplace; 3536 a->inode.node_count = node_count; 3537 a->inode.size = ns; 3538 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 3539 } 3540 PetscFunctionReturn(0); 3541 } 3542 3543 #define MatGetRow_FactoredLU(cols,nzl,nzu,nz,ai,aj,adiag,row) { \ 3544 PetscInt __k, *__vi; \ 3545 __vi = aj + ai[row]; \ 3546 for(__k=0;__k<nzl;__k++) cols[__k] = __vi[__k]; \ 3547 __vi = aj + adiag[row]; \ 3548 cols[nzl] = __vi[0];\ 3549 __vi = aj + adiag[row+1]+1;\ 3550 for(__k=0;__k<nzu;__k++) cols[nzl+1+__k] = __vi[__k];} 3551 3552 3553 /* 3554 Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix. 3555 Modified from Mat_CheckInode(). 3556 3557 Input Parameters: 3558 + Mat A - ILU or LU matrix factor 3559 - samestructure - TURE indicates that the matrix has not changed its nonzero structure so we 3560 do not need to recompute the inodes 3561 */ 3562 #undef __FUNCT__ 3563 #define __FUNCT__ "Mat_CheckInode_FactorLU" 3564 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscTruth samestructure) 3565 { 3566 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3567 PetscErrorCode ierr; 3568 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; 3569 PetscInt *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag; 3570 PetscTruth flag; 3571 3572 PetscFunctionBegin; 3573 if (!a->inode.use) PetscFunctionReturn(0); 3574 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 3575 3576 m = A->rmap->n; 3577 if (a->inode.size) {ns = a->inode.size;} 3578 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 3579 3580 i = 0; 3581 node_count = 0; 3582 while (i < m){ /* For each row */ 3583 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ 3584 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ 3585 nzx = nzl1 + nzu1 + 1; 3586 ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr); 3587 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); 3588 3589 /* Limits the number of elements in a node to 'a->inode.limit' */ 3590 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 3591 nzl2 = ai[j+1] - ai[j]; 3592 nzu2 = adiag[j] - adiag[j+1] - 1; 3593 nzy = nzl2 + nzu2 + 1; 3594 if( nzy != nzx) break; 3595 ierr = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr); 3596 MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j); 3597 ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 3598 if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;} 3599 ierr = PetscFree(cols2);CHKERRQ(ierr); 3600 } 3601 ns[node_count++] = blk_size; 3602 ierr = PetscFree(cols1);CHKERRQ(ierr); 3603 i = j; 3604 } 3605 /* If not enough inodes found,, do not use inode version of the routines */ 3606 if (!a->inode.size && m && node_count > .9*m) { 3607 ierr = PetscFree(ns);CHKERRQ(ierr); 3608 a->inode.node_count = 0; 3609 a->inode.size = PETSC_NULL; 3610 a->inode.use = PETSC_FALSE; 3611 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 3612 } else { 3613 A->ops->mult = 0; 3614 A->ops->sor = 0; 3615 A->ops->multadd = 0; 3616 A->ops->getrowij = 0; 3617 A->ops->restorerowij = 0; 3618 A->ops->getcolumnij = 0; 3619 A->ops->restorecolumnij = 0; 3620 A->ops->coloringpatch = 0; 3621 A->ops->multdiagonalblock = 0; 3622 A->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJ_Inode; /* not done yet */ 3623 a->inode.node_count = node_count; 3624 a->inode.size = ns; 3625 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 3626 } 3627 PetscFunctionReturn(0); 3628 } 3629 3630 /* 3631 This is really ugly. if inodes are used this replaces the 3632 permutations with ones that correspond to rows/cols of the matrix 3633 rather then inode blocks 3634 */ 3635 #undef __FUNCT__ 3636 #define __FUNCT__ "MatInodeAdjustForInodes" 3637 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 3638 { 3639 PetscErrorCode ierr,(*f)(Mat,IS*,IS*); 3640 3641 PetscFunctionBegin; 3642 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr); 3643 if (f) { 3644 ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr); 3645 } 3646 PetscFunctionReturn(0); 3647 } 3648 3649 EXTERN_C_BEGIN 3650 #undef __FUNCT__ 3651 #define __FUNCT__ "MatAdjustForInodes_SeqAIJ_Inode" 3652 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 3653 { 3654 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 3655 PetscErrorCode ierr; 3656 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 3657 const PetscInt *ridx,*cidx; 3658 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 3659 PetscInt nslim_col,*ns_col; 3660 IS ris = *rperm,cis = *cperm; 3661 3662 PetscFunctionBegin; 3663 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 3664 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 3665 3666 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 3667 ierr = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 3668 ierr = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr); 3669 3670 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 3671 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 3672 3673 /* Form the inode structure for the rows of permuted matric using inv perm*/ 3674 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 3675 3676 /* Construct the permutations for rows*/ 3677 for (i=0,row = 0; i<nslim_row; ++i){ 3678 indx = ridx[i]; 3679 start_val = tns[indx]; 3680 end_val = tns[indx + 1]; 3681 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 3682 } 3683 3684 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 3685 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 3686 3687 /* Construct permutations for columns */ 3688 for (i=0,col=0; i<nslim_col; ++i){ 3689 indx = cidx[i]; 3690 start_val = tns[indx]; 3691 end_val = tns[indx + 1]; 3692 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 3693 } 3694 3695 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr); 3696 ierr = ISSetPermutation(*rperm);CHKERRQ(ierr); 3697 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr); 3698 ierr = ISSetPermutation(*cperm);CHKERRQ(ierr); 3699 3700 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 3701 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 3702 3703 ierr = PetscFree(ns_col);CHKERRQ(ierr); 3704 ierr = PetscFree2(permr,permc);CHKERRQ(ierr); 3705 ierr = ISDestroy(cis);CHKERRQ(ierr); 3706 ierr = ISDestroy(ris);CHKERRQ(ierr); 3707 ierr = PetscFree(tns);CHKERRQ(ierr); 3708 PetscFunctionReturn(0); 3709 } 3710 EXTERN_C_END 3711 3712 #undef __FUNCT__ 3713 #define __FUNCT__ "MatInodeGetInodeSizes" 3714 /*@C 3715 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 3716 3717 Collective on Mat 3718 3719 Input Parameter: 3720 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 3721 3722 Output Parameter: 3723 + node_count - no of inodes present in the matrix. 3724 . sizes - an array of size node_count,with sizes of each inode. 3725 - limit - the max size used to generate the inodes. 3726 3727 Level: advanced 3728 3729 Notes: This routine returns some internal storage information 3730 of the matrix, it is intended to be used by advanced users. 3731 It should be called after the matrix is assembled. 3732 The contents of the sizes[] array should not be changed. 3733 PETSC_NULL may be passed for information not requested. 3734 3735 .keywords: matrix, seqaij, get, inode 3736 3737 .seealso: MatGetInfo() 3738 @*/ 3739 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 3740 { 3741 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 3742 3743 PetscFunctionBegin; 3744 if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 3745 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr); 3746 if (f) { 3747 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 3748 } 3749 PetscFunctionReturn(0); 3750 } 3751 3752 EXTERN_C_BEGIN 3753 #undef __FUNCT__ 3754 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode" 3755 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 3756 { 3757 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3758 3759 PetscFunctionBegin; 3760 if (node_count) *node_count = a->inode.node_count; 3761 if (sizes) *sizes = a->inode.size; 3762 if (limit) *limit = a->inode.limit; 3763 PetscFunctionReturn(0); 3764 } 3765 EXTERN_C_END 3766