1 #define PETSCMAT_DLL 2 3 /* 4 This file provides high performance routines for the Inode format (compressed sparse row) 5 by taking advantage of rows with identical nonzero structure (I-nodes). 6 */ 7 #include "../src/mat/impls/aij/seq/aij.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "Mat_CreateColInode" 11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns) 12 { 13 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 16 17 PetscFunctionBegin; 18 n = A->cmap->n; 19 m = A->rmap->n; 20 ns_row = a->inode.size; 21 22 min_mn = (m < n) ? m : n; 23 if (!ns) { 24 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++); 25 for(; count+1 < n; count++,i++); 26 if (count < n) { 27 i++; 28 } 29 *size = i; 30 PetscFunctionReturn(0); 31 } 32 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr); 33 34 /* Use the same row structure wherever feasible. */ 35 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 36 ns_col[i] = ns_row[i]; 37 } 38 39 /* if m < n; pad up the remainder with inode_limit */ 40 for(; count+1 < n; count++,i++) { 41 ns_col[i] = 1; 42 } 43 /* The last node is the odd ball. padd it up with the remaining rows; */ 44 if (count < n) { 45 ns_col[i] = n - count; 46 i++; 47 } else if (count > n) { 48 /* Adjust for the over estimation */ 49 ns_col[i-1] += n - count; 50 } 51 *size = i; 52 *ns = ns_col; 53 PetscFunctionReturn(0); 54 } 55 56 57 /* 58 This builds symmetric version of nonzero structure, 59 */ 60 #undef __FUNCT__ 61 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric" 62 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 63 { 64 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 65 PetscErrorCode ierr; 66 PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n; 67 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j; 68 69 PetscFunctionBegin; 70 nslim_row = a->inode.node_count; 71 m = A->rmap->n; 72 n = A->cmap->n; 73 if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 74 75 /* Use the row_inode as column_inode */ 76 nslim_col = nslim_row; 77 ns_col = ns_row; 78 79 /* allocate space for reformated inode structure */ 80 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 81 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 82 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 83 84 for (i1=0,col=0; i1<nslim_col; ++i1){ 85 nsz = ns_col[i1]; 86 for (i2=0; i2<nsz; ++i2,++col) 87 tvc[col] = i1; 88 } 89 /* allocate space for row pointers */ 90 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 91 *iia = ia; 92 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 93 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 94 95 /* determine the number of columns in each row */ 96 ia[0] = oshift; 97 for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) { 98 99 j = aj + ai[row] + ishift; 100 jmax = aj + ai[row+1] + ishift; 101 i2 = 0; 102 col = *j++ + ishift; 103 i2 = tvc[col]; 104 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 105 ia[i1+1]++; 106 ia[i2+1]++; 107 i2++; /* Start col of next node */ 108 while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j; 109 i2 = tvc[col]; 110 } 111 if(i2 == i1) ia[i2+1]++; /* now the diagonal element */ 112 } 113 114 /* shift ia[i] to point to next row */ 115 for (i1=1; i1<nslim_row+1; i1++) { 116 row = ia[i1-1]; 117 ia[i1] += row; 118 work[i1-1] = row - oshift; 119 } 120 121 /* allocate space for column pointers */ 122 nz = ia[nslim_row] + (!ishift); 123 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 124 *jja = ja; 125 126 /* loop over lower triangular part putting into ja */ 127 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 128 j = aj + ai[row] + ishift; 129 jmax = aj + ai[row+1] + ishift; 130 i2 = 0; /* Col inode index */ 131 col = *j++ + ishift; 132 i2 = tvc[col]; 133 while (i2<i1 && j<jmax) { 134 ja[work[i2]++] = i1 + oshift; 135 ja[work[i1]++] = i2 + oshift; 136 ++i2; 137 while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */ 138 i2 = tvc[col]; 139 } 140 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 141 142 } 143 ierr = PetscFree(work);CHKERRQ(ierr); 144 ierr = PetscFree(tns);CHKERRQ(ierr); 145 ierr = PetscFree(tvc);CHKERRQ(ierr); 146 PetscFunctionReturn(0); 147 } 148 149 /* 150 This builds nonsymmetric version of nonzero structure, 151 */ 152 #undef __FUNCT__ 153 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric" 154 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 155 { 156 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 157 PetscErrorCode ierr; 158 PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col; 159 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 160 161 PetscFunctionBegin; 162 nslim_row = a->inode.node_count; 163 n = A->cmap->n; 164 165 /* Create The column_inode for this matrix */ 166 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 167 168 /* allocate space for reformated column_inode structure */ 169 ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 170 ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 171 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 172 173 for (i1=0,col=0; i1<nslim_col; ++i1){ 174 nsz = ns_col[i1]; 175 for (i2=0; i2<nsz; ++i2,++col) 176 tvc[col] = i1; 177 } 178 /* allocate space for row pointers */ 179 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 180 *iia = ia; 181 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 182 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 183 184 /* determine the number of columns in each row */ 185 ia[0] = oshift; 186 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 187 j = aj + ai[row] + ishift; 188 col = *j++ + ishift; 189 i2 = tvc[col]; 190 nz = ai[row+1] - ai[row]; 191 while (nz-- > 0) { /* off-diagonal elemets */ 192 ia[i1+1]++; 193 i2++; /* Start col of next node */ 194 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 195 if (nz > 0) i2 = tvc[col]; 196 } 197 } 198 199 /* shift ia[i] to point to next row */ 200 for (i1=1; i1<nslim_row+1; i1++) { 201 row = ia[i1-1]; 202 ia[i1] += row; 203 work[i1-1] = row - oshift; 204 } 205 206 /* allocate space for column pointers */ 207 nz = ia[nslim_row] + (!ishift); 208 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 209 *jja = ja; 210 211 /* loop over matrix putting into ja */ 212 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 213 j = aj + ai[row] + ishift; 214 i2 = 0; /* Col inode index */ 215 col = *j++ + ishift; 216 i2 = tvc[col]; 217 nz = ai[row+1] - ai[row]; 218 while (nz-- > 0) { 219 ja[work[i1]++] = i2 + oshift; 220 ++i2; 221 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 222 if (nz > 0) i2 = tvc[col]; 223 } 224 } 225 ierr = PetscFree(ns_col);CHKERRQ(ierr); 226 ierr = PetscFree(work);CHKERRQ(ierr); 227 ierr = PetscFree(tns);CHKERRQ(ierr); 228 ierr = PetscFree(tvc);CHKERRQ(ierr); 229 PetscFunctionReturn(0); 230 } 231 232 #undef __FUNCT__ 233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode" 234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 235 { 236 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 237 PetscErrorCode ierr; 238 239 PetscFunctionBegin; 240 *n = a->inode.node_count; 241 if (!ia) PetscFunctionReturn(0); 242 if (!blockcompressed) { 243 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 244 } else if (symmetric) { 245 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 246 } else { 247 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 248 } 249 PetscFunctionReturn(0); 250 } 251 252 #undef __FUNCT__ 253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode" 254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 255 { 256 PetscErrorCode ierr; 257 258 PetscFunctionBegin; 259 if (!ia) PetscFunctionReturn(0); 260 261 if (!blockcompressed) { 262 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 263 } else { 264 ierr = PetscFree(*ia);CHKERRQ(ierr); 265 ierr = PetscFree(*ja);CHKERRQ(ierr); 266 } 267 268 PetscFunctionReturn(0); 269 } 270 271 /* ----------------------------------------------------------- */ 272 273 #undef __FUNCT__ 274 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric" 275 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 276 { 277 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 278 PetscErrorCode ierr; 279 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 280 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 281 282 PetscFunctionBegin; 283 nslim_row = a->inode.node_count; 284 n = A->cmap->n; 285 286 /* Create The column_inode for this matrix */ 287 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 288 289 /* allocate space for reformated column_inode structure */ 290 ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 291 ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 292 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 293 294 for (i1=0,col=0; i1<nslim_col; ++i1){ 295 nsz = ns_col[i1]; 296 for (i2=0; i2<nsz; ++i2,++col) 297 tvc[col] = i1; 298 } 299 /* allocate space for column pointers */ 300 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 301 *iia = ia; 302 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 303 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 304 305 /* determine the number of columns in each row */ 306 ia[0] = oshift; 307 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 308 j = aj + ai[row] + ishift; 309 col = *j++ + ishift; 310 i2 = tvc[col]; 311 nz = ai[row+1] - ai[row]; 312 while (nz-- > 0) { /* off-diagonal elemets */ 313 /* ia[i1+1]++; */ 314 ia[i2+1]++; 315 i2++; 316 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 317 if (nz > 0) i2 = tvc[col]; 318 } 319 } 320 321 /* shift ia[i] to point to next col */ 322 for (i1=1; i1<nslim_col+1; i1++) { 323 col = ia[i1-1]; 324 ia[i1] += col; 325 work[i1-1] = col - oshift; 326 } 327 328 /* allocate space for column pointers */ 329 nz = ia[nslim_col] + (!ishift); 330 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 331 *jja = ja; 332 333 /* loop over matrix putting into ja */ 334 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 335 j = aj + ai[row] + ishift; 336 i2 = 0; /* Col inode index */ 337 col = *j++ + ishift; 338 i2 = tvc[col]; 339 nz = ai[row+1] - ai[row]; 340 while (nz-- > 0) { 341 /* ja[work[i1]++] = i2 + oshift; */ 342 ja[work[i2]++] = i1 + oshift; 343 i2++; 344 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 345 if (nz > 0) i2 = tvc[col]; 346 } 347 } 348 ierr = PetscFree(ns_col);CHKERRQ(ierr); 349 ierr = PetscFree(work);CHKERRQ(ierr); 350 ierr = PetscFree(tns);CHKERRQ(ierr); 351 ierr = PetscFree(tvc);CHKERRQ(ierr); 352 PetscFunctionReturn(0); 353 } 354 355 #undef __FUNCT__ 356 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode" 357 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 358 { 359 PetscErrorCode ierr; 360 361 PetscFunctionBegin; 362 ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr); 363 if (!ia) PetscFunctionReturn(0); 364 365 if (!blockcompressed) { 366 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 367 } else if (symmetric) { 368 /* Since the indices are symmetric it does'nt matter */ 369 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 370 } else { 371 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 372 } 373 PetscFunctionReturn(0); 374 } 375 376 #undef __FUNCT__ 377 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode" 378 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 379 { 380 PetscErrorCode ierr; 381 382 PetscFunctionBegin; 383 if (!ia) PetscFunctionReturn(0); 384 if (!blockcompressed) { 385 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 386 } else { 387 ierr = PetscFree(*ia);CHKERRQ(ierr); 388 ierr = PetscFree(*ja);CHKERRQ(ierr); 389 } 390 PetscFunctionReturn(0); 391 } 392 393 /* ----------------------------------------------------------- */ 394 395 #undef __FUNCT__ 396 #define __FUNCT__ "MatMult_SeqAIJ_Inode" 397 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 398 { 399 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 400 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 401 PetscScalar *y; 402 const PetscScalar *x; 403 const MatScalar *v1,*v2,*v3,*v4,*v5; 404 PetscErrorCode ierr; 405 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0; 406 407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 409 #endif 410 411 PetscFunctionBegin; 412 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 413 node_max = a->inode.node_count; 414 ns = a->inode.size; /* Node Size array */ 415 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 416 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 417 idx = a->j; 418 v1 = a->a; 419 ii = a->i; 420 421 for (i = 0,row = 0; i< node_max; ++i){ 422 nsz = ns[i]; 423 n = ii[1] - ii[0]; 424 nonzerorow += (n>0)*nsz; 425 ii += nsz; 426 PetscPrefetchBlock(idx+nsz*n,n,0,0); /* Prefetch the indices for the block row after the current one */ 427 PetscPrefetchBlock(v1+nsz*n,nsz*n,0,0); /* Prefetch the values for the block row after the current one */ 428 sz = n; /* No of non zeros in this row */ 429 /* Switch on the size of Node */ 430 switch (nsz){ /* Each loop in 'case' is unrolled */ 431 case 1 : 432 sum1 = 0.; 433 434 for(n = 0; n< sz-1; n+=2) { 435 i1 = idx[0]; /* The instructions are ordered to */ 436 i2 = idx[1]; /* make the compiler's job easy */ 437 idx += 2; 438 tmp0 = x[i1]; 439 tmp1 = x[i2]; 440 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 441 } 442 443 if (n == sz-1){ /* Take care of the last nonzero */ 444 tmp0 = x[*idx++]; 445 sum1 += *v1++ * tmp0; 446 } 447 y[row++]=sum1; 448 break; 449 case 2: 450 sum1 = 0.; 451 sum2 = 0.; 452 v2 = v1 + n; 453 454 for (n = 0; n< sz-1; n+=2) { 455 i1 = idx[0]; 456 i2 = idx[1]; 457 idx += 2; 458 tmp0 = x[i1]; 459 tmp1 = x[i2]; 460 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 461 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 462 } 463 if (n == sz-1){ 464 tmp0 = x[*idx++]; 465 sum1 += *v1++ * tmp0; 466 sum2 += *v2++ * tmp0; 467 } 468 y[row++]=sum1; 469 y[row++]=sum2; 470 v1 =v2; /* Since the next block to be processed starts there*/ 471 idx +=sz; 472 break; 473 case 3: 474 sum1 = 0.; 475 sum2 = 0.; 476 sum3 = 0.; 477 v2 = v1 + n; 478 v3 = v2 + n; 479 480 for (n = 0; n< sz-1; n+=2) { 481 i1 = idx[0]; 482 i2 = idx[1]; 483 idx += 2; 484 tmp0 = x[i1]; 485 tmp1 = x[i2]; 486 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 487 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 488 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 489 } 490 if (n == sz-1){ 491 tmp0 = x[*idx++]; 492 sum1 += *v1++ * tmp0; 493 sum2 += *v2++ * tmp0; 494 sum3 += *v3++ * tmp0; 495 } 496 y[row++]=sum1; 497 y[row++]=sum2; 498 y[row++]=sum3; 499 v1 =v3; /* Since the next block to be processed starts there*/ 500 idx +=2*sz; 501 break; 502 case 4: 503 sum1 = 0.; 504 sum2 = 0.; 505 sum3 = 0.; 506 sum4 = 0.; 507 v2 = v1 + n; 508 v3 = v2 + n; 509 v4 = v3 + n; 510 511 for (n = 0; n< sz-1; n+=2) { 512 i1 = idx[0]; 513 i2 = idx[1]; 514 idx += 2; 515 tmp0 = x[i1]; 516 tmp1 = x[i2]; 517 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 518 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 519 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 520 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 521 } 522 if (n == sz-1){ 523 tmp0 = x[*idx++]; 524 sum1 += *v1++ * tmp0; 525 sum2 += *v2++ * tmp0; 526 sum3 += *v3++ * tmp0; 527 sum4 += *v4++ * tmp0; 528 } 529 y[row++]=sum1; 530 y[row++]=sum2; 531 y[row++]=sum3; 532 y[row++]=sum4; 533 v1 =v4; /* Since the next block to be processed starts there*/ 534 idx +=3*sz; 535 break; 536 case 5: 537 sum1 = 0.; 538 sum2 = 0.; 539 sum3 = 0.; 540 sum4 = 0.; 541 sum5 = 0.; 542 v2 = v1 + n; 543 v3 = v2 + n; 544 v4 = v3 + n; 545 v5 = v4 + n; 546 547 for (n = 0; n<sz-1; n+=2) { 548 i1 = idx[0]; 549 i2 = idx[1]; 550 idx += 2; 551 tmp0 = x[i1]; 552 tmp1 = x[i2]; 553 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 554 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 555 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 556 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 557 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 558 } 559 if (n == sz-1){ 560 tmp0 = x[*idx++]; 561 sum1 += *v1++ * tmp0; 562 sum2 += *v2++ * tmp0; 563 sum3 += *v3++ * tmp0; 564 sum4 += *v4++ * tmp0; 565 sum5 += *v5++ * tmp0; 566 } 567 y[row++]=sum1; 568 y[row++]=sum2; 569 y[row++]=sum3; 570 y[row++]=sum4; 571 y[row++]=sum5; 572 v1 =v5; /* Since the next block to be processed starts there */ 573 idx +=4*sz; 574 break; 575 default : 576 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 577 } 578 } 579 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 580 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 581 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 582 PetscFunctionReturn(0); 583 } 584 /* ----------------------------------------------------------- */ 585 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 586 #undef __FUNCT__ 587 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode" 588 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 589 { 590 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 591 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 592 MatScalar *v1,*v2,*v3,*v4,*v5; 593 PetscScalar *x,*y,*z,*zt; 594 PetscErrorCode ierr; 595 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; 596 597 PetscFunctionBegin; 598 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 599 node_max = a->inode.node_count; 600 ns = a->inode.size; /* Node Size array */ 601 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 602 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 603 if (zz != yy) { 604 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 605 } else { 606 z = y; 607 } 608 zt = z; 609 610 idx = a->j; 611 v1 = a->a; 612 ii = a->i; 613 614 for (i = 0,row = 0; i< node_max; ++i){ 615 nsz = ns[i]; 616 n = ii[1] - ii[0]; 617 ii += nsz; 618 sz = n; /* No of non zeros in this row */ 619 /* Switch on the size of Node */ 620 switch (nsz){ /* Each loop in 'case' is unrolled */ 621 case 1 : 622 sum1 = *zt++; 623 624 for(n = 0; n< sz-1; n+=2) { 625 i1 = idx[0]; /* The instructions are ordered to */ 626 i2 = idx[1]; /* make the compiler's job easy */ 627 idx += 2; 628 tmp0 = x[i1]; 629 tmp1 = x[i2]; 630 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 631 } 632 633 if(n == sz-1){ /* Take care of the last nonzero */ 634 tmp0 = x[*idx++]; 635 sum1 += *v1++ * tmp0; 636 } 637 y[row++]=sum1; 638 break; 639 case 2: 640 sum1 = *zt++; 641 sum2 = *zt++; 642 v2 = v1 + n; 643 644 for(n = 0; n< sz-1; n+=2) { 645 i1 = idx[0]; 646 i2 = idx[1]; 647 idx += 2; 648 tmp0 = x[i1]; 649 tmp1 = x[i2]; 650 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 651 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 652 } 653 if(n == sz-1){ 654 tmp0 = x[*idx++]; 655 sum1 += *v1++ * tmp0; 656 sum2 += *v2++ * tmp0; 657 } 658 y[row++]=sum1; 659 y[row++]=sum2; 660 v1 =v2; /* Since the next block to be processed starts there*/ 661 idx +=sz; 662 break; 663 case 3: 664 sum1 = *zt++; 665 sum2 = *zt++; 666 sum3 = *zt++; 667 v2 = v1 + n; 668 v3 = v2 + n; 669 670 for (n = 0; n< sz-1; n+=2) { 671 i1 = idx[0]; 672 i2 = idx[1]; 673 idx += 2; 674 tmp0 = x[i1]; 675 tmp1 = x[i2]; 676 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 677 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 678 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 679 } 680 if (n == sz-1){ 681 tmp0 = x[*idx++]; 682 sum1 += *v1++ * tmp0; 683 sum2 += *v2++ * tmp0; 684 sum3 += *v3++ * tmp0; 685 } 686 y[row++]=sum1; 687 y[row++]=sum2; 688 y[row++]=sum3; 689 v1 =v3; /* Since the next block to be processed starts there*/ 690 idx +=2*sz; 691 break; 692 case 4: 693 sum1 = *zt++; 694 sum2 = *zt++; 695 sum3 = *zt++; 696 sum4 = *zt++; 697 v2 = v1 + n; 698 v3 = v2 + n; 699 v4 = v3 + n; 700 701 for (n = 0; n< sz-1; n+=2) { 702 i1 = idx[0]; 703 i2 = idx[1]; 704 idx += 2; 705 tmp0 = x[i1]; 706 tmp1 = x[i2]; 707 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 708 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 709 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 710 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 711 } 712 if (n == sz-1){ 713 tmp0 = x[*idx++]; 714 sum1 += *v1++ * tmp0; 715 sum2 += *v2++ * tmp0; 716 sum3 += *v3++ * tmp0; 717 sum4 += *v4++ * tmp0; 718 } 719 y[row++]=sum1; 720 y[row++]=sum2; 721 y[row++]=sum3; 722 y[row++]=sum4; 723 v1 =v4; /* Since the next block to be processed starts there*/ 724 idx +=3*sz; 725 break; 726 case 5: 727 sum1 = *zt++; 728 sum2 = *zt++; 729 sum3 = *zt++; 730 sum4 = *zt++; 731 sum5 = *zt++; 732 v2 = v1 + n; 733 v3 = v2 + n; 734 v4 = v3 + n; 735 v5 = v4 + n; 736 737 for (n = 0; n<sz-1; n+=2) { 738 i1 = idx[0]; 739 i2 = idx[1]; 740 idx += 2; 741 tmp0 = x[i1]; 742 tmp1 = x[i2]; 743 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 744 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 745 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 746 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 747 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 748 } 749 if(n == sz-1){ 750 tmp0 = x[*idx++]; 751 sum1 += *v1++ * tmp0; 752 sum2 += *v2++ * tmp0; 753 sum3 += *v3++ * tmp0; 754 sum4 += *v4++ * tmp0; 755 sum5 += *v5++ * tmp0; 756 } 757 y[row++]=sum1; 758 y[row++]=sum2; 759 y[row++]=sum3; 760 y[row++]=sum4; 761 y[row++]=sum5; 762 v1 =v5; /* Since the next block to be processed starts there */ 763 idx +=4*sz; 764 break; 765 default : 766 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 767 } 768 } 769 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 770 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 771 if (zz != yy) { 772 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 773 } 774 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 775 PetscFunctionReturn(0); 776 } 777 778 /* ----------------------------------------------------------- */ 779 #undef __FUNCT__ 780 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace" 781 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx) 782 { 783 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 784 IS iscol = a->col,isrow = a->row; 785 PetscErrorCode ierr; 786 const PetscInt *r,*c,*rout,*cout; 787 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 788 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 789 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 790 PetscScalar sum1,sum2,sum3,sum4,sum5; 791 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 792 const PetscScalar *b; 793 794 PetscFunctionBegin; 795 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 796 node_max = a->inode.node_count; 797 ns = a->inode.size; /* Node Size array */ 798 799 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 800 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 801 tmp = a->solve_work; 802 803 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 804 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 805 806 /* forward solve the lower triangular */ 807 tmps = tmp ; 808 aa = a_a ; 809 aj = a_j ; 810 ad = a->diag; 811 812 for (i = 0,row = 0; i< node_max; ++i){ 813 nsz = ns[i]; 814 aii = ai[row]; 815 v1 = aa + aii; 816 vi = aj + aii; 817 nz = ad[row]- aii; 818 if (i < node_max-1) { 819 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 820 * but our indexing to determine it's size could. */ 821 PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,0); /* indices */ 822 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 823 PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,0); 824 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 825 } 826 827 switch (nsz){ /* Each loop in 'case' is unrolled */ 828 case 1 : 829 sum1 = b[*r++]; 830 for(j=0; j<nz-1; j+=2){ 831 i0 = vi[0]; 832 i1 = vi[1]; 833 vi +=2; 834 tmp0 = tmps[i0]; 835 tmp1 = tmps[i1]; 836 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 837 } 838 if(j == nz-1){ 839 tmp0 = tmps[*vi++]; 840 sum1 -= *v1++ *tmp0; 841 } 842 tmp[row ++]=sum1; 843 break; 844 case 2: 845 sum1 = b[*r++]; 846 sum2 = b[*r++]; 847 v2 = aa + ai[row+1]; 848 849 for(j=0; j<nz-1; j+=2){ 850 i0 = vi[0]; 851 i1 = vi[1]; 852 vi +=2; 853 tmp0 = tmps[i0]; 854 tmp1 = tmps[i1]; 855 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 856 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 857 } 858 if(j == nz-1){ 859 tmp0 = tmps[*vi++]; 860 sum1 -= *v1++ *tmp0; 861 sum2 -= *v2++ *tmp0; 862 } 863 sum2 -= *v2++ * sum1; 864 tmp[row ++]=sum1; 865 tmp[row ++]=sum2; 866 break; 867 case 3: 868 sum1 = b[*r++]; 869 sum2 = b[*r++]; 870 sum3 = b[*r++]; 871 v2 = aa + ai[row+1]; 872 v3 = aa + ai[row+2]; 873 874 for (j=0; j<nz-1; j+=2){ 875 i0 = vi[0]; 876 i1 = vi[1]; 877 vi +=2; 878 tmp0 = tmps[i0]; 879 tmp1 = tmps[i1]; 880 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 881 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 882 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 883 } 884 if (j == nz-1){ 885 tmp0 = tmps[*vi++]; 886 sum1 -= *v1++ *tmp0; 887 sum2 -= *v2++ *tmp0; 888 sum3 -= *v3++ *tmp0; 889 } 890 sum2 -= *v2++ * sum1; 891 sum3 -= *v3++ * sum1; 892 sum3 -= *v3++ * sum2; 893 tmp[row ++]=sum1; 894 tmp[row ++]=sum2; 895 tmp[row ++]=sum3; 896 break; 897 898 case 4: 899 sum1 = b[*r++]; 900 sum2 = b[*r++]; 901 sum3 = b[*r++]; 902 sum4 = b[*r++]; 903 v2 = aa + ai[row+1]; 904 v3 = aa + ai[row+2]; 905 v4 = aa + ai[row+3]; 906 907 for (j=0; j<nz-1; j+=2){ 908 i0 = vi[0]; 909 i1 = vi[1]; 910 vi +=2; 911 tmp0 = tmps[i0]; 912 tmp1 = tmps[i1]; 913 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 914 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 915 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 916 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 917 } 918 if (j == nz-1){ 919 tmp0 = tmps[*vi++]; 920 sum1 -= *v1++ *tmp0; 921 sum2 -= *v2++ *tmp0; 922 sum3 -= *v3++ *tmp0; 923 sum4 -= *v4++ *tmp0; 924 } 925 sum2 -= *v2++ * sum1; 926 sum3 -= *v3++ * sum1; 927 sum4 -= *v4++ * sum1; 928 sum3 -= *v3++ * sum2; 929 sum4 -= *v4++ * sum2; 930 sum4 -= *v4++ * sum3; 931 932 tmp[row ++]=sum1; 933 tmp[row ++]=sum2; 934 tmp[row ++]=sum3; 935 tmp[row ++]=sum4; 936 break; 937 case 5: 938 sum1 = b[*r++]; 939 sum2 = b[*r++]; 940 sum3 = b[*r++]; 941 sum4 = b[*r++]; 942 sum5 = b[*r++]; 943 v2 = aa + ai[row+1]; 944 v3 = aa + ai[row+2]; 945 v4 = aa + ai[row+3]; 946 v5 = aa + ai[row+4]; 947 948 for (j=0; j<nz-1; j+=2){ 949 i0 = vi[0]; 950 i1 = vi[1]; 951 vi +=2; 952 tmp0 = tmps[i0]; 953 tmp1 = tmps[i1]; 954 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 955 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 956 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 957 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 958 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 959 } 960 if (j == nz-1){ 961 tmp0 = tmps[*vi++]; 962 sum1 -= *v1++ *tmp0; 963 sum2 -= *v2++ *tmp0; 964 sum3 -= *v3++ *tmp0; 965 sum4 -= *v4++ *tmp0; 966 sum5 -= *v5++ *tmp0; 967 } 968 969 sum2 -= *v2++ * sum1; 970 sum3 -= *v3++ * sum1; 971 sum4 -= *v4++ * sum1; 972 sum5 -= *v5++ * sum1; 973 sum3 -= *v3++ * sum2; 974 sum4 -= *v4++ * sum2; 975 sum5 -= *v5++ * sum2; 976 sum4 -= *v4++ * sum3; 977 sum5 -= *v5++ * sum3; 978 sum5 -= *v5++ * sum4; 979 980 tmp[row ++]=sum1; 981 tmp[row ++]=sum2; 982 tmp[row ++]=sum3; 983 tmp[row ++]=sum4; 984 tmp[row ++]=sum5; 985 break; 986 default: 987 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 988 } 989 } 990 /* backward solve the upper triangular */ 991 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 992 nsz = ns[i]; 993 aii = ai[row+1] -1; 994 v1 = aa + aii; 995 vi = aj + aii; 996 nz = aii- ad[row]; 997 switch (nsz){ /* Each loop in 'case' is unrolled */ 998 case 1 : 999 sum1 = tmp[row]; 1000 1001 for(j=nz ; j>1; j-=2){ 1002 vi -=2; 1003 i0 = vi[2]; 1004 i1 = vi[1]; 1005 tmp0 = tmps[i0]; 1006 tmp1 = tmps[i1]; 1007 v1 -= 2; 1008 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1009 } 1010 if (j==1){ 1011 tmp0 = tmps[*vi--]; 1012 sum1 -= *v1-- * tmp0; 1013 } 1014 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1015 break; 1016 case 2 : 1017 sum1 = tmp[row]; 1018 sum2 = tmp[row -1]; 1019 v2 = aa + ai[row]-1; 1020 for (j=nz ; j>1; j-=2){ 1021 vi -=2; 1022 i0 = vi[2]; 1023 i1 = vi[1]; 1024 tmp0 = tmps[i0]; 1025 tmp1 = tmps[i1]; 1026 v1 -= 2; 1027 v2 -= 2; 1028 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1029 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1030 } 1031 if (j==1){ 1032 tmp0 = tmps[*vi--]; 1033 sum1 -= *v1-- * tmp0; 1034 sum2 -= *v2-- * tmp0; 1035 } 1036 1037 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1038 sum2 -= *v2-- * tmp0; 1039 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1040 break; 1041 case 3 : 1042 sum1 = tmp[row]; 1043 sum2 = tmp[row -1]; 1044 sum3 = tmp[row -2]; 1045 v2 = aa + ai[row]-1; 1046 v3 = aa + ai[row -1]-1; 1047 for (j=nz ; j>1; j-=2){ 1048 vi -=2; 1049 i0 = vi[2]; 1050 i1 = vi[1]; 1051 tmp0 = tmps[i0]; 1052 tmp1 = tmps[i1]; 1053 v1 -= 2; 1054 v2 -= 2; 1055 v3 -= 2; 1056 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1057 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1058 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1059 } 1060 if (j==1){ 1061 tmp0 = tmps[*vi--]; 1062 sum1 -= *v1-- * tmp0; 1063 sum2 -= *v2-- * tmp0; 1064 sum3 -= *v3-- * tmp0; 1065 } 1066 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1067 sum2 -= *v2-- * tmp0; 1068 sum3 -= *v3-- * tmp0; 1069 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1070 sum3 -= *v3-- * tmp0; 1071 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1072 1073 break; 1074 case 4 : 1075 sum1 = tmp[row]; 1076 sum2 = tmp[row -1]; 1077 sum3 = tmp[row -2]; 1078 sum4 = tmp[row -3]; 1079 v2 = aa + ai[row]-1; 1080 v3 = aa + ai[row -1]-1; 1081 v4 = aa + ai[row -2]-1; 1082 1083 for (j=nz ; j>1; j-=2){ 1084 vi -=2; 1085 i0 = vi[2]; 1086 i1 = vi[1]; 1087 tmp0 = tmps[i0]; 1088 tmp1 = tmps[i1]; 1089 v1 -= 2; 1090 v2 -= 2; 1091 v3 -= 2; 1092 v4 -= 2; 1093 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1094 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1095 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1096 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1097 } 1098 if (j==1){ 1099 tmp0 = tmps[*vi--]; 1100 sum1 -= *v1-- * tmp0; 1101 sum2 -= *v2-- * tmp0; 1102 sum3 -= *v3-- * tmp0; 1103 sum4 -= *v4-- * tmp0; 1104 } 1105 1106 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1107 sum2 -= *v2-- * tmp0; 1108 sum3 -= *v3-- * tmp0; 1109 sum4 -= *v4-- * tmp0; 1110 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1111 sum3 -= *v3-- * tmp0; 1112 sum4 -= *v4-- * tmp0; 1113 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1114 sum4 -= *v4-- * tmp0; 1115 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1116 break; 1117 case 5 : 1118 sum1 = tmp[row]; 1119 sum2 = tmp[row -1]; 1120 sum3 = tmp[row -2]; 1121 sum4 = tmp[row -3]; 1122 sum5 = tmp[row -4]; 1123 v2 = aa + ai[row]-1; 1124 v3 = aa + ai[row -1]-1; 1125 v4 = aa + ai[row -2]-1; 1126 v5 = aa + ai[row -3]-1; 1127 for (j=nz ; j>1; j-=2){ 1128 vi -= 2; 1129 i0 = vi[2]; 1130 i1 = vi[1]; 1131 tmp0 = tmps[i0]; 1132 tmp1 = tmps[i1]; 1133 v1 -= 2; 1134 v2 -= 2; 1135 v3 -= 2; 1136 v4 -= 2; 1137 v5 -= 2; 1138 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1139 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1140 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1141 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1142 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1143 } 1144 if (j==1){ 1145 tmp0 = tmps[*vi--]; 1146 sum1 -= *v1-- * tmp0; 1147 sum2 -= *v2-- * tmp0; 1148 sum3 -= *v3-- * tmp0; 1149 sum4 -= *v4-- * tmp0; 1150 sum5 -= *v5-- * tmp0; 1151 } 1152 1153 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1154 sum2 -= *v2-- * tmp0; 1155 sum3 -= *v3-- * tmp0; 1156 sum4 -= *v4-- * tmp0; 1157 sum5 -= *v5-- * tmp0; 1158 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1159 sum3 -= *v3-- * tmp0; 1160 sum4 -= *v4-- * tmp0; 1161 sum5 -= *v5-- * tmp0; 1162 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1163 sum4 -= *v4-- * tmp0; 1164 sum5 -= *v5-- * tmp0; 1165 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1166 sum5 -= *v5-- * tmp0; 1167 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1168 break; 1169 default: 1170 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1171 } 1172 } 1173 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1174 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1175 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1176 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1177 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1178 PetscFunctionReturn(0); 1179 } 1180 1181 #undef __FUNCT__ 1182 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode" 1183 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info) 1184 { 1185 Mat C=B; 1186 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ *)C->data; 1187 IS isrow = b->row,isicol = b->icol; 1188 PetscErrorCode ierr; 1189 const PetscInt *r,*ic,*ics; 1190 const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag; 1191 PetscInt i,j,k,nz,nzL,row,*pj; 1192 const PetscInt *ajtmp,*bjtmp; 1193 MatScalar *pc,*pc1,*pc2,*pc3,mul1,mul2,mul3,*pv,*rtmp1,*rtmp2,*rtmp3; 1194 const MatScalar *aa=a->a,*v,*v1,*v2,*v3; 1195 FactorShiftCtx sctx; 1196 const PetscInt *ddiag; 1197 PetscReal rs; 1198 MatScalar d; 1199 PetscInt inod,nodesz,node_max,col; 1200 const PetscInt *ns; 1201 PetscInt *tmp_vec1,*tmp_vec2,*nsmap; 1202 1203 PetscFunctionBegin; 1204 /* MatPivotSetUp(): initialize shift context sctx */ 1205 ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr); 1206 1207 if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1208 ddiag = a->diag; 1209 sctx.shift_top = info->zeropivot; 1210 for (i=0; i<n; i++) { 1211 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1212 d = (aa)[ddiag[i]]; 1213 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1214 v = aa+ai[i]; 1215 nz = ai[i+1] - ai[i]; 1216 for (j=0; j<nz; j++) 1217 rs += PetscAbsScalar(v[j]); 1218 if (rs>sctx.shift_top) sctx.shift_top = rs; 1219 } 1220 sctx.shift_top *= 1.1; 1221 sctx.nshift_max = 5; 1222 sctx.shift_lo = 0.; 1223 sctx.shift_hi = 1.; 1224 } 1225 1226 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1227 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1228 1229 ierr = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr); 1230 ierr = PetscMemzero(rtmp1,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1231 rtmp2 = rtmp1 + n; 1232 rtmp3 = rtmp2 + n; 1233 ics = ic; 1234 1235 node_max = a->inode.node_count; 1236 ns = a->inode.size; 1237 if (!ns){ 1238 SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information"); 1239 } 1240 1241 /* If max inode size > 3, split it into two inodes.*/ 1242 /* also map the inode sizes according to the ordering */ 1243 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1244 for (i=0,j=0; i<node_max; ++i,++j){ 1245 if (ns[i]>3) { 1246 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1247 ++j; 1248 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1249 } else { 1250 tmp_vec1[j] = ns[i]; 1251 } 1252 } 1253 /* Use the correct node_max */ 1254 node_max = j; 1255 1256 /* Now reorder the inode info based on mat re-ordering info */ 1257 /* First create a row -> inode_size_array_index map */ 1258 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1259 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1260 for (i=0,row=0; i<node_max; i++) { 1261 nodesz = tmp_vec1[i]; 1262 for (j=0; j<nodesz; j++,row++) { 1263 nsmap[row] = i; 1264 } 1265 } 1266 /* Using nsmap, create a reordered ns structure */ 1267 for (i=0,j=0; i< node_max; i++) { 1268 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1269 tmp_vec2[i] = nodesz; 1270 j += nodesz; 1271 } 1272 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1273 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1274 1275 /* Now use the correct ns */ 1276 ns = tmp_vec2; 1277 1278 do { 1279 sctx.useshift = PETSC_FALSE; 1280 /* Now loop over each block-row, and do the factorization */ 1281 for (inod=0,i=0; inod<node_max; inod++){ /* i: row index; inod: inode index */ 1282 nodesz = ns[inod]; 1283 1284 switch (nodesz){ 1285 case 1: 1286 /*----------*/ 1287 /* zero rtmp1 */ 1288 /* L part */ 1289 nz = bi[i+1] - bi[i]; 1290 bjtmp = bj + bi[i]; 1291 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1292 1293 /* U part */ 1294 nz = bdiag[i]-bdiag[i+1]; 1295 bjtmp = bj + bdiag[i+1]+1; 1296 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1297 1298 /* load in initial (unfactored row) */ 1299 nz = ai[r[i]+1] - ai[r[i]]; 1300 ajtmp = aj + ai[r[i]]; 1301 v = aa + ai[r[i]]; 1302 for (j=0; j<nz; j++) { 1303 rtmp1[ics[ajtmp[j]]] = v[j]; 1304 } 1305 /* ZeropivotApply() */ 1306 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1307 1308 /* elimination */ 1309 bjtmp = bj + bi[i]; 1310 row = *bjtmp++; 1311 nzL = bi[i+1] - bi[i]; 1312 for(k=0; k < nzL;k++) { 1313 pc = rtmp1 + row; 1314 if (*pc != 0.0) { 1315 pv = b->a + bdiag[row]; 1316 mul1 = *pc * (*pv); 1317 *pc = mul1; 1318 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1319 pv = b->a + bdiag[row+1]+1; 1320 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1321 for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1322 ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr); 1323 } 1324 row = *bjtmp++; 1325 } 1326 1327 /* finished row so stick it into b->a */ 1328 rs = 0.0; 1329 /* L part */ 1330 pv = b->a + bi[i] ; 1331 pj = b->j + bi[i] ; 1332 nz = bi[i+1] - bi[i]; 1333 for (j=0; j<nz; j++) { 1334 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1335 } 1336 1337 /* U part */ 1338 pv = b->a + bdiag[i+1]+1; 1339 pj = b->j + bdiag[i+1]+1; 1340 nz = bdiag[i] - bdiag[i+1]-1; 1341 for (j=0; j<nz; j++) { 1342 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1343 } 1344 1345 /* Check zero pivot */ 1346 sctx.rs = rs; 1347 sctx.pv = rtmp1[i]; 1348 ierr = MatPivotCheck(info,sctx,i);CHKERRQ(ierr); 1349 1350 /* Mark diagonal and invert diagonal for simplier triangular solves */ 1351 pv = b->a + bdiag[i]; 1352 *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1353 break; 1354 1355 case 2: 1356 /*----------*/ 1357 /* zero rtmp1 and rtmp2 */ 1358 /* L part */ 1359 nz = bi[i+1] - bi[i]; 1360 bjtmp = bj + bi[i]; 1361 for (j=0; j<nz; j++) { 1362 col = bjtmp[j]; 1363 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1364 } 1365 1366 /* U part */ 1367 nz = bdiag[i]-bdiag[i+1]; 1368 bjtmp = bj + bdiag[i+1]+1; 1369 for (j=0; j<nz; j++) { 1370 col = bjtmp[j]; 1371 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1372 } 1373 1374 /* load in initial (unfactored row) */ 1375 nz = ai[r[i]+1] - ai[r[i]]; 1376 ajtmp = aj + ai[r[i]]; 1377 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; 1378 for (j=0; j<nz; j++) { 1379 col = ics[ajtmp[j]]; 1380 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; 1381 } 1382 /* ZeropivotApply(): shift the diagonal of the matrix */ 1383 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; 1384 1385 /* elimination */ 1386 bjtmp = bj + bi[i]; 1387 row = *bjtmp++; /* pivot row */ 1388 nzL = bi[i+1] - bi[i]; 1389 for(k=0; k < nzL;k++) { 1390 pc1 = rtmp1 + row; 1391 pc2 = rtmp2 + row; 1392 if (*pc1 != 0.0 || *pc2 != 0.0) { 1393 pv = b->a + bdiag[row]; 1394 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); 1395 *pc1 = mul1; *pc2 = mul2; 1396 1397 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1398 pv = b->a + bdiag[row+1]+1; 1399 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1400 for (j=0; j<nz; j++){ 1401 col = pj[j]; 1402 rtmp1[col] -= mul1 * pv[j]; 1403 rtmp2[col] -= mul2 * pv[j]; 1404 } 1405 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1406 } 1407 row = *bjtmp++; 1408 } 1409 1410 /* finished row i; check zero pivot, then stick row i into b->a */ 1411 rs = 0.0; 1412 /* L part */ 1413 pc1 = b->a + bi[i]; 1414 pj = b->j + bi[i] ; 1415 nz = bi[i+1] - bi[i]; 1416 for (j=0; j<nz; j++) { 1417 col = pj[j]; 1418 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1419 } 1420 /* U part */ 1421 pc1 = b->a + bdiag[i+1]+1; 1422 pj = b->j + bdiag[i+1]+1; 1423 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1424 for (j=0; j<nz; j++) { 1425 col = pj[j]; 1426 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1427 } 1428 1429 sctx.rs = rs; 1430 sctx.pv = rtmp1[i]; 1431 ierr = MatPivotCheck(info,sctx,i);CHKERRQ(ierr); 1432 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1433 *pc1 = 1.0/sctx.pv; 1434 1435 /* Now take care of diagonal 2x2 block. */ 1436 pc2 = rtmp2 + i; 1437 if (*pc2 != 0.0){ 1438 mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */ 1439 *pc2 = mul1; /* insert L entry */ 1440 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1441 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1442 for (j=0; j<nz; j++) { 1443 col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col]; 1444 } 1445 ierr = PetscLogFlops(2*nz);CHKERRQ(ierr); 1446 } 1447 1448 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1449 rs = 0.0; 1450 /* L part */ 1451 pc2 = b->a + bi[i+1]; 1452 pj = b->j + bi[i+1] ; 1453 nz = bi[i+2] - bi[i+1]; 1454 for (j=0; j<nz; j++) { 1455 col = pj[j]; 1456 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1457 } 1458 /* U part */ 1459 pc2 = b->a + bdiag[i+2]+1; 1460 pj = b->j + bdiag[i+2]+1; 1461 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1462 for (j=0; j<nz; j++) { 1463 col = pj[j]; 1464 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1465 } 1466 1467 sctx.rs = rs; 1468 sctx.pv = rtmp2[i+1]; 1469 ierr = MatPivotCheck(info,sctx,i+1);CHKERRQ(ierr); 1470 pc2 = b->a + bdiag[i+1]; 1471 *pc2 = 1.0/sctx.pv; 1472 break; 1473 1474 case 3: 1475 /*----------*/ 1476 /* zero rtmp */ 1477 /* L part */ 1478 nz = bi[i+1] - bi[i]; 1479 bjtmp = bj + bi[i]; 1480 for (j=0; j<nz; j++) { 1481 col = bjtmp[j]; 1482 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1483 } 1484 1485 /* U part */ 1486 nz = bdiag[i]-bdiag[i+1]; 1487 bjtmp = bj + bdiag[i+1]+1; 1488 for (j=0; j<nz; j++) { 1489 col = bjtmp[j]; 1490 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1491 } 1492 1493 /* load in initial (unfactored row) */ 1494 nz = ai[r[i]+1] - ai[r[i]]; 1495 ajtmp = aj + ai[r[i]]; 1496 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; 1497 for (j=0; j<nz; j++) { 1498 col = ics[ajtmp[j]]; 1499 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; 1500 } 1501 /* ZeropivotApply(): shift the diagonal of the matrix */ 1502 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; 1503 1504 /* elimination */ 1505 bjtmp = bj + bi[i]; 1506 row = *bjtmp++; /* pivot row */ 1507 nzL = bi[i+1] - bi[i]; 1508 for(k=0; k < nzL;k++) { 1509 pc1 = rtmp1 + row; 1510 pc2 = rtmp2 + row; 1511 pc3 = rtmp3 + row; 1512 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1513 pv = b->a + bdiag[row]; 1514 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); 1515 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; 1516 1517 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1518 pv = b->a + bdiag[row+1]+1; 1519 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1520 for (j=0; j<nz; j++){ 1521 col = pj[j]; 1522 rtmp1[col] -= mul1 * pv[j]; 1523 rtmp2[col] -= mul2 * pv[j]; 1524 rtmp3[col] -= mul3 * pv[j]; 1525 } 1526 ierr = PetscLogFlops(6*nz);CHKERRQ(ierr); 1527 } 1528 row = *bjtmp++; 1529 } 1530 1531 /* finished row i; check zero pivot, then stick row i into b->a */ 1532 rs = 0.0; 1533 /* L part */ 1534 pc1 = b->a + bi[i]; 1535 pj = b->j + bi[i] ; 1536 nz = bi[i+1] - bi[i]; 1537 for (j=0; j<nz; j++) { 1538 col = pj[j]; 1539 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1540 } 1541 /* U part */ 1542 pc1 = b->a + bdiag[i+1]+1; 1543 pj = b->j + bdiag[i+1]+1; 1544 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1545 for (j=0; j<nz; j++) { 1546 col = pj[j]; 1547 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1548 } 1549 1550 sctx.rs = rs; 1551 sctx.pv = rtmp1[i]; 1552 ierr = MatPivotCheck(info,sctx,i);CHKERRQ(ierr); 1553 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1554 *pc1 = 1.0/sctx.pv; 1555 1556 /* Now take care of 1st column of diagonal 3x3 block. */ 1557 pc2 = rtmp2 + i; 1558 pc3 = rtmp3 + i; 1559 if (*pc2 != 0.0 || *pc3 != 0.0){ 1560 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1561 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1562 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1563 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1564 for (j=0; j<nz; j++) { 1565 col = pj[j]; 1566 rtmp2[col] -= mul2 * rtmp1[col]; 1567 rtmp3[col] -= mul3 * rtmp1[col]; 1568 } 1569 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1570 } 1571 1572 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1573 rs = 0.0; 1574 /* L part */ 1575 pc2 = b->a + bi[i+1]; 1576 pj = b->j + bi[i+1] ; 1577 nz = bi[i+2] - bi[i+1]; 1578 for (j=0; j<nz; j++) { 1579 col = pj[j]; 1580 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1581 } 1582 /* U part */ 1583 pc2 = b->a + bdiag[i+2]+1; 1584 pj = b->j + bdiag[i+2]+1; 1585 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1586 for (j=0; j<nz; j++) { 1587 col = pj[j]; 1588 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1589 } 1590 1591 sctx.rs = rs; 1592 sctx.pv = rtmp2[i+1]; 1593 ierr = MatPivotCheck(info,sctx,i+1);CHKERRQ(ierr); 1594 pc2 = b->a + bdiag[i+1]; 1595 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1596 1597 /* Now take care of 2nd column of diagonal 3x3 block. */ 1598 pc3 = rtmp3 + i+1; 1599 if (*pc3 != 0.0){ 1600 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1601 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1602 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1603 for (j=0; j<nz; j++) { 1604 col = pj[j]; 1605 rtmp3[col] -= mul3 * rtmp2[col]; 1606 } 1607 ierr = PetscLogFlops(2*nz);CHKERRQ(ierr); 1608 } 1609 1610 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1611 rs = 0.0; 1612 /* L part */ 1613 pc3 = b->a + bi[i+2]; 1614 pj = b->j + bi[i+2] ; 1615 nz = bi[i+3] - bi[i+2]; 1616 for (j=0; j<nz; j++) { 1617 col = pj[j]; 1618 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1619 } 1620 /* U part */ 1621 pc3 = b->a + bdiag[i+3]+1; 1622 pj = b->j + bdiag[i+3]+1; 1623 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1624 for (j=0; j<nz; j++) { 1625 col = pj[j]; 1626 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1627 } 1628 1629 sctx.rs = rs; 1630 sctx.pv = rtmp3[i+2]; 1631 ierr = MatPivotCheck(info,sctx,i+2);CHKERRQ(ierr); 1632 pc3 = b->a + bdiag[i+2]; 1633 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1634 break; 1635 1636 default: 1637 SETERRQ(PETSC_ERR_SUP,"Node size not yet supported \n"); 1638 } 1639 i += nodesz; /* Update the row */ 1640 } 1641 1642 /* MatPivotRefine() */ 1643 if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE && !sctx.useshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max){ 1644 /* 1645 * if no shift in this attempt & shifting & started shifting & can refine, 1646 * then try lower shift 1647 */ 1648 sctx.shift_hi = sctx.shift_fraction; 1649 sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.; 1650 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1651 sctx.useshift = PETSC_TRUE; 1652 sctx.nshift++; 1653 } 1654 } while (sctx.useshift); 1655 1656 ierr = PetscFree(rtmp1);CHKERRQ(ierr); 1657 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1658 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1659 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1660 1661 C->ops->solve = MatSolve_SeqAIJ_Inode; 1662 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1663 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1664 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1665 C->ops->matsolve = MatMatSolve_SeqAIJ; 1666 C->assembled = PETSC_TRUE; 1667 C->preallocated = PETSC_TRUE; 1668 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 1669 1670 /* MatShiftView(A,info,&sctx) */ 1671 if (sctx.nshift){ 1672 if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) { 1673 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 1674 } else if (info->shifttype == MAT_SHIFT_NONZERO) { 1675 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 1676 } else if (info->shifttype == MAT_SHIFT_INBLOCKS){ 1677 ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr); 1678 } 1679 } 1680 PetscFunctionReturn(0); 1681 } 1682 1683 #undef __FUNCT__ 1684 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace" 1685 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info) 1686 { 1687 Mat C = B; 1688 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1689 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1690 PetscErrorCode ierr; 1691 const PetscInt *r,*ic,*c,*ics; 1692 PetscInt n = A->rmap->n,*bi = b->i; 1693 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1694 PetscInt i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz; 1695 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1696 PetscScalar mul1,mul2,mul3,tmp; 1697 MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; 1698 const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; 1699 PetscReal rs=0.0; 1700 FactorShiftCtx sctx; 1701 PetscInt newshift; 1702 1703 PetscFunctionBegin; 1704 sctx.shift_top = 0; 1705 sctx.nshift_max = 0; 1706 sctx.shift_lo = 0; 1707 sctx.shift_hi = 0; 1708 sctx.shift_fraction = 0; 1709 1710 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1711 if (info->shifttype==MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1712 sctx.shift_top = 0; 1713 for (i=0; i<n; i++) { 1714 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1715 rs = 0.0; 1716 ajtmp = aj + ai[i]; 1717 rtmp1 = aa + ai[i]; 1718 nz = ai[i+1] - ai[i]; 1719 for (j=0; j<nz; j++){ 1720 if (*ajtmp != i){ 1721 rs += PetscAbsScalar(*rtmp1++); 1722 } else { 1723 rs -= PetscRealPart(*rtmp1++); 1724 } 1725 ajtmp++; 1726 } 1727 if (rs>sctx.shift_top) sctx.shift_top = rs; 1728 } 1729 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1730 sctx.shift_top *= 1.1; 1731 sctx.nshift_max = 5; 1732 sctx.shift_lo = 0.; 1733 sctx.shift_hi = 1.; 1734 } 1735 sctx.shift_amount = 0; 1736 sctx.nshift = 0; 1737 1738 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1739 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1740 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1741 ierr = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr); 1742 ierr = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1743 ics = ic ; 1744 rtmp22 = rtmp11 + n; 1745 rtmp33 = rtmp22 + n; 1746 1747 node_max = a->inode.node_count; 1748 ns = a->inode.size; 1749 if (!ns){ 1750 SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information"); 1751 } 1752 1753 /* If max inode size > 3, split it into two inodes.*/ 1754 /* also map the inode sizes according to the ordering */ 1755 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1756 for (i=0,j=0; i<node_max; ++i,++j){ 1757 if (ns[i]>3) { 1758 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1759 ++j; 1760 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1761 } else { 1762 tmp_vec1[j] = ns[i]; 1763 } 1764 } 1765 /* Use the correct node_max */ 1766 node_max = j; 1767 1768 /* Now reorder the inode info based on mat re-ordering info */ 1769 /* First create a row -> inode_size_array_index map */ 1770 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1771 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1772 for (i=0,row=0; i<node_max; i++) { 1773 nodesz = tmp_vec1[i]; 1774 for (j=0; j<nodesz; j++,row++) { 1775 nsmap[row] = i; 1776 } 1777 } 1778 /* Using nsmap, create a reordered ns structure */ 1779 for (i=0,j=0; i< node_max; i++) { 1780 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1781 tmp_vec2[i] = nodesz; 1782 j += nodesz; 1783 } 1784 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1785 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1786 /* Now use the correct ns */ 1787 ns = tmp_vec2; 1788 1789 do { 1790 sctx.useshift = PETSC_FALSE; 1791 /* Now loop over each block-row, and do the factorization */ 1792 for (i=0,row=0; i<node_max; i++) { 1793 nodesz = ns[i]; 1794 nz = bi[row+1] - bi[row]; 1795 bjtmp = bj + bi[row]; 1796 1797 switch (nodesz){ 1798 case 1: 1799 for (j=0; j<nz; j++){ 1800 idx = bjtmp[j]; 1801 rtmp11[idx] = 0.0; 1802 } 1803 1804 /* load in initial (unfactored row) */ 1805 idx = r[row]; 1806 nz_tmp = ai[idx+1] - ai[idx]; 1807 ajtmp = aj + ai[idx]; 1808 v1 = aa + ai[idx]; 1809 1810 for (j=0; j<nz_tmp; j++) { 1811 idx = ics[ajtmp[j]]; 1812 rtmp11[idx] = v1[j]; 1813 } 1814 rtmp11[ics[r[row]]] += sctx.shift_amount; 1815 1816 prow = *bjtmp++ ; 1817 while (prow < row) { 1818 pc1 = rtmp11 + prow; 1819 if (*pc1 != 0.0){ 1820 pv = ba + bd[prow]; 1821 pj = nbj + bd[prow]; 1822 mul1 = *pc1 * *pv++; 1823 *pc1 = mul1; 1824 nz_tmp = bi[prow+1] - bd[prow] - 1; 1825 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 1826 for (j=0; j<nz_tmp; j++) { 1827 tmp = pv[j]; 1828 idx = pj[j]; 1829 rtmp11[idx] -= mul1 * tmp; 1830 } 1831 } 1832 prow = *bjtmp++ ; 1833 } 1834 pj = bj + bi[row]; 1835 pc1 = ba + bi[row]; 1836 1837 sctx.pv = rtmp11[row]; 1838 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 1839 rs = 0.0; 1840 for (j=0; j<nz; j++) { 1841 idx = pj[j]; 1842 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 1843 if (idx != row) rs += PetscAbsScalar(pc1[j]); 1844 } 1845 sctx.rs = rs; 1846 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1847 if (newshift == 1) goto endofwhile; 1848 break; 1849 1850 case 2: 1851 for (j=0; j<nz; j++) { 1852 idx = bjtmp[j]; 1853 rtmp11[idx] = 0.0; 1854 rtmp22[idx] = 0.0; 1855 } 1856 1857 /* load in initial (unfactored row) */ 1858 idx = r[row]; 1859 nz_tmp = ai[idx+1] - ai[idx]; 1860 ajtmp = aj + ai[idx]; 1861 v1 = aa + ai[idx]; 1862 v2 = aa + ai[idx+1]; 1863 for (j=0; j<nz_tmp; j++) { 1864 idx = ics[ajtmp[j]]; 1865 rtmp11[idx] = v1[j]; 1866 rtmp22[idx] = v2[j]; 1867 } 1868 rtmp11[ics[r[row]]] += sctx.shift_amount; 1869 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 1870 1871 prow = *bjtmp++ ; 1872 while (prow < row) { 1873 pc1 = rtmp11 + prow; 1874 pc2 = rtmp22 + prow; 1875 if (*pc1 != 0.0 || *pc2 != 0.0){ 1876 pv = ba + bd[prow]; 1877 pj = nbj + bd[prow]; 1878 mul1 = *pc1 * *pv; 1879 mul2 = *pc2 * *pv; 1880 ++pv; 1881 *pc1 = mul1; 1882 *pc2 = mul2; 1883 1884 nz_tmp = bi[prow+1] - bd[prow] - 1; 1885 for (j=0; j<nz_tmp; j++) { 1886 tmp = pv[j]; 1887 idx = pj[j]; 1888 rtmp11[idx] -= mul1 * tmp; 1889 rtmp22[idx] -= mul2 * tmp; 1890 } 1891 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 1892 } 1893 prow = *bjtmp++ ; 1894 } 1895 1896 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 1897 pc1 = rtmp11 + prow; 1898 pc2 = rtmp22 + prow; 1899 1900 sctx.pv = *pc1; 1901 pj = bj + bi[prow]; 1902 rs = 0.0; 1903 for (j=0; j<nz; j++){ 1904 idx = pj[j]; 1905 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 1906 } 1907 sctx.rs = rs; 1908 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1909 if (newshift == 1) goto endofwhile; 1910 1911 if (*pc2 != 0.0){ 1912 pj = nbj + bd[prow]; 1913 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 1914 *pc2 = mul2; 1915 nz_tmp = bi[prow+1] - bd[prow] - 1; 1916 for (j=0; j<nz_tmp; j++) { 1917 idx = pj[j] ; 1918 tmp = rtmp11[idx]; 1919 rtmp22[idx] -= mul2 * tmp; 1920 } 1921 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 1922 } 1923 1924 pj = bj + bi[row]; 1925 pc1 = ba + bi[row]; 1926 pc2 = ba + bi[row+1]; 1927 1928 sctx.pv = rtmp22[row+1]; 1929 rs = 0.0; 1930 rtmp11[row] = 1.0/rtmp11[row]; 1931 rtmp22[row+1] = 1.0/rtmp22[row+1]; 1932 /* copy row entries from dense representation to sparse */ 1933 for (j=0; j<nz; j++) { 1934 idx = pj[j]; 1935 pc1[j] = rtmp11[idx]; 1936 pc2[j] = rtmp22[idx]; 1937 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 1938 } 1939 sctx.rs = rs; 1940 ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr); 1941 if (newshift == 1) goto endofwhile; 1942 break; 1943 1944 case 3: 1945 for (j=0; j<nz; j++) { 1946 idx = bjtmp[j]; 1947 rtmp11[idx] = 0.0; 1948 rtmp22[idx] = 0.0; 1949 rtmp33[idx] = 0.0; 1950 } 1951 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 1952 idx = r[row]; 1953 nz_tmp = ai[idx+1] - ai[idx]; 1954 ajtmp = aj + ai[idx]; 1955 v1 = aa + ai[idx]; 1956 v2 = aa + ai[idx+1]; 1957 v3 = aa + ai[idx+2]; 1958 for (j=0; j<nz_tmp; j++) { 1959 idx = ics[ajtmp[j]]; 1960 rtmp11[idx] = v1[j]; 1961 rtmp22[idx] = v2[j]; 1962 rtmp33[idx] = v3[j]; 1963 } 1964 rtmp11[ics[r[row]]] += sctx.shift_amount; 1965 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 1966 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 1967 1968 /* loop over all pivot row blocks above this row block */ 1969 prow = *bjtmp++ ; 1970 while (prow < row) { 1971 pc1 = rtmp11 + prow; 1972 pc2 = rtmp22 + prow; 1973 pc3 = rtmp33 + prow; 1974 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){ 1975 pv = ba + bd[prow]; 1976 pj = nbj + bd[prow]; 1977 mul1 = *pc1 * *pv; 1978 mul2 = *pc2 * *pv; 1979 mul3 = *pc3 * *pv; 1980 ++pv; 1981 *pc1 = mul1; 1982 *pc2 = mul2; 1983 *pc3 = mul3; 1984 1985 nz_tmp = bi[prow+1] - bd[prow] - 1; 1986 /* update this row based on pivot row */ 1987 for (j=0; j<nz_tmp; j++) { 1988 tmp = pv[j]; 1989 idx = pj[j]; 1990 rtmp11[idx] -= mul1 * tmp; 1991 rtmp22[idx] -= mul2 * tmp; 1992 rtmp33[idx] -= mul3 * tmp; 1993 } 1994 ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr); 1995 } 1996 prow = *bjtmp++ ; 1997 } 1998 1999 /* Now take care of diagonal 3x3 block in this set of rows */ 2000 /* note: prow = row here */ 2001 pc1 = rtmp11 + prow; 2002 pc2 = rtmp22 + prow; 2003 pc3 = rtmp33 + prow; 2004 2005 sctx.pv = *pc1; 2006 pj = bj + bi[prow]; 2007 rs = 0.0; 2008 for (j=0; j<nz; j++){ 2009 idx = pj[j]; 2010 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2011 } 2012 sctx.rs = rs; 2013 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 2014 if (newshift == 1) goto endofwhile; 2015 2016 if (*pc2 != 0.0 || *pc3 != 0.0){ 2017 mul2 = (*pc2)/(*pc1); 2018 mul3 = (*pc3)/(*pc1); 2019 *pc2 = mul2; 2020 *pc3 = mul3; 2021 nz_tmp = bi[prow+1] - bd[prow] - 1; 2022 pj = nbj + bd[prow]; 2023 for (j=0; j<nz_tmp; j++) { 2024 idx = pj[j] ; 2025 tmp = rtmp11[idx]; 2026 rtmp22[idx] -= mul2 * tmp; 2027 rtmp33[idx] -= mul3 * tmp; 2028 } 2029 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 2030 } 2031 ++prow; 2032 2033 pc2 = rtmp22 + prow; 2034 pc3 = rtmp33 + prow; 2035 sctx.pv = *pc2; 2036 pj = bj + bi[prow]; 2037 rs = 0.0; 2038 for (j=0; j<nz; j++){ 2039 idx = pj[j]; 2040 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2041 } 2042 sctx.rs = rs; 2043 ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr); 2044 if (newshift == 1) goto endofwhile; 2045 2046 if (*pc3 != 0.0){ 2047 mul3 = (*pc3)/(*pc2); 2048 *pc3 = mul3; 2049 pj = nbj + bd[prow]; 2050 nz_tmp = bi[prow+1] - bd[prow] - 1; 2051 for (j=0; j<nz_tmp; j++) { 2052 idx = pj[j] ; 2053 tmp = rtmp22[idx]; 2054 rtmp33[idx] -= mul3 * tmp; 2055 } 2056 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 2057 } 2058 2059 pj = bj + bi[row]; 2060 pc1 = ba + bi[row]; 2061 pc2 = ba + bi[row+1]; 2062 pc3 = ba + bi[row+2]; 2063 2064 sctx.pv = rtmp33[row+2]; 2065 rs = 0.0; 2066 rtmp11[row] = 1.0/rtmp11[row]; 2067 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2068 rtmp33[row+2] = 1.0/rtmp33[row+2]; 2069 /* copy row entries from dense representation to sparse */ 2070 for (j=0; j<nz; j++) { 2071 idx = pj[j]; 2072 pc1[j] = rtmp11[idx]; 2073 pc2[j] = rtmp22[idx]; 2074 pc3[j] = rtmp33[idx]; 2075 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 2076 } 2077 2078 sctx.rs = rs; 2079 ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr); 2080 if (newshift == 1) goto endofwhile; 2081 break; 2082 2083 default: 2084 SETERRQ(PETSC_ERR_SUP,"Node size not yet supported \n"); 2085 } 2086 row += nodesz; /* Update the row */ 2087 } 2088 endofwhile:; 2089 } while (sctx.useshift); 2090 ierr = PetscFree(rtmp11);CHKERRQ(ierr); 2091 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 2092 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2093 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2094 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 2095 (B)->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 2096 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2097 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2098 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2099 C->assembled = PETSC_TRUE; 2100 C->preallocated = PETSC_TRUE; 2101 if (sctx.nshift) { 2102 if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) { 2103 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 2104 } else if (info->shifttype == MAT_SHIFT_NONZERO) { 2105 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 2106 } 2107 } 2108 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 2109 PetscFunctionReturn(0); 2110 } 2111 2112 2113 /* ----------------------------------------------------------- */ 2114 #undef __FUNCT__ 2115 #define __FUNCT__ "MatSolve_SeqAIJ_Inode" 2116 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 2117 { 2118 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2119 IS iscol = a->col,isrow = a->row; 2120 PetscErrorCode ierr; 2121 const PetscInt *r,*c,*rout,*cout; 2122 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 2123 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 2124 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 2125 PetscScalar sum1,sum2,sum3,sum4,sum5; 2126 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 2127 const PetscScalar *b; 2128 2129 PetscFunctionBegin; 2130 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 2131 node_max = a->inode.node_count; 2132 ns = a->inode.size; /* Node Size array */ 2133 2134 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2135 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2136 tmp = a->solve_work; 2137 2138 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2139 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2140 2141 /* forward solve the lower triangular */ 2142 tmps = tmp ; 2143 aa = a_a ; 2144 aj = a_j ; 2145 ad = a->diag; 2146 2147 for (i = 0,row = 0; i< node_max; ++i){ 2148 nsz = ns[i]; 2149 aii = ai[row]; 2150 v1 = aa + aii; 2151 vi = aj + aii; 2152 nz = ai[row+1]- ai[row]; 2153 2154 if (i < node_max-1) { 2155 /* Prefetch the indices for the next block */ 2156 PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,0); /* indices */ 2157 /* Prefetch the data for the next block */ 2158 PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,0); 2159 } 2160 2161 switch (nsz){ /* Each loop in 'case' is unrolled */ 2162 case 1 : 2163 sum1 = b[r[row]]; 2164 for(j=0; j<nz-1; j+=2){ 2165 i0 = vi[j]; 2166 i1 = vi[j+1]; 2167 tmp0 = tmps[i0]; 2168 tmp1 = tmps[i1]; 2169 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1; 2170 } 2171 if(j == nz-1){ 2172 tmp0 = tmps[vi[j]]; 2173 sum1 -= v1[j]*tmp0; 2174 } 2175 tmp[row++]=sum1; 2176 break; 2177 case 2: 2178 sum1 = b[r[row]]; 2179 sum2 = b[r[row+1]]; 2180 v2 = aa + ai[row+1]; 2181 2182 for(j=0; j<nz-1; j+=2){ 2183 i0 = vi[j]; 2184 i1 = vi[j+1]; 2185 tmp0 = tmps[i0]; 2186 tmp1 = tmps[i1]; 2187 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2188 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2189 } 2190 if(j == nz-1){ 2191 tmp0 = tmps[vi[j]]; 2192 sum1 -= v1[j] *tmp0; 2193 sum2 -= v2[j] *tmp0; 2194 } 2195 sum2 -= v2[nz] * sum1; 2196 tmp[row ++]=sum1; 2197 tmp[row ++]=sum2; 2198 break; 2199 case 3: 2200 sum1 = b[r[row]]; 2201 sum2 = b[r[row+1]]; 2202 sum3 = b[r[row+2]]; 2203 v2 = aa + ai[row+1]; 2204 v3 = aa + ai[row+2]; 2205 2206 for (j=0; j<nz-1; j+=2){ 2207 i0 = vi[j]; 2208 i1 = vi[j+1]; 2209 tmp0 = tmps[i0]; 2210 tmp1 = tmps[i1]; 2211 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2212 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2213 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2214 } 2215 if (j == nz-1){ 2216 tmp0 = tmps[vi[j]]; 2217 sum1 -= v1[j] *tmp0; 2218 sum2 -= v2[j] *tmp0; 2219 sum3 -= v3[j] *tmp0; 2220 } 2221 sum2 -= v2[nz] * sum1; 2222 sum3 -= v3[nz] * sum1; 2223 sum3 -= v3[nz+1] * sum2; 2224 tmp[row ++]=sum1; 2225 tmp[row ++]=sum2; 2226 tmp[row ++]=sum3; 2227 break; 2228 2229 case 4: 2230 sum1 = b[r[row]]; 2231 sum2 = b[r[row+1]]; 2232 sum3 = b[r[row+2]]; 2233 sum4 = b[r[row+3]]; 2234 v2 = aa + ai[row+1]; 2235 v3 = aa + ai[row+2]; 2236 v4 = aa + ai[row+3]; 2237 2238 for (j=0; j<nz-1; j+=2){ 2239 i0 = vi[j]; 2240 i1 = vi[j+1]; 2241 tmp0 = tmps[i0]; 2242 tmp1 = tmps[i1]; 2243 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2244 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2245 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2246 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2247 } 2248 if (j == nz-1){ 2249 tmp0 = tmps[vi[j]]; 2250 sum1 -= v1[j] *tmp0; 2251 sum2 -= v2[j] *tmp0; 2252 sum3 -= v3[j] *tmp0; 2253 sum4 -= v4[j] *tmp0; 2254 } 2255 sum2 -= v2[nz] * sum1; 2256 sum3 -= v3[nz] * sum1; 2257 sum4 -= v4[nz] * sum1; 2258 sum3 -= v3[nz+1] * sum2; 2259 sum4 -= v4[nz+1] * sum2; 2260 sum4 -= v4[nz+2] * sum3; 2261 2262 tmp[row ++]=sum1; 2263 tmp[row ++]=sum2; 2264 tmp[row ++]=sum3; 2265 tmp[row ++]=sum4; 2266 break; 2267 case 5: 2268 sum1 = b[r[row]]; 2269 sum2 = b[r[row+1]]; 2270 sum3 = b[r[row+2]]; 2271 sum4 = b[r[row+3]]; 2272 sum5 = b[r[row+4]]; 2273 v2 = aa + ai[row+1]; 2274 v3 = aa + ai[row+2]; 2275 v4 = aa + ai[row+3]; 2276 v5 = aa + ai[row+4]; 2277 2278 for (j=0; j<nz-1; j+=2){ 2279 i0 = vi[j]; 2280 i1 = vi[j+1]; 2281 tmp0 = tmps[i0]; 2282 tmp1 = tmps[i1]; 2283 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2284 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2285 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2286 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2287 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1; 2288 } 2289 if (j == nz-1){ 2290 tmp0 = tmps[vi[j]]; 2291 sum1 -= v1[j] *tmp0; 2292 sum2 -= v2[j] *tmp0; 2293 sum3 -= v3[j] *tmp0; 2294 sum4 -= v4[j] *tmp0; 2295 sum5 -= v5[j] *tmp0; 2296 } 2297 2298 sum2 -= v2[nz] * sum1; 2299 sum3 -= v3[nz] * sum1; 2300 sum4 -= v4[nz] * sum1; 2301 sum5 -= v5[nz] * sum1; 2302 sum3 -= v3[nz+1] * sum2; 2303 sum4 -= v4[nz+1] * sum2; 2304 sum5 -= v5[nz+1] * sum2; 2305 sum4 -= v4[nz+2] * sum3; 2306 sum5 -= v5[nz+2] * sum3; 2307 sum5 -= v5[nz+3] * sum4; 2308 2309 tmp[row ++]=sum1; 2310 tmp[row ++]=sum2; 2311 tmp[row ++]=sum3; 2312 tmp[row ++]=sum4; 2313 tmp[row ++]=sum5; 2314 break; 2315 default: 2316 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 2317 } 2318 } 2319 /* backward solve the upper triangular */ 2320 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 2321 nsz = ns[i]; 2322 aii = ad[row+1] + 1; 2323 v1 = aa + aii; 2324 vi = aj + aii; 2325 nz = ad[row]- ad[row+1] - 1; 2326 2327 if (i > 0) { 2328 /* Prefetch the indices for the next block */ 2329 PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,0); /* indices */ 2330 /* Prefetch the data for the next block */ 2331 PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,0); 2332 } 2333 2334 switch (nsz){ /* Each loop in 'case' is unrolled */ 2335 case 1 : 2336 sum1 = tmp[row]; 2337 2338 for(j=0 ; j<nz-1; j+=2){ 2339 i0 = vi[j]; 2340 i1 = vi[j+1]; 2341 tmp0 = tmps[i0]; 2342 tmp1 = tmps[i1]; 2343 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2344 } 2345 if (j == nz-1){ 2346 tmp0 = tmps[vi[j]]; 2347 sum1 -= v1[j]*tmp0; 2348 } 2349 x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2350 break; 2351 case 2 : 2352 sum1 = tmp[row]; 2353 sum2 = tmp[row-1]; 2354 v2 = aa + ad[row] + 1; 2355 for (j=0 ; j<nz-1; j+=2){ 2356 i0 = vi[j]; 2357 i1 = vi[j+1]; 2358 tmp0 = tmps[i0]; 2359 tmp1 = tmps[i1]; 2360 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2361 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2362 } 2363 if (j == nz-1){ 2364 tmp0 = tmps[vi[j]]; 2365 sum1 -= v1[j]* tmp0; 2366 sum2 -= v2[j+1]* tmp0; 2367 } 2368 2369 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2370 sum2 -= v2[0] * tmp0; 2371 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2372 break; 2373 case 3 : 2374 sum1 = tmp[row]; 2375 sum2 = tmp[row -1]; 2376 sum3 = tmp[row -2]; 2377 v2 = aa + ad[row] + 1; 2378 v3 = aa + ad[row -1] + 1; 2379 for (j=0 ; j<nz-1; j+=2){ 2380 i0 = vi[j]; 2381 i1 = vi[j+1]; 2382 tmp0 = tmps[i0]; 2383 tmp1 = tmps[i1]; 2384 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2385 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2386 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2387 } 2388 if (j== nz-1){ 2389 tmp0 = tmps[vi[j]]; 2390 sum1 -= v1[j] * tmp0; 2391 sum2 -= v2[j+1] * tmp0; 2392 sum3 -= v3[j+2] * tmp0; 2393 } 2394 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2395 sum2 -= v2[0]* tmp0; 2396 sum3 -= v3[1] * tmp0; 2397 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2398 sum3 -= v3[0]* tmp0; 2399 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2400 2401 break; 2402 case 4 : 2403 sum1 = tmp[row]; 2404 sum2 = tmp[row -1]; 2405 sum3 = tmp[row -2]; 2406 sum4 = tmp[row -3]; 2407 v2 = aa + ad[row]+1; 2408 v3 = aa + ad[row -1]+1; 2409 v4 = aa + ad[row -2]+1; 2410 2411 for (j=0 ; j<nz-1; j+=2){ 2412 i0 = vi[j]; 2413 i1 = vi[j+1]; 2414 tmp0 = tmps[i0]; 2415 tmp1 = tmps[i1]; 2416 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2417 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2418 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2419 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2420 } 2421 if (j== nz-1){ 2422 tmp0 = tmps[vi[j]]; 2423 sum1 -= v1[j] * tmp0; 2424 sum2 -= v2[j+1] * tmp0; 2425 sum3 -= v3[j+2] * tmp0; 2426 sum4 -= v4[j+3] * tmp0; 2427 } 2428 2429 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2430 sum2 -= v2[0] * tmp0; 2431 sum3 -= v3[1] * tmp0; 2432 sum4 -= v4[2] * tmp0; 2433 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2434 sum3 -= v3[0] * tmp0; 2435 sum4 -= v4[1] * tmp0; 2436 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2437 sum4 -= v4[0] * tmp0; 2438 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2439 break; 2440 case 5 : 2441 sum1 = tmp[row]; 2442 sum2 = tmp[row -1]; 2443 sum3 = tmp[row -2]; 2444 sum4 = tmp[row -3]; 2445 sum5 = tmp[row -4]; 2446 v2 = aa + ad[row]+1; 2447 v3 = aa + ad[row -1]+1; 2448 v4 = aa + ad[row -2]+1; 2449 v5 = aa + ad[row -3]+1; 2450 for (j=0 ; j<nz-1; j+=2){ 2451 i0 = vi[j]; 2452 i1 = vi[j+1]; 2453 tmp0 = tmps[i0]; 2454 tmp1 = tmps[i1]; 2455 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2456 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2457 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2458 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2459 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1; 2460 } 2461 if (j==nz-1){ 2462 tmp0 = tmps[vi[j]]; 2463 sum1 -= v1[j] * tmp0; 2464 sum2 -= v2[j+1] * tmp0; 2465 sum3 -= v3[j+2] * tmp0; 2466 sum4 -= v4[j+3] * tmp0; 2467 sum5 -= v5[j+4] * tmp0; 2468 } 2469 2470 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2471 sum2 -= v2[0] * tmp0; 2472 sum3 -= v3[1] * tmp0; 2473 sum4 -= v4[2] * tmp0; 2474 sum5 -= v5[3] * tmp0; 2475 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2476 sum3 -= v3[0] * tmp0; 2477 sum4 -= v4[1] * tmp0; 2478 sum5 -= v5[2] * tmp0; 2479 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2480 sum4 -= v4[0] * tmp0; 2481 sum5 -= v5[1] * tmp0; 2482 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2483 sum5 -= v5[0] * tmp0; 2484 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--; 2485 break; 2486 default: 2487 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 2488 } 2489 } 2490 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2491 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2492 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2493 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2494 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 2495 PetscFunctionReturn(0); 2496 } 2497 2498 2499 /* 2500 Makes a longer coloring[] array and calls the usual code with that 2501 */ 2502 #undef __FUNCT__ 2503 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode" 2504 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 2505 { 2506 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 2507 PetscErrorCode ierr; 2508 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 2509 PetscInt *colorused,i; 2510 ISColoringValue *newcolor; 2511 2512 PetscFunctionBegin; 2513 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr); 2514 /* loop over inodes, marking a color for each column*/ 2515 row = 0; 2516 for (i=0; i<m; i++){ 2517 for (j=0; j<ns[i]; j++) { 2518 newcolor[row++] = coloring[i] + j*ncolors; 2519 } 2520 } 2521 2522 /* eliminate unneeded colors */ 2523 ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr); 2524 ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr); 2525 for (i=0; i<n; i++) { 2526 colorused[newcolor[i]] = 1; 2527 } 2528 2529 for (i=1; i<5*ncolors; i++) { 2530 colorused[i] += colorused[i-1]; 2531 } 2532 ncolors = colorused[5*ncolors-1]; 2533 for (i=0; i<n; i++) { 2534 newcolor[i] = colorused[newcolor[i]]-1; 2535 } 2536 ierr = PetscFree(colorused);CHKERRQ(ierr); 2537 ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); 2538 ierr = PetscFree(coloring);CHKERRQ(ierr); 2539 PetscFunctionReturn(0); 2540 } 2541 2542 #include "../src/mat/blockinvert.h" 2543 2544 #undef __FUNCT__ 2545 #define __FUNCT__ "MatSOR_SeqAIJ_Inode" 2546 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 2547 { 2548 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2549 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3; 2550 MatScalar *ibdiag,*bdiag,work[25]; 2551 PetscScalar *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5; 2552 const MatScalar *v = a->a,*v1,*v2,*v3,*v4,*v5; 2553 PetscReal zeropivot = 1.0e-15, shift = 0.0; 2554 PetscErrorCode ierr; 2555 PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2; 2556 PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5]; 2557 2558 PetscFunctionBegin; 2559 if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 2560 if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 2561 if (its > 1) { 2562 /* switch to non-inode version */ 2563 ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr); 2564 PetscFunctionReturn(0); 2565 } 2566 2567 if (!a->inode.ibdiagvalid) { 2568 if (!a->inode.ibdiag) { 2569 /* calculate space needed for diagonal blocks */ 2570 for (i=0; i<m; i++) { 2571 cnt += sizes[i]*sizes[i]; 2572 } 2573 a->inode.bdiagsize = cnt; 2574 ierr = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr); 2575 } 2576 2577 /* copy over the diagonal blocks and invert them */ 2578 ibdiag = a->inode.ibdiag; 2579 bdiag = a->inode.bdiag; 2580 cnt = 0; 2581 for (i=0, row = 0; i<m; i++) { 2582 for (j=0; j<sizes[i]; j++) { 2583 for (k=0; k<sizes[i]; k++) { 2584 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 2585 } 2586 } 2587 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 2588 2589 switch(sizes[i]) { 2590 case 1: 2591 /* Create matrix data structure */ 2592 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 2593 ibdiag[cnt] = 1.0/ibdiag[cnt]; 2594 break; 2595 case 2: 2596 ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr); 2597 break; 2598 case 3: 2599 ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr); 2600 break; 2601 case 4: 2602 ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr); 2603 break; 2604 case 5: 2605 ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr); 2606 break; 2607 default: 2608 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2609 } 2610 cnt += sizes[i]*sizes[i]; 2611 row += sizes[i]; 2612 } 2613 a->inode.ibdiagvalid = PETSC_TRUE; 2614 } 2615 ibdiag = a->inode.ibdiag; 2616 bdiag = a->inode.bdiag; 2617 2618 2619 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2620 if (flag & SOR_ZERO_INITIAL_GUESS) { 2621 PetscScalar *b; 2622 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2623 if (xx != bb) { 2624 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2625 } else { 2626 b = x; 2627 } 2628 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 2629 2630 for (i=0, row=0; i<m; i++) { 2631 sz = diag[row] - ii[row]; 2632 v1 = a->a + ii[row]; 2633 idx = a->j + ii[row]; 2634 2635 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2636 switch (sizes[i]){ 2637 case 1: 2638 2639 sum1 = b[row]; 2640 for(n = 0; n<sz-1; n+=2) { 2641 i1 = idx[0]; 2642 i2 = idx[1]; 2643 idx += 2; 2644 tmp0 = x[i1]; 2645 tmp1 = x[i2]; 2646 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2647 } 2648 2649 if (n == sz-1){ 2650 tmp0 = x[*idx]; 2651 sum1 -= *v1 * tmp0; 2652 } 2653 x[row++] = sum1*(*ibdiag++); 2654 break; 2655 case 2: 2656 v2 = a->a + ii[row+1]; 2657 sum1 = b[row]; 2658 sum2 = b[row+1]; 2659 for(n = 0; n<sz-1; n+=2) { 2660 i1 = idx[0]; 2661 i2 = idx[1]; 2662 idx += 2; 2663 tmp0 = x[i1]; 2664 tmp1 = x[i2]; 2665 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2666 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2667 } 2668 2669 if (n == sz-1){ 2670 tmp0 = x[*idx]; 2671 sum1 -= v1[0] * tmp0; 2672 sum2 -= v2[0] * tmp0; 2673 } 2674 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2675 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2676 ibdiag += 4; 2677 break; 2678 case 3: 2679 v2 = a->a + ii[row+1]; 2680 v3 = a->a + ii[row+2]; 2681 sum1 = b[row]; 2682 sum2 = b[row+1]; 2683 sum3 = b[row+2]; 2684 for(n = 0; n<sz-1; n+=2) { 2685 i1 = idx[0]; 2686 i2 = idx[1]; 2687 idx += 2; 2688 tmp0 = x[i1]; 2689 tmp1 = x[i2]; 2690 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2691 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2692 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2693 } 2694 2695 if (n == sz-1){ 2696 tmp0 = x[*idx]; 2697 sum1 -= v1[0] * tmp0; 2698 sum2 -= v2[0] * tmp0; 2699 sum3 -= v3[0] * tmp0; 2700 } 2701 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2702 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2703 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2704 ibdiag += 9; 2705 break; 2706 case 4: 2707 v2 = a->a + ii[row+1]; 2708 v3 = a->a + ii[row+2]; 2709 v4 = a->a + ii[row+3]; 2710 sum1 = b[row]; 2711 sum2 = b[row+1]; 2712 sum3 = b[row+2]; 2713 sum4 = b[row+3]; 2714 for(n = 0; n<sz-1; n+=2) { 2715 i1 = idx[0]; 2716 i2 = idx[1]; 2717 idx += 2; 2718 tmp0 = x[i1]; 2719 tmp1 = x[i2]; 2720 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2721 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2722 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2723 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2724 } 2725 2726 if (n == sz-1){ 2727 tmp0 = x[*idx]; 2728 sum1 -= v1[0] * tmp0; 2729 sum2 -= v2[0] * tmp0; 2730 sum3 -= v3[0] * tmp0; 2731 sum4 -= v4[0] * tmp0; 2732 } 2733 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2734 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2735 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2736 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2737 ibdiag += 16; 2738 break; 2739 case 5: 2740 v2 = a->a + ii[row+1]; 2741 v3 = a->a + ii[row+2]; 2742 v4 = a->a + ii[row+3]; 2743 v5 = a->a + ii[row+4]; 2744 sum1 = b[row]; 2745 sum2 = b[row+1]; 2746 sum3 = b[row+2]; 2747 sum4 = b[row+3]; 2748 sum5 = b[row+4]; 2749 for(n = 0; n<sz-1; n+=2) { 2750 i1 = idx[0]; 2751 i2 = idx[1]; 2752 idx += 2; 2753 tmp0 = x[i1]; 2754 tmp1 = x[i2]; 2755 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2756 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2757 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2758 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2759 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2760 } 2761 2762 if (n == sz-1){ 2763 tmp0 = x[*idx]; 2764 sum1 -= v1[0] * tmp0; 2765 sum2 -= v2[0] * tmp0; 2766 sum3 -= v3[0] * tmp0; 2767 sum4 -= v4[0] * tmp0; 2768 sum5 -= v5[0] * tmp0; 2769 } 2770 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2771 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2772 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2773 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2774 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2775 ibdiag += 25; 2776 break; 2777 default: 2778 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2779 } 2780 } 2781 2782 xb = x; 2783 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2784 } else xb = b; 2785 if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) && 2786 (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) { 2787 cnt = 0; 2788 for (i=0, row=0; i<m; i++) { 2789 2790 switch (sizes[i]){ 2791 case 1: 2792 x[row++] *= bdiag[cnt++]; 2793 break; 2794 case 2: 2795 x1 = x[row]; x2 = x[row+1]; 2796 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 2797 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 2798 x[row++] = tmp1; 2799 x[row++] = tmp2; 2800 cnt += 4; 2801 break; 2802 case 3: 2803 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 2804 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 2805 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 2806 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 2807 x[row++] = tmp1; 2808 x[row++] = tmp2; 2809 x[row++] = tmp3; 2810 cnt += 9; 2811 break; 2812 case 4: 2813 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 2814 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 2815 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 2816 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 2817 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 2818 x[row++] = tmp1; 2819 x[row++] = tmp2; 2820 x[row++] = tmp3; 2821 x[row++] = tmp4; 2822 cnt += 16; 2823 break; 2824 case 5: 2825 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 2826 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 2827 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 2828 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 2829 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 2830 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 2831 x[row++] = tmp1; 2832 x[row++] = tmp2; 2833 x[row++] = tmp3; 2834 x[row++] = tmp4; 2835 x[row++] = tmp5; 2836 cnt += 25; 2837 break; 2838 default: 2839 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2840 } 2841 } 2842 ierr = PetscLogFlops(m);CHKERRQ(ierr); 2843 } 2844 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 2845 2846 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 2847 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 2848 ibdiag -= sizes[i]*sizes[i]; 2849 sz = ii[row+1] - diag[row] - 1; 2850 v1 = a->a + diag[row] + 1; 2851 idx = a->j + diag[row] + 1; 2852 2853 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2854 switch (sizes[i]){ 2855 case 1: 2856 2857 sum1 = xb[row]; 2858 for(n = 0; n<sz-1; n+=2) { 2859 i1 = idx[0]; 2860 i2 = idx[1]; 2861 idx += 2; 2862 tmp0 = x[i1]; 2863 tmp1 = x[i2]; 2864 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2865 } 2866 2867 if (n == sz-1){ 2868 tmp0 = x[*idx]; 2869 sum1 -= *v1*tmp0; 2870 } 2871 x[row--] = sum1*(*ibdiag); 2872 break; 2873 2874 case 2: 2875 2876 sum1 = xb[row]; 2877 sum2 = xb[row-1]; 2878 /* note that sum1 is associated with the second of the two rows */ 2879 v2 = a->a + diag[row-1] + 2; 2880 for(n = 0; n<sz-1; n+=2) { 2881 i1 = idx[0]; 2882 i2 = idx[1]; 2883 idx += 2; 2884 tmp0 = x[i1]; 2885 tmp1 = x[i2]; 2886 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2887 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2888 } 2889 2890 if (n == sz-1){ 2891 tmp0 = x[*idx]; 2892 sum1 -= *v1*tmp0; 2893 sum2 -= *v2*tmp0; 2894 } 2895 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 2896 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 2897 break; 2898 case 3: 2899 2900 sum1 = xb[row]; 2901 sum2 = xb[row-1]; 2902 sum3 = xb[row-2]; 2903 v2 = a->a + diag[row-1] + 2; 2904 v3 = a->a + diag[row-2] + 3; 2905 for(n = 0; n<sz-1; n+=2) { 2906 i1 = idx[0]; 2907 i2 = idx[1]; 2908 idx += 2; 2909 tmp0 = x[i1]; 2910 tmp1 = x[i2]; 2911 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2912 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2913 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2914 } 2915 2916 if (n == sz-1){ 2917 tmp0 = x[*idx]; 2918 sum1 -= *v1*tmp0; 2919 sum2 -= *v2*tmp0; 2920 sum3 -= *v3*tmp0; 2921 } 2922 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 2923 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 2924 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 2925 break; 2926 case 4: 2927 2928 sum1 = xb[row]; 2929 sum2 = xb[row-1]; 2930 sum3 = xb[row-2]; 2931 sum4 = xb[row-3]; 2932 v2 = a->a + diag[row-1] + 2; 2933 v3 = a->a + diag[row-2] + 3; 2934 v4 = a->a + diag[row-3] + 4; 2935 for(n = 0; n<sz-1; n+=2) { 2936 i1 = idx[0]; 2937 i2 = idx[1]; 2938 idx += 2; 2939 tmp0 = x[i1]; 2940 tmp1 = x[i2]; 2941 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2942 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2943 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2944 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2945 } 2946 2947 if (n == sz-1){ 2948 tmp0 = x[*idx]; 2949 sum1 -= *v1*tmp0; 2950 sum2 -= *v2*tmp0; 2951 sum3 -= *v3*tmp0; 2952 sum4 -= *v4*tmp0; 2953 } 2954 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 2955 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 2956 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 2957 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 2958 break; 2959 case 5: 2960 2961 sum1 = xb[row]; 2962 sum2 = xb[row-1]; 2963 sum3 = xb[row-2]; 2964 sum4 = xb[row-3]; 2965 sum5 = xb[row-4]; 2966 v2 = a->a + diag[row-1] + 2; 2967 v3 = a->a + diag[row-2] + 3; 2968 v4 = a->a + diag[row-3] + 4; 2969 v5 = a->a + diag[row-4] + 5; 2970 for(n = 0; n<sz-1; n+=2) { 2971 i1 = idx[0]; 2972 i2 = idx[1]; 2973 idx += 2; 2974 tmp0 = x[i1]; 2975 tmp1 = x[i2]; 2976 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2977 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2978 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2979 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2980 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2981 } 2982 2983 if (n == sz-1){ 2984 tmp0 = x[*idx]; 2985 sum1 -= *v1*tmp0; 2986 sum2 -= *v2*tmp0; 2987 sum3 -= *v3*tmp0; 2988 sum4 -= *v4*tmp0; 2989 sum5 -= *v5*tmp0; 2990 } 2991 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 2992 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 2993 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 2994 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 2995 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 2996 break; 2997 default: 2998 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2999 } 3000 } 3001 3002 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3003 } 3004 its--; 3005 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3006 if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);} 3007 } 3008 if (flag & SOR_EISENSTAT) { 3009 const PetscScalar *b; 3010 MatScalar *t = a->inode.ssor_work; 3011 3012 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3013 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3014 /* 3015 Apply (U + D)^-1 where D is now the block diagonal 3016 */ 3017 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3018 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3019 ibdiag -= sizes[i]*sizes[i]; 3020 sz = ii[row+1] - diag[row] - 1; 3021 v1 = a->a + diag[row] + 1; 3022 idx = a->j + diag[row] + 1; 3023 CHKMEMQ; 3024 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3025 switch (sizes[i]){ 3026 case 1: 3027 3028 sum1 = b[row]; 3029 for(n = 0; n<sz-1; n+=2) { 3030 i1 = idx[0]; 3031 i2 = idx[1]; 3032 idx += 2; 3033 tmp0 = x[i1]; 3034 tmp1 = x[i2]; 3035 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3036 } 3037 3038 if (n == sz-1){ 3039 tmp0 = x[*idx]; 3040 sum1 -= *v1*tmp0; 3041 } 3042 x[row] = sum1*(*ibdiag);row--; 3043 break; 3044 3045 case 2: 3046 3047 sum1 = b[row]; 3048 sum2 = b[row-1]; 3049 /* note that sum1 is associated with the second of the two rows */ 3050 v2 = a->a + diag[row-1] + 2; 3051 for(n = 0; n<sz-1; n+=2) { 3052 i1 = idx[0]; 3053 i2 = idx[1]; 3054 idx += 2; 3055 tmp0 = x[i1]; 3056 tmp1 = x[i2]; 3057 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3058 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3059 } 3060 3061 if (n == sz-1){ 3062 tmp0 = x[*idx]; 3063 sum1 -= *v1*tmp0; 3064 sum2 -= *v2*tmp0; 3065 } 3066 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3067 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3068 row -= 2; 3069 break; 3070 case 3: 3071 3072 sum1 = b[row]; 3073 sum2 = b[row-1]; 3074 sum3 = b[row-2]; 3075 v2 = a->a + diag[row-1] + 2; 3076 v3 = a->a + diag[row-2] + 3; 3077 for(n = 0; n<sz-1; n+=2) { 3078 i1 = idx[0]; 3079 i2 = idx[1]; 3080 idx += 2; 3081 tmp0 = x[i1]; 3082 tmp1 = x[i2]; 3083 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3084 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3085 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3086 } 3087 3088 if (n == sz-1){ 3089 tmp0 = x[*idx]; 3090 sum1 -= *v1*tmp0; 3091 sum2 -= *v2*tmp0; 3092 sum3 -= *v3*tmp0; 3093 } 3094 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3095 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3096 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3097 row -= 3; 3098 break; 3099 case 4: 3100 3101 sum1 = b[row]; 3102 sum2 = b[row-1]; 3103 sum3 = b[row-2]; 3104 sum4 = b[row-3]; 3105 v2 = a->a + diag[row-1] + 2; 3106 v3 = a->a + diag[row-2] + 3; 3107 v4 = a->a + diag[row-3] + 4; 3108 for(n = 0; n<sz-1; n+=2) { 3109 i1 = idx[0]; 3110 i2 = idx[1]; 3111 idx += 2; 3112 tmp0 = x[i1]; 3113 tmp1 = x[i2]; 3114 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3115 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3116 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3117 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3118 } 3119 3120 if (n == sz-1){ 3121 tmp0 = x[*idx]; 3122 sum1 -= *v1*tmp0; 3123 sum2 -= *v2*tmp0; 3124 sum3 -= *v3*tmp0; 3125 sum4 -= *v4*tmp0; 3126 } 3127 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3128 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3129 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3130 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3131 row -= 4; 3132 break; 3133 case 5: 3134 3135 sum1 = b[row]; 3136 sum2 = b[row-1]; 3137 sum3 = b[row-2]; 3138 sum4 = b[row-3]; 3139 sum5 = b[row-4]; 3140 v2 = a->a + diag[row-1] + 2; 3141 v3 = a->a + diag[row-2] + 3; 3142 v4 = a->a + diag[row-3] + 4; 3143 v5 = a->a + diag[row-4] + 5; 3144 for(n = 0; n<sz-1; n+=2) { 3145 i1 = idx[0]; 3146 i2 = idx[1]; 3147 idx += 2; 3148 tmp0 = x[i1]; 3149 tmp1 = x[i2]; 3150 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3151 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3152 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3153 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3154 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3155 } 3156 3157 if (n == sz-1){ 3158 tmp0 = x[*idx]; 3159 sum1 -= *v1*tmp0; 3160 sum2 -= *v2*tmp0; 3161 sum3 -= *v3*tmp0; 3162 sum4 -= *v4*tmp0; 3163 sum5 -= *v5*tmp0; 3164 } 3165 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3166 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3167 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3168 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3169 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3170 row -= 5; 3171 break; 3172 default: 3173 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3174 } 3175 CHKMEMQ; 3176 } 3177 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3178 3179 /* 3180 t = b - D x where D is the block diagonal 3181 */ 3182 cnt = 0; 3183 for (i=0, row=0; i<m; i++) { 3184 CHKMEMQ; 3185 switch (sizes[i]){ 3186 case 1: 3187 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 3188 break; 3189 case 2: 3190 x1 = x[row]; x2 = x[row+1]; 3191 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3192 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3193 t[row] = b[row] - tmp1; 3194 t[row+1] = b[row+1] - tmp2; row += 2; 3195 cnt += 4; 3196 break; 3197 case 3: 3198 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3199 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3200 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3201 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3202 t[row] = b[row] - tmp1; 3203 t[row+1] = b[row+1] - tmp2; 3204 t[row+2] = b[row+2] - tmp3; row += 3; 3205 cnt += 9; 3206 break; 3207 case 4: 3208 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3209 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3210 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3211 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3212 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3213 t[row] = b[row] - tmp1; 3214 t[row+1] = b[row+1] - tmp2; 3215 t[row+2] = b[row+2] - tmp3; 3216 t[row+3] = b[row+3] - tmp4; row += 4; 3217 cnt += 16; 3218 break; 3219 case 5: 3220 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3221 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3222 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3223 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3224 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3225 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3226 t[row] = b[row] - tmp1; 3227 t[row+1] = b[row+1] - tmp2; 3228 t[row+2] = b[row+2] - tmp3; 3229 t[row+3] = b[row+3] - tmp4; 3230 t[row+4] = b[row+4] - tmp5;row += 5; 3231 cnt += 25; 3232 break; 3233 default: 3234 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3235 } 3236 CHKMEMQ; 3237 } 3238 ierr = PetscLogFlops(m);CHKERRQ(ierr); 3239 3240 3241 3242 /* 3243 Apply (L + D)^-1 where D is the block diagonal 3244 */ 3245 for (i=0, row=0; i<m; i++) { 3246 sz = diag[row] - ii[row]; 3247 v1 = a->a + ii[row]; 3248 idx = a->j + ii[row]; 3249 CHKMEMQ; 3250 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3251 switch (sizes[i]){ 3252 case 1: 3253 3254 sum1 = t[row]; 3255 for(n = 0; n<sz-1; n+=2) { 3256 i1 = idx[0]; 3257 i2 = idx[1]; 3258 idx += 2; 3259 tmp0 = t[i1]; 3260 tmp1 = t[i2]; 3261 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3262 } 3263 3264 if (n == sz-1){ 3265 tmp0 = t[*idx]; 3266 sum1 -= *v1 * tmp0; 3267 } 3268 x[row] += t[row] = sum1*(*ibdiag++); row++; 3269 break; 3270 case 2: 3271 v2 = a->a + ii[row+1]; 3272 sum1 = t[row]; 3273 sum2 = t[row+1]; 3274 for(n = 0; n<sz-1; n+=2) { 3275 i1 = idx[0]; 3276 i2 = idx[1]; 3277 idx += 2; 3278 tmp0 = t[i1]; 3279 tmp1 = t[i2]; 3280 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3281 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3282 } 3283 3284 if (n == sz-1){ 3285 tmp0 = t[*idx]; 3286 sum1 -= v1[0] * tmp0; 3287 sum2 -= v2[0] * tmp0; 3288 } 3289 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3290 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3291 ibdiag += 4; row += 2; 3292 break; 3293 case 3: 3294 v2 = a->a + ii[row+1]; 3295 v3 = a->a + ii[row+2]; 3296 sum1 = t[row]; 3297 sum2 = t[row+1]; 3298 sum3 = t[row+2]; 3299 for(n = 0; n<sz-1; n+=2) { 3300 i1 = idx[0]; 3301 i2 = idx[1]; 3302 idx += 2; 3303 tmp0 = t[i1]; 3304 tmp1 = t[i2]; 3305 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3306 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3307 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3308 } 3309 3310 if (n == sz-1){ 3311 tmp0 = t[*idx]; 3312 sum1 -= v1[0] * tmp0; 3313 sum2 -= v2[0] * tmp0; 3314 sum3 -= v3[0] * tmp0; 3315 } 3316 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3317 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3318 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3319 ibdiag += 9; row += 3; 3320 break; 3321 case 4: 3322 v2 = a->a + ii[row+1]; 3323 v3 = a->a + ii[row+2]; 3324 v4 = a->a + ii[row+3]; 3325 sum1 = t[row]; 3326 sum2 = t[row+1]; 3327 sum3 = t[row+2]; 3328 sum4 = t[row+3]; 3329 for(n = 0; n<sz-1; n+=2) { 3330 i1 = idx[0]; 3331 i2 = idx[1]; 3332 idx += 2; 3333 tmp0 = t[i1]; 3334 tmp1 = t[i2]; 3335 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3336 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3337 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3338 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3339 } 3340 3341 if (n == sz-1){ 3342 tmp0 = t[*idx]; 3343 sum1 -= v1[0] * tmp0; 3344 sum2 -= v2[0] * tmp0; 3345 sum3 -= v3[0] * tmp0; 3346 sum4 -= v4[0] * tmp0; 3347 } 3348 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3349 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3350 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3351 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3352 ibdiag += 16; row += 4; 3353 break; 3354 case 5: 3355 v2 = a->a + ii[row+1]; 3356 v3 = a->a + ii[row+2]; 3357 v4 = a->a + ii[row+3]; 3358 v5 = a->a + ii[row+4]; 3359 sum1 = t[row]; 3360 sum2 = t[row+1]; 3361 sum3 = t[row+2]; 3362 sum4 = t[row+3]; 3363 sum5 = t[row+4]; 3364 for(n = 0; n<sz-1; n+=2) { 3365 i1 = idx[0]; 3366 i2 = idx[1]; 3367 idx += 2; 3368 tmp0 = t[i1]; 3369 tmp1 = t[i2]; 3370 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3371 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3372 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3373 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3374 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3375 } 3376 3377 if (n == sz-1){ 3378 tmp0 = t[*idx]; 3379 sum1 -= v1[0] * tmp0; 3380 sum2 -= v2[0] * tmp0; 3381 sum3 -= v3[0] * tmp0; 3382 sum4 -= v4[0] * tmp0; 3383 sum5 -= v5[0] * tmp0; 3384 } 3385 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3386 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3387 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3388 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3389 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3390 ibdiag += 25; row += 5; 3391 break; 3392 default: 3393 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3394 } 3395 CHKMEMQ; 3396 } 3397 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3398 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3399 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3400 } 3401 PetscFunctionReturn(0); 3402 } 3403 3404 #undef __FUNCT__ 3405 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode" 3406 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 3407 { 3408 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3409 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 3410 const MatScalar *bdiag = a->inode.bdiag; 3411 const PetscScalar *b; 3412 PetscErrorCode ierr; 3413 PetscInt m = a->inode.node_count,cnt = 0,i,row; 3414 const PetscInt *sizes = a->inode.size; 3415 3416 PetscFunctionBegin; 3417 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3418 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3419 cnt = 0; 3420 for (i=0, row=0; i<m; i++) { 3421 switch (sizes[i]){ 3422 case 1: 3423 x[row] = b[row]*bdiag[cnt++];row++; 3424 break; 3425 case 2: 3426 x1 = b[row]; x2 = b[row+1]; 3427 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3428 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3429 x[row++] = tmp1; 3430 x[row++] = tmp2; 3431 cnt += 4; 3432 break; 3433 case 3: 3434 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 3435 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3436 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3437 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3438 x[row++] = tmp1; 3439 x[row++] = tmp2; 3440 x[row++] = tmp3; 3441 cnt += 9; 3442 break; 3443 case 4: 3444 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 3445 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3446 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3447 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3448 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3449 x[row++] = tmp1; 3450 x[row++] = tmp2; 3451 x[row++] = tmp3; 3452 x[row++] = tmp4; 3453 cnt += 16; 3454 break; 3455 case 5: 3456 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 3457 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3458 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3459 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3460 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3461 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3462 x[row++] = tmp1; 3463 x[row++] = tmp2; 3464 x[row++] = tmp3; 3465 x[row++] = tmp4; 3466 x[row++] = tmp5; 3467 cnt += 25; 3468 break; 3469 default: 3470 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3471 } 3472 } 3473 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 3474 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3475 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3476 PetscFunctionReturn(0); 3477 } 3478 3479 /* 3480 samestructure indicates that the matrix has not changed its nonzero structure so we 3481 do not need to recompute the inodes 3482 */ 3483 #undef __FUNCT__ 3484 #define __FUNCT__ "Mat_CheckInode" 3485 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure) 3486 { 3487 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3488 PetscErrorCode ierr; 3489 PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size; 3490 PetscTruth flag; 3491 3492 PetscFunctionBegin; 3493 if (!a->inode.use) PetscFunctionReturn(0); 3494 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 3495 3496 3497 m = A->rmap->n; 3498 if (a->inode.size) {ns = a->inode.size;} 3499 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 3500 3501 i = 0; 3502 node_count = 0; 3503 idx = a->j; 3504 ii = a->i; 3505 while (i < m){ /* For each row */ 3506 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 3507 /* Limits the number of elements in a node to 'a->inode.limit' */ 3508 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 3509 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 3510 if (nzy != nzx) break; 3511 idy += nzx; /* Same nonzero pattern */ 3512 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 3513 if (!flag) break; 3514 } 3515 ns[node_count++] = blk_size; 3516 idx += blk_size*nzx; 3517 i = j; 3518 } 3519 /* If not enough inodes found,, do not use inode version of the routines */ 3520 if (!a->inode.size && m && node_count > .9*m) { 3521 ierr = PetscFree(ns);CHKERRQ(ierr); 3522 a->inode.node_count = 0; 3523 a->inode.size = PETSC_NULL; 3524 a->inode.use = PETSC_FALSE; 3525 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 3526 } else { 3527 A->ops->mult = MatMult_SeqAIJ_Inode; 3528 A->ops->sor = MatSOR_SeqAIJ_Inode; 3529 A->ops->multadd = MatMultAdd_SeqAIJ_Inode; 3530 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 3531 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 3532 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 3533 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 3534 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 3535 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 3536 A->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJ_Inode_inplace; 3537 a->inode.node_count = node_count; 3538 a->inode.size = ns; 3539 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 3540 } 3541 PetscFunctionReturn(0); 3542 } 3543 3544 #define MatGetRow_FactoredLU(cols,nzl,nzu,nz,ai,aj,adiag,row) { \ 3545 PetscInt __k, *__vi; \ 3546 __vi = aj + ai[row]; \ 3547 for(__k=0;__k<nzl;__k++) cols[__k] = __vi[__k]; \ 3548 __vi = aj + adiag[row]; \ 3549 cols[nzl] = __vi[0];\ 3550 __vi = aj + adiag[row+1]+1;\ 3551 for(__k=0;__k<nzu;__k++) cols[nzl+1+__k] = __vi[__k];} 3552 3553 3554 /* 3555 Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix. 3556 Modified from Mat_CheckInode(). 3557 3558 Input Parameters: 3559 + Mat A - ILU or LU matrix factor 3560 - samestructure - TURE indicates that the matrix has not changed its nonzero structure so we 3561 do not need to recompute the inodes 3562 */ 3563 #undef __FUNCT__ 3564 #define __FUNCT__ "Mat_CheckInode_FactorLU" 3565 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscTruth samestructure) 3566 { 3567 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3568 PetscErrorCode ierr; 3569 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; 3570 PetscInt *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag; 3571 PetscTruth flag; 3572 3573 PetscFunctionBegin; 3574 if (!a->inode.use) PetscFunctionReturn(0); 3575 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 3576 3577 m = A->rmap->n; 3578 if (a->inode.size) {ns = a->inode.size;} 3579 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 3580 3581 i = 0; 3582 node_count = 0; 3583 while (i < m){ /* For each row */ 3584 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ 3585 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ 3586 nzx = nzl1 + nzu1 + 1; 3587 ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr); 3588 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); 3589 3590 /* Limits the number of elements in a node to 'a->inode.limit' */ 3591 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 3592 nzl2 = ai[j+1] - ai[j]; 3593 nzu2 = adiag[j] - adiag[j+1] - 1; 3594 nzy = nzl2 + nzu2 + 1; 3595 if( nzy != nzx) break; 3596 ierr = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr); 3597 MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j); 3598 ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 3599 if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;} 3600 ierr = PetscFree(cols2);CHKERRQ(ierr); 3601 } 3602 ns[node_count++] = blk_size; 3603 ierr = PetscFree(cols1);CHKERRQ(ierr); 3604 i = j; 3605 } 3606 /* If not enough inodes found,, do not use inode version of the routines */ 3607 if (!a->inode.size && m && node_count > .9*m) { 3608 ierr = PetscFree(ns);CHKERRQ(ierr); 3609 a->inode.node_count = 0; 3610 a->inode.size = PETSC_NULL; 3611 a->inode.use = PETSC_FALSE; 3612 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 3613 } else { 3614 A->ops->mult = 0; 3615 A->ops->sor = 0; 3616 A->ops->multadd = 0; 3617 A->ops->getrowij = 0; 3618 A->ops->restorerowij = 0; 3619 A->ops->getcolumnij = 0; 3620 A->ops->restorecolumnij = 0; 3621 A->ops->coloringpatch = 0; 3622 A->ops->multdiagonalblock = 0; 3623 A->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJ_Inode; /* not done yet */ 3624 a->inode.node_count = node_count; 3625 a->inode.size = ns; 3626 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 3627 } 3628 PetscFunctionReturn(0); 3629 } 3630 3631 /* 3632 This is really ugly. if inodes are used this replaces the 3633 permutations with ones that correspond to rows/cols of the matrix 3634 rather then inode blocks 3635 */ 3636 #undef __FUNCT__ 3637 #define __FUNCT__ "MatInodeAdjustForInodes" 3638 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 3639 { 3640 PetscErrorCode ierr,(*f)(Mat,IS*,IS*); 3641 3642 PetscFunctionBegin; 3643 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr); 3644 if (f) { 3645 ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr); 3646 } 3647 PetscFunctionReturn(0); 3648 } 3649 3650 EXTERN_C_BEGIN 3651 #undef __FUNCT__ 3652 #define __FUNCT__ "MatAdjustForInodes_SeqAIJ_Inode" 3653 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 3654 { 3655 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 3656 PetscErrorCode ierr; 3657 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 3658 const PetscInt *ridx,*cidx; 3659 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 3660 PetscInt nslim_col,*ns_col; 3661 IS ris = *rperm,cis = *cperm; 3662 3663 PetscFunctionBegin; 3664 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 3665 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 3666 3667 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 3668 ierr = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 3669 ierr = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr); 3670 3671 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 3672 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 3673 3674 /* Form the inode structure for the rows of permuted matric using inv perm*/ 3675 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 3676 3677 /* Construct the permutations for rows*/ 3678 for (i=0,row = 0; i<nslim_row; ++i){ 3679 indx = ridx[i]; 3680 start_val = tns[indx]; 3681 end_val = tns[indx + 1]; 3682 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 3683 } 3684 3685 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 3686 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 3687 3688 /* Construct permutations for columns */ 3689 for (i=0,col=0; i<nslim_col; ++i){ 3690 indx = cidx[i]; 3691 start_val = tns[indx]; 3692 end_val = tns[indx + 1]; 3693 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 3694 } 3695 3696 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr); 3697 ierr = ISSetPermutation(*rperm);CHKERRQ(ierr); 3698 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr); 3699 ierr = ISSetPermutation(*cperm);CHKERRQ(ierr); 3700 3701 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 3702 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 3703 3704 ierr = PetscFree(ns_col);CHKERRQ(ierr); 3705 ierr = PetscFree2(permr,permc);CHKERRQ(ierr); 3706 ierr = ISDestroy(cis);CHKERRQ(ierr); 3707 ierr = ISDestroy(ris);CHKERRQ(ierr); 3708 ierr = PetscFree(tns);CHKERRQ(ierr); 3709 PetscFunctionReturn(0); 3710 } 3711 EXTERN_C_END 3712 3713 #undef __FUNCT__ 3714 #define __FUNCT__ "MatInodeGetInodeSizes" 3715 /*@C 3716 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 3717 3718 Collective on Mat 3719 3720 Input Parameter: 3721 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 3722 3723 Output Parameter: 3724 + node_count - no of inodes present in the matrix. 3725 . sizes - an array of size node_count,with sizes of each inode. 3726 - limit - the max size used to generate the inodes. 3727 3728 Level: advanced 3729 3730 Notes: This routine returns some internal storage information 3731 of the matrix, it is intended to be used by advanced users. 3732 It should be called after the matrix is assembled. 3733 The contents of the sizes[] array should not be changed. 3734 PETSC_NULL may be passed for information not requested. 3735 3736 .keywords: matrix, seqaij, get, inode 3737 3738 .seealso: MatGetInfo() 3739 @*/ 3740 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 3741 { 3742 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 3743 3744 PetscFunctionBegin; 3745 if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 3746 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr); 3747 if (f) { 3748 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 3749 } 3750 PetscFunctionReturn(0); 3751 } 3752 3753 EXTERN_C_BEGIN 3754 #undef __FUNCT__ 3755 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode" 3756 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 3757 { 3758 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3759 3760 PetscFunctionBegin; 3761 if (node_count) *node_count = a->inode.node_count; 3762 if (sizes) *sizes = a->inode.size; 3763 if (limit) *limit = a->inode.limit; 3764 PetscFunctionReturn(0); 3765 } 3766 EXTERN_C_END 3767