1 #define PETSCMAT_DLL 2 3 /* 4 This file provides high performance routines for the Inode format (compressed sparse row) 5 by taking advantage of rows with identical nonzero structure (I-nodes). 6 */ 7 #include "../src/mat/impls/aij/seq/aij.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "Mat_CreateColInode" 11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns) 12 { 13 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 16 17 PetscFunctionBegin; 18 n = A->cmap->n; 19 m = A->rmap->n; 20 ns_row = a->inode.size; 21 22 min_mn = (m < n) ? m : n; 23 if (!ns) { 24 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++); 25 for(; count+1 < n; count++,i++); 26 if (count < n) { 27 i++; 28 } 29 *size = i; 30 PetscFunctionReturn(0); 31 } 32 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr); 33 34 /* Use the same row structure wherever feasible. */ 35 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 36 ns_col[i] = ns_row[i]; 37 } 38 39 /* if m < n; pad up the remainder with inode_limit */ 40 for(; count+1 < n; count++,i++) { 41 ns_col[i] = 1; 42 } 43 /* The last node is the odd ball. padd it up with the remaining rows; */ 44 if (count < n) { 45 ns_col[i] = n - count; 46 i++; 47 } else if (count > n) { 48 /* Adjust for the over estimation */ 49 ns_col[i-1] += n - count; 50 } 51 *size = i; 52 *ns = ns_col; 53 PetscFunctionReturn(0); 54 } 55 56 57 /* 58 This builds symmetric version of nonzero structure, 59 */ 60 #undef __FUNCT__ 61 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric" 62 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 63 { 64 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 65 PetscErrorCode ierr; 66 PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n; 67 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j; 68 69 PetscFunctionBegin; 70 nslim_row = a->inode.node_count; 71 m = A->rmap->n; 72 n = A->cmap->n; 73 if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 74 75 /* Use the row_inode as column_inode */ 76 nslim_col = nslim_row; 77 ns_col = ns_row; 78 79 /* allocate space for reformated inode structure */ 80 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 81 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 82 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 83 84 for (i1=0,col=0; i1<nslim_col; ++i1){ 85 nsz = ns_col[i1]; 86 for (i2=0; i2<nsz; ++i2,++col) 87 tvc[col] = i1; 88 } 89 /* allocate space for row pointers */ 90 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 91 *iia = ia; 92 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 93 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 94 95 /* determine the number of columns in each row */ 96 ia[0] = oshift; 97 for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) { 98 99 j = aj + ai[row] + ishift; 100 jmax = aj + ai[row+1] + ishift; 101 i2 = 0; 102 col = *j++ + ishift; 103 i2 = tvc[col]; 104 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 105 ia[i1+1]++; 106 ia[i2+1]++; 107 i2++; /* Start col of next node */ 108 while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j; 109 i2 = tvc[col]; 110 } 111 if(i2 == i1) ia[i2+1]++; /* now the diagonal element */ 112 } 113 114 /* shift ia[i] to point to next row */ 115 for (i1=1; i1<nslim_row+1; i1++) { 116 row = ia[i1-1]; 117 ia[i1] += row; 118 work[i1-1] = row - oshift; 119 } 120 121 /* allocate space for column pointers */ 122 nz = ia[nslim_row] + (!ishift); 123 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 124 *jja = ja; 125 126 /* loop over lower triangular part putting into ja */ 127 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 128 j = aj + ai[row] + ishift; 129 jmax = aj + ai[row+1] + ishift; 130 i2 = 0; /* Col inode index */ 131 col = *j++ + ishift; 132 i2 = tvc[col]; 133 while (i2<i1 && j<jmax) { 134 ja[work[i2]++] = i1 + oshift; 135 ja[work[i1]++] = i2 + oshift; 136 ++i2; 137 while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */ 138 i2 = tvc[col]; 139 } 140 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 141 142 } 143 ierr = PetscFree(work);CHKERRQ(ierr); 144 ierr = PetscFree(tns);CHKERRQ(ierr); 145 ierr = PetscFree(tvc);CHKERRQ(ierr); 146 PetscFunctionReturn(0); 147 } 148 149 /* 150 This builds nonsymmetric version of nonzero structure, 151 */ 152 #undef __FUNCT__ 153 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric" 154 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 155 { 156 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 157 PetscErrorCode ierr; 158 PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col; 159 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 160 161 PetscFunctionBegin; 162 nslim_row = a->inode.node_count; 163 n = A->cmap->n; 164 165 /* Create The column_inode for this matrix */ 166 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 167 168 /* allocate space for reformated column_inode structure */ 169 ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 170 ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 171 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 172 173 for (i1=0,col=0; i1<nslim_col; ++i1){ 174 nsz = ns_col[i1]; 175 for (i2=0; i2<nsz; ++i2,++col) 176 tvc[col] = i1; 177 } 178 /* allocate space for row pointers */ 179 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 180 *iia = ia; 181 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 182 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 183 184 /* determine the number of columns in each row */ 185 ia[0] = oshift; 186 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 187 j = aj + ai[row] + ishift; 188 col = *j++ + ishift; 189 i2 = tvc[col]; 190 nz = ai[row+1] - ai[row]; 191 while (nz-- > 0) { /* off-diagonal elemets */ 192 ia[i1+1]++; 193 i2++; /* Start col of next node */ 194 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 195 if (nz > 0) i2 = tvc[col]; 196 } 197 } 198 199 /* shift ia[i] to point to next row */ 200 for (i1=1; i1<nslim_row+1; i1++) { 201 row = ia[i1-1]; 202 ia[i1] += row; 203 work[i1-1] = row - oshift; 204 } 205 206 /* allocate space for column pointers */ 207 nz = ia[nslim_row] + (!ishift); 208 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 209 *jja = ja; 210 211 /* loop over matrix putting into ja */ 212 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 213 j = aj + ai[row] + ishift; 214 i2 = 0; /* Col inode index */ 215 col = *j++ + ishift; 216 i2 = tvc[col]; 217 nz = ai[row+1] - ai[row]; 218 while (nz-- > 0) { 219 ja[work[i1]++] = i2 + oshift; 220 ++i2; 221 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 222 if (nz > 0) i2 = tvc[col]; 223 } 224 } 225 ierr = PetscFree(ns_col);CHKERRQ(ierr); 226 ierr = PetscFree(work);CHKERRQ(ierr); 227 ierr = PetscFree(tns);CHKERRQ(ierr); 228 ierr = PetscFree(tvc);CHKERRQ(ierr); 229 PetscFunctionReturn(0); 230 } 231 232 #undef __FUNCT__ 233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode" 234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 235 { 236 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 237 PetscErrorCode ierr; 238 239 PetscFunctionBegin; 240 *n = a->inode.node_count; 241 if (!ia) PetscFunctionReturn(0); 242 if (!blockcompressed) { 243 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 244 } else if (symmetric) { 245 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 246 } else { 247 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 248 } 249 PetscFunctionReturn(0); 250 } 251 252 #undef __FUNCT__ 253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode" 254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 255 { 256 PetscErrorCode ierr; 257 258 PetscFunctionBegin; 259 if (!ia) PetscFunctionReturn(0); 260 261 if (!blockcompressed) { 262 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 263 } else { 264 ierr = PetscFree(*ia);CHKERRQ(ierr); 265 ierr = PetscFree(*ja);CHKERRQ(ierr); 266 } 267 268 PetscFunctionReturn(0); 269 } 270 271 /* ----------------------------------------------------------- */ 272 273 #undef __FUNCT__ 274 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric" 275 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 276 { 277 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 278 PetscErrorCode ierr; 279 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 280 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 281 282 PetscFunctionBegin; 283 nslim_row = a->inode.node_count; 284 n = A->cmap->n; 285 286 /* Create The column_inode for this matrix */ 287 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 288 289 /* allocate space for reformated column_inode structure */ 290 ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 291 ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 292 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 293 294 for (i1=0,col=0; i1<nslim_col; ++i1){ 295 nsz = ns_col[i1]; 296 for (i2=0; i2<nsz; ++i2,++col) 297 tvc[col] = i1; 298 } 299 /* allocate space for column pointers */ 300 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 301 *iia = ia; 302 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 303 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 304 305 /* determine the number of columns in each row */ 306 ia[0] = oshift; 307 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 308 j = aj + ai[row] + ishift; 309 col = *j++ + ishift; 310 i2 = tvc[col]; 311 nz = ai[row+1] - ai[row]; 312 while (nz-- > 0) { /* off-diagonal elemets */ 313 /* ia[i1+1]++; */ 314 ia[i2+1]++; 315 i2++; 316 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 317 if (nz > 0) i2 = tvc[col]; 318 } 319 } 320 321 /* shift ia[i] to point to next col */ 322 for (i1=1; i1<nslim_col+1; i1++) { 323 col = ia[i1-1]; 324 ia[i1] += col; 325 work[i1-1] = col - oshift; 326 } 327 328 /* allocate space for column pointers */ 329 nz = ia[nslim_col] + (!ishift); 330 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 331 *jja = ja; 332 333 /* loop over matrix putting into ja */ 334 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 335 j = aj + ai[row] + ishift; 336 i2 = 0; /* Col inode index */ 337 col = *j++ + ishift; 338 i2 = tvc[col]; 339 nz = ai[row+1] - ai[row]; 340 while (nz-- > 0) { 341 /* ja[work[i1]++] = i2 + oshift; */ 342 ja[work[i2]++] = i1 + oshift; 343 i2++; 344 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 345 if (nz > 0) i2 = tvc[col]; 346 } 347 } 348 ierr = PetscFree(ns_col);CHKERRQ(ierr); 349 ierr = PetscFree(work);CHKERRQ(ierr); 350 ierr = PetscFree(tns);CHKERRQ(ierr); 351 ierr = PetscFree(tvc);CHKERRQ(ierr); 352 PetscFunctionReturn(0); 353 } 354 355 #undef __FUNCT__ 356 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode" 357 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 358 { 359 PetscErrorCode ierr; 360 361 PetscFunctionBegin; 362 ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr); 363 if (!ia) PetscFunctionReturn(0); 364 365 if (!blockcompressed) { 366 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 367 } else if (symmetric) { 368 /* Since the indices are symmetric it does'nt matter */ 369 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 370 } else { 371 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 372 } 373 PetscFunctionReturn(0); 374 } 375 376 #undef __FUNCT__ 377 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode" 378 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 379 { 380 PetscErrorCode ierr; 381 382 PetscFunctionBegin; 383 if (!ia) PetscFunctionReturn(0); 384 if (!blockcompressed) { 385 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 386 } else { 387 ierr = PetscFree(*ia);CHKERRQ(ierr); 388 ierr = PetscFree(*ja);CHKERRQ(ierr); 389 } 390 PetscFunctionReturn(0); 391 } 392 393 /* ----------------------------------------------------------- */ 394 395 #undef __FUNCT__ 396 #define __FUNCT__ "MatMult_SeqAIJ_Inode" 397 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 398 { 399 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 400 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 401 PetscScalar *y; 402 const PetscScalar *x; 403 const MatScalar *v1,*v2,*v3,*v4,*v5; 404 PetscErrorCode ierr; 405 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0; 406 407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 409 #endif 410 411 PetscFunctionBegin; 412 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 413 node_max = a->inode.node_count; 414 ns = a->inode.size; /* Node Size array */ 415 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 416 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 417 idx = a->j; 418 v1 = a->a; 419 ii = a->i; 420 421 for (i = 0,row = 0; i< node_max; ++i){ 422 nsz = ns[i]; 423 n = ii[1] - ii[0]; 424 nonzerorow += (n>0)*nsz; 425 ii += nsz; 426 PetscPrefetchBlock(idx+nsz*n,n,0,0); /* Prefetch the indices for the block row after the current one */ 427 PetscPrefetchBlock(v1+nsz*n,nsz*n,0,0); /* Prefetch the values for the block row after the current one */ 428 sz = n; /* No of non zeros in this row */ 429 /* Switch on the size of Node */ 430 switch (nsz){ /* Each loop in 'case' is unrolled */ 431 case 1 : 432 sum1 = 0.; 433 434 for(n = 0; n< sz-1; n+=2) { 435 i1 = idx[0]; /* The instructions are ordered to */ 436 i2 = idx[1]; /* make the compiler's job easy */ 437 idx += 2; 438 tmp0 = x[i1]; 439 tmp1 = x[i2]; 440 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 441 } 442 443 if (n == sz-1){ /* Take care of the last nonzero */ 444 tmp0 = x[*idx++]; 445 sum1 += *v1++ * tmp0; 446 } 447 y[row++]=sum1; 448 break; 449 case 2: 450 sum1 = 0.; 451 sum2 = 0.; 452 v2 = v1 + n; 453 454 for (n = 0; n< sz-1; n+=2) { 455 i1 = idx[0]; 456 i2 = idx[1]; 457 idx += 2; 458 tmp0 = x[i1]; 459 tmp1 = x[i2]; 460 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 461 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 462 } 463 if (n == sz-1){ 464 tmp0 = x[*idx++]; 465 sum1 += *v1++ * tmp0; 466 sum2 += *v2++ * tmp0; 467 } 468 y[row++]=sum1; 469 y[row++]=sum2; 470 v1 =v2; /* Since the next block to be processed starts there*/ 471 idx +=sz; 472 break; 473 case 3: 474 sum1 = 0.; 475 sum2 = 0.; 476 sum3 = 0.; 477 v2 = v1 + n; 478 v3 = v2 + n; 479 480 for (n = 0; n< sz-1; n+=2) { 481 i1 = idx[0]; 482 i2 = idx[1]; 483 idx += 2; 484 tmp0 = x[i1]; 485 tmp1 = x[i2]; 486 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 487 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 488 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 489 } 490 if (n == sz-1){ 491 tmp0 = x[*idx++]; 492 sum1 += *v1++ * tmp0; 493 sum2 += *v2++ * tmp0; 494 sum3 += *v3++ * tmp0; 495 } 496 y[row++]=sum1; 497 y[row++]=sum2; 498 y[row++]=sum3; 499 v1 =v3; /* Since the next block to be processed starts there*/ 500 idx +=2*sz; 501 break; 502 case 4: 503 sum1 = 0.; 504 sum2 = 0.; 505 sum3 = 0.; 506 sum4 = 0.; 507 v2 = v1 + n; 508 v3 = v2 + n; 509 v4 = v3 + n; 510 511 for (n = 0; n< sz-1; n+=2) { 512 i1 = idx[0]; 513 i2 = idx[1]; 514 idx += 2; 515 tmp0 = x[i1]; 516 tmp1 = x[i2]; 517 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 518 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 519 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 520 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 521 } 522 if (n == sz-1){ 523 tmp0 = x[*idx++]; 524 sum1 += *v1++ * tmp0; 525 sum2 += *v2++ * tmp0; 526 sum3 += *v3++ * tmp0; 527 sum4 += *v4++ * tmp0; 528 } 529 y[row++]=sum1; 530 y[row++]=sum2; 531 y[row++]=sum3; 532 y[row++]=sum4; 533 v1 =v4; /* Since the next block to be processed starts there*/ 534 idx +=3*sz; 535 break; 536 case 5: 537 sum1 = 0.; 538 sum2 = 0.; 539 sum3 = 0.; 540 sum4 = 0.; 541 sum5 = 0.; 542 v2 = v1 + n; 543 v3 = v2 + n; 544 v4 = v3 + n; 545 v5 = v4 + n; 546 547 for (n = 0; n<sz-1; n+=2) { 548 i1 = idx[0]; 549 i2 = idx[1]; 550 idx += 2; 551 tmp0 = x[i1]; 552 tmp1 = x[i2]; 553 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 554 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 555 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 556 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 557 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 558 } 559 if (n == sz-1){ 560 tmp0 = x[*idx++]; 561 sum1 += *v1++ * tmp0; 562 sum2 += *v2++ * tmp0; 563 sum3 += *v3++ * tmp0; 564 sum4 += *v4++ * tmp0; 565 sum5 += *v5++ * tmp0; 566 } 567 y[row++]=sum1; 568 y[row++]=sum2; 569 y[row++]=sum3; 570 y[row++]=sum4; 571 y[row++]=sum5; 572 v1 =v5; /* Since the next block to be processed starts there */ 573 idx +=4*sz; 574 break; 575 default : 576 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 577 } 578 } 579 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 580 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 581 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 582 PetscFunctionReturn(0); 583 } 584 /* ----------------------------------------------------------- */ 585 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 586 #undef __FUNCT__ 587 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode" 588 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 589 { 590 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 591 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 592 MatScalar *v1,*v2,*v3,*v4,*v5; 593 PetscScalar *x,*y,*z,*zt; 594 PetscErrorCode ierr; 595 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; 596 597 PetscFunctionBegin; 598 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 599 node_max = a->inode.node_count; 600 ns = a->inode.size; /* Node Size array */ 601 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 602 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 603 if (zz != yy) { 604 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 605 } else { 606 z = y; 607 } 608 zt = z; 609 610 idx = a->j; 611 v1 = a->a; 612 ii = a->i; 613 614 for (i = 0,row = 0; i< node_max; ++i){ 615 nsz = ns[i]; 616 n = ii[1] - ii[0]; 617 ii += nsz; 618 sz = n; /* No of non zeros in this row */ 619 /* Switch on the size of Node */ 620 switch (nsz){ /* Each loop in 'case' is unrolled */ 621 case 1 : 622 sum1 = *zt++; 623 624 for(n = 0; n< sz-1; n+=2) { 625 i1 = idx[0]; /* The instructions are ordered to */ 626 i2 = idx[1]; /* make the compiler's job easy */ 627 idx += 2; 628 tmp0 = x[i1]; 629 tmp1 = x[i2]; 630 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 631 } 632 633 if(n == sz-1){ /* Take care of the last nonzero */ 634 tmp0 = x[*idx++]; 635 sum1 += *v1++ * tmp0; 636 } 637 y[row++]=sum1; 638 break; 639 case 2: 640 sum1 = *zt++; 641 sum2 = *zt++; 642 v2 = v1 + n; 643 644 for(n = 0; n< sz-1; n+=2) { 645 i1 = idx[0]; 646 i2 = idx[1]; 647 idx += 2; 648 tmp0 = x[i1]; 649 tmp1 = x[i2]; 650 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 651 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 652 } 653 if(n == sz-1){ 654 tmp0 = x[*idx++]; 655 sum1 += *v1++ * tmp0; 656 sum2 += *v2++ * tmp0; 657 } 658 y[row++]=sum1; 659 y[row++]=sum2; 660 v1 =v2; /* Since the next block to be processed starts there*/ 661 idx +=sz; 662 break; 663 case 3: 664 sum1 = *zt++; 665 sum2 = *zt++; 666 sum3 = *zt++; 667 v2 = v1 + n; 668 v3 = v2 + n; 669 670 for (n = 0; n< sz-1; n+=2) { 671 i1 = idx[0]; 672 i2 = idx[1]; 673 idx += 2; 674 tmp0 = x[i1]; 675 tmp1 = x[i2]; 676 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 677 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 678 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 679 } 680 if (n == sz-1){ 681 tmp0 = x[*idx++]; 682 sum1 += *v1++ * tmp0; 683 sum2 += *v2++ * tmp0; 684 sum3 += *v3++ * tmp0; 685 } 686 y[row++]=sum1; 687 y[row++]=sum2; 688 y[row++]=sum3; 689 v1 =v3; /* Since the next block to be processed starts there*/ 690 idx +=2*sz; 691 break; 692 case 4: 693 sum1 = *zt++; 694 sum2 = *zt++; 695 sum3 = *zt++; 696 sum4 = *zt++; 697 v2 = v1 + n; 698 v3 = v2 + n; 699 v4 = v3 + n; 700 701 for (n = 0; n< sz-1; n+=2) { 702 i1 = idx[0]; 703 i2 = idx[1]; 704 idx += 2; 705 tmp0 = x[i1]; 706 tmp1 = x[i2]; 707 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 708 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 709 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 710 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 711 } 712 if (n == sz-1){ 713 tmp0 = x[*idx++]; 714 sum1 += *v1++ * tmp0; 715 sum2 += *v2++ * tmp0; 716 sum3 += *v3++ * tmp0; 717 sum4 += *v4++ * tmp0; 718 } 719 y[row++]=sum1; 720 y[row++]=sum2; 721 y[row++]=sum3; 722 y[row++]=sum4; 723 v1 =v4; /* Since the next block to be processed starts there*/ 724 idx +=3*sz; 725 break; 726 case 5: 727 sum1 = *zt++; 728 sum2 = *zt++; 729 sum3 = *zt++; 730 sum4 = *zt++; 731 sum5 = *zt++; 732 v2 = v1 + n; 733 v3 = v2 + n; 734 v4 = v3 + n; 735 v5 = v4 + n; 736 737 for (n = 0; n<sz-1; n+=2) { 738 i1 = idx[0]; 739 i2 = idx[1]; 740 idx += 2; 741 tmp0 = x[i1]; 742 tmp1 = x[i2]; 743 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 744 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 745 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 746 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 747 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 748 } 749 if(n == sz-1){ 750 tmp0 = x[*idx++]; 751 sum1 += *v1++ * tmp0; 752 sum2 += *v2++ * tmp0; 753 sum3 += *v3++ * tmp0; 754 sum4 += *v4++ * tmp0; 755 sum5 += *v5++ * tmp0; 756 } 757 y[row++]=sum1; 758 y[row++]=sum2; 759 y[row++]=sum3; 760 y[row++]=sum4; 761 y[row++]=sum5; 762 v1 =v5; /* Since the next block to be processed starts there */ 763 idx +=4*sz; 764 break; 765 default : 766 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 767 } 768 } 769 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 770 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 771 if (zz != yy) { 772 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 773 } 774 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 775 PetscFunctionReturn(0); 776 } 777 778 /* ----------------------------------------------------------- */ 779 #undef __FUNCT__ 780 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace" 781 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx) 782 { 783 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 784 IS iscol = a->col,isrow = a->row; 785 PetscErrorCode ierr; 786 const PetscInt *r,*c,*rout,*cout; 787 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 788 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 789 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 790 PetscScalar sum1,sum2,sum3,sum4,sum5; 791 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 792 const PetscScalar *b; 793 794 PetscFunctionBegin; 795 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 796 node_max = a->inode.node_count; 797 ns = a->inode.size; /* Node Size array */ 798 799 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 800 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 801 tmp = a->solve_work; 802 803 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 804 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 805 806 /* forward solve the lower triangular */ 807 tmps = tmp ; 808 aa = a_a ; 809 aj = a_j ; 810 ad = a->diag; 811 812 for (i = 0,row = 0; i< node_max; ++i){ 813 nsz = ns[i]; 814 aii = ai[row]; 815 v1 = aa + aii; 816 vi = aj + aii; 817 nz = ad[row]- aii; 818 if (i < node_max-1) { 819 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 820 * but our indexing to determine it's size could. */ 821 PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,0); /* indices */ 822 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 823 PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,0); 824 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 825 } 826 827 switch (nsz){ /* Each loop in 'case' is unrolled */ 828 case 1 : 829 sum1 = b[*r++]; 830 for(j=0; j<nz-1; j+=2){ 831 i0 = vi[0]; 832 i1 = vi[1]; 833 vi +=2; 834 tmp0 = tmps[i0]; 835 tmp1 = tmps[i1]; 836 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 837 } 838 if(j == nz-1){ 839 tmp0 = tmps[*vi++]; 840 sum1 -= *v1++ *tmp0; 841 } 842 tmp[row ++]=sum1; 843 break; 844 case 2: 845 sum1 = b[*r++]; 846 sum2 = b[*r++]; 847 v2 = aa + ai[row+1]; 848 849 for(j=0; j<nz-1; j+=2){ 850 i0 = vi[0]; 851 i1 = vi[1]; 852 vi +=2; 853 tmp0 = tmps[i0]; 854 tmp1 = tmps[i1]; 855 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 856 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 857 } 858 if(j == nz-1){ 859 tmp0 = tmps[*vi++]; 860 sum1 -= *v1++ *tmp0; 861 sum2 -= *v2++ *tmp0; 862 } 863 sum2 -= *v2++ * sum1; 864 tmp[row ++]=sum1; 865 tmp[row ++]=sum2; 866 break; 867 case 3: 868 sum1 = b[*r++]; 869 sum2 = b[*r++]; 870 sum3 = b[*r++]; 871 v2 = aa + ai[row+1]; 872 v3 = aa + ai[row+2]; 873 874 for (j=0; j<nz-1; j+=2){ 875 i0 = vi[0]; 876 i1 = vi[1]; 877 vi +=2; 878 tmp0 = tmps[i0]; 879 tmp1 = tmps[i1]; 880 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 881 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 882 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 883 } 884 if (j == nz-1){ 885 tmp0 = tmps[*vi++]; 886 sum1 -= *v1++ *tmp0; 887 sum2 -= *v2++ *tmp0; 888 sum3 -= *v3++ *tmp0; 889 } 890 sum2 -= *v2++ * sum1; 891 sum3 -= *v3++ * sum1; 892 sum3 -= *v3++ * sum2; 893 tmp[row ++]=sum1; 894 tmp[row ++]=sum2; 895 tmp[row ++]=sum3; 896 break; 897 898 case 4: 899 sum1 = b[*r++]; 900 sum2 = b[*r++]; 901 sum3 = b[*r++]; 902 sum4 = b[*r++]; 903 v2 = aa + ai[row+1]; 904 v3 = aa + ai[row+2]; 905 v4 = aa + ai[row+3]; 906 907 for (j=0; j<nz-1; j+=2){ 908 i0 = vi[0]; 909 i1 = vi[1]; 910 vi +=2; 911 tmp0 = tmps[i0]; 912 tmp1 = tmps[i1]; 913 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 914 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 915 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 916 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 917 } 918 if (j == nz-1){ 919 tmp0 = tmps[*vi++]; 920 sum1 -= *v1++ *tmp0; 921 sum2 -= *v2++ *tmp0; 922 sum3 -= *v3++ *tmp0; 923 sum4 -= *v4++ *tmp0; 924 } 925 sum2 -= *v2++ * sum1; 926 sum3 -= *v3++ * sum1; 927 sum4 -= *v4++ * sum1; 928 sum3 -= *v3++ * sum2; 929 sum4 -= *v4++ * sum2; 930 sum4 -= *v4++ * sum3; 931 932 tmp[row ++]=sum1; 933 tmp[row ++]=sum2; 934 tmp[row ++]=sum3; 935 tmp[row ++]=sum4; 936 break; 937 case 5: 938 sum1 = b[*r++]; 939 sum2 = b[*r++]; 940 sum3 = b[*r++]; 941 sum4 = b[*r++]; 942 sum5 = b[*r++]; 943 v2 = aa + ai[row+1]; 944 v3 = aa + ai[row+2]; 945 v4 = aa + ai[row+3]; 946 v5 = aa + ai[row+4]; 947 948 for (j=0; j<nz-1; j+=2){ 949 i0 = vi[0]; 950 i1 = vi[1]; 951 vi +=2; 952 tmp0 = tmps[i0]; 953 tmp1 = tmps[i1]; 954 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 955 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 956 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 957 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 958 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 959 } 960 if (j == nz-1){ 961 tmp0 = tmps[*vi++]; 962 sum1 -= *v1++ *tmp0; 963 sum2 -= *v2++ *tmp0; 964 sum3 -= *v3++ *tmp0; 965 sum4 -= *v4++ *tmp0; 966 sum5 -= *v5++ *tmp0; 967 } 968 969 sum2 -= *v2++ * sum1; 970 sum3 -= *v3++ * sum1; 971 sum4 -= *v4++ * sum1; 972 sum5 -= *v5++ * sum1; 973 sum3 -= *v3++ * sum2; 974 sum4 -= *v4++ * sum2; 975 sum5 -= *v5++ * sum2; 976 sum4 -= *v4++ * sum3; 977 sum5 -= *v5++ * sum3; 978 sum5 -= *v5++ * sum4; 979 980 tmp[row ++]=sum1; 981 tmp[row ++]=sum2; 982 tmp[row ++]=sum3; 983 tmp[row ++]=sum4; 984 tmp[row ++]=sum5; 985 break; 986 default: 987 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 988 } 989 } 990 /* backward solve the upper triangular */ 991 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 992 nsz = ns[i]; 993 aii = ai[row+1] -1; 994 v1 = aa + aii; 995 vi = aj + aii; 996 nz = aii- ad[row]; 997 switch (nsz){ /* Each loop in 'case' is unrolled */ 998 case 1 : 999 sum1 = tmp[row]; 1000 1001 for(j=nz ; j>1; j-=2){ 1002 vi -=2; 1003 i0 = vi[2]; 1004 i1 = vi[1]; 1005 tmp0 = tmps[i0]; 1006 tmp1 = tmps[i1]; 1007 v1 -= 2; 1008 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1009 } 1010 if (j==1){ 1011 tmp0 = tmps[*vi--]; 1012 sum1 -= *v1-- * tmp0; 1013 } 1014 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1015 break; 1016 case 2 : 1017 sum1 = tmp[row]; 1018 sum2 = tmp[row -1]; 1019 v2 = aa + ai[row]-1; 1020 for (j=nz ; j>1; j-=2){ 1021 vi -=2; 1022 i0 = vi[2]; 1023 i1 = vi[1]; 1024 tmp0 = tmps[i0]; 1025 tmp1 = tmps[i1]; 1026 v1 -= 2; 1027 v2 -= 2; 1028 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1029 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1030 } 1031 if (j==1){ 1032 tmp0 = tmps[*vi--]; 1033 sum1 -= *v1-- * tmp0; 1034 sum2 -= *v2-- * tmp0; 1035 } 1036 1037 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1038 sum2 -= *v2-- * tmp0; 1039 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1040 break; 1041 case 3 : 1042 sum1 = tmp[row]; 1043 sum2 = tmp[row -1]; 1044 sum3 = tmp[row -2]; 1045 v2 = aa + ai[row]-1; 1046 v3 = aa + ai[row -1]-1; 1047 for (j=nz ; j>1; j-=2){ 1048 vi -=2; 1049 i0 = vi[2]; 1050 i1 = vi[1]; 1051 tmp0 = tmps[i0]; 1052 tmp1 = tmps[i1]; 1053 v1 -= 2; 1054 v2 -= 2; 1055 v3 -= 2; 1056 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1057 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1058 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1059 } 1060 if (j==1){ 1061 tmp0 = tmps[*vi--]; 1062 sum1 -= *v1-- * tmp0; 1063 sum2 -= *v2-- * tmp0; 1064 sum3 -= *v3-- * tmp0; 1065 } 1066 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1067 sum2 -= *v2-- * tmp0; 1068 sum3 -= *v3-- * tmp0; 1069 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1070 sum3 -= *v3-- * tmp0; 1071 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1072 1073 break; 1074 case 4 : 1075 sum1 = tmp[row]; 1076 sum2 = tmp[row -1]; 1077 sum3 = tmp[row -2]; 1078 sum4 = tmp[row -3]; 1079 v2 = aa + ai[row]-1; 1080 v3 = aa + ai[row -1]-1; 1081 v4 = aa + ai[row -2]-1; 1082 1083 for (j=nz ; j>1; j-=2){ 1084 vi -=2; 1085 i0 = vi[2]; 1086 i1 = vi[1]; 1087 tmp0 = tmps[i0]; 1088 tmp1 = tmps[i1]; 1089 v1 -= 2; 1090 v2 -= 2; 1091 v3 -= 2; 1092 v4 -= 2; 1093 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1094 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1095 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1096 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1097 } 1098 if (j==1){ 1099 tmp0 = tmps[*vi--]; 1100 sum1 -= *v1-- * tmp0; 1101 sum2 -= *v2-- * tmp0; 1102 sum3 -= *v3-- * tmp0; 1103 sum4 -= *v4-- * tmp0; 1104 } 1105 1106 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1107 sum2 -= *v2-- * tmp0; 1108 sum3 -= *v3-- * tmp0; 1109 sum4 -= *v4-- * tmp0; 1110 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1111 sum3 -= *v3-- * tmp0; 1112 sum4 -= *v4-- * tmp0; 1113 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1114 sum4 -= *v4-- * tmp0; 1115 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1116 break; 1117 case 5 : 1118 sum1 = tmp[row]; 1119 sum2 = tmp[row -1]; 1120 sum3 = tmp[row -2]; 1121 sum4 = tmp[row -3]; 1122 sum5 = tmp[row -4]; 1123 v2 = aa + ai[row]-1; 1124 v3 = aa + ai[row -1]-1; 1125 v4 = aa + ai[row -2]-1; 1126 v5 = aa + ai[row -3]-1; 1127 for (j=nz ; j>1; j-=2){ 1128 vi -= 2; 1129 i0 = vi[2]; 1130 i1 = vi[1]; 1131 tmp0 = tmps[i0]; 1132 tmp1 = tmps[i1]; 1133 v1 -= 2; 1134 v2 -= 2; 1135 v3 -= 2; 1136 v4 -= 2; 1137 v5 -= 2; 1138 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1139 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1140 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1141 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1142 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1143 } 1144 if (j==1){ 1145 tmp0 = tmps[*vi--]; 1146 sum1 -= *v1-- * tmp0; 1147 sum2 -= *v2-- * tmp0; 1148 sum3 -= *v3-- * tmp0; 1149 sum4 -= *v4-- * tmp0; 1150 sum5 -= *v5-- * tmp0; 1151 } 1152 1153 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1154 sum2 -= *v2-- * tmp0; 1155 sum3 -= *v3-- * tmp0; 1156 sum4 -= *v4-- * tmp0; 1157 sum5 -= *v5-- * tmp0; 1158 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1159 sum3 -= *v3-- * tmp0; 1160 sum4 -= *v4-- * tmp0; 1161 sum5 -= *v5-- * tmp0; 1162 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1163 sum4 -= *v4-- * tmp0; 1164 sum5 -= *v5-- * tmp0; 1165 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1166 sum5 -= *v5-- * tmp0; 1167 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1168 break; 1169 default: 1170 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1171 } 1172 } 1173 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1174 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1175 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1176 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1177 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1178 PetscFunctionReturn(0); 1179 } 1180 1181 #undef __FUNCT__ 1182 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode" 1183 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info) 1184 { 1185 Mat C=B; 1186 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ *)C->data; 1187 IS isrow = b->row,isicol = b->icol; 1188 PetscErrorCode ierr; 1189 const PetscInt *r,*ic,*ics; 1190 const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag; 1191 PetscInt i,j,k,nz,nzL,row,*pj; 1192 const PetscInt *ajtmp,*bjtmp; 1193 MatScalar *rtmp,*pc,*pc1,*pc2,multiplier,multiplier1,multiplier2,*pv,*rtmp11,*rtmp22,*rtmp33; 1194 const MatScalar *aa=a->a,*v,*v1,*v2; 1195 FactorShiftCtx sctx; 1196 PetscInt *ddiag; 1197 PetscReal rs; 1198 MatScalar d; 1199 PetscInt inod,nodesz,kk1,kk2; 1200 1201 1202 PetscFunctionBegin; 1203 printf("MatLUFactorNumeric_SeqAIJ_Inode...\n"); 1204 /* MatPivotSetUp(): initialize shift context sctx */ 1205 ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr); 1206 1207 if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1208 ddiag = a->diag; 1209 sctx.shift_top = info->zeropivot; 1210 for (i=0; i<n; i++) { 1211 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1212 d = (aa)[ddiag[i]]; 1213 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1214 v = aa+ai[i]; 1215 nz = ai[i+1] - ai[i]; 1216 for (j=0; j<nz; j++) 1217 rs += PetscAbsScalar(v[j]); 1218 if (rs>sctx.shift_top) sctx.shift_top = rs; 1219 } 1220 sctx.shift_top *= 1.1; 1221 sctx.nshift_max = 5; 1222 sctx.shift_lo = 0.; 1223 sctx.shift_hi = 1.; 1224 } 1225 1226 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1227 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1228 //ierr = PetscMalloc((n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 1229 ierr = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr); 1230 ierr = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1231 rtmp = rtmp11; 1232 rtmp22 = rtmp11 + n; 1233 rtmp33 = rtmp22 + n; 1234 ics = ic; 1235 1236 PetscInt node_max,*ns; 1237 node_max = a->inode.node_count; 1238 ns = a->inode.size; 1239 if (!ns){ 1240 SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information"); 1241 } 1242 1243 do { 1244 sctx.useshift = PETSC_FALSE; 1245 /* Now loop over each block-row, and do the factorization */ 1246 for (inod=0,i=0; inod<node_max; inod++){ /* i: row index; inod: inode index */ 1247 //for (i=0; i<n; i++){ 1248 nodesz = ns[inod]; 1249 //printf("row %d, inod %d, nodesz %d\n",i,inod,nodesz); 1250 1251 switch (nodesz){ 1252 case 1: 1253 /* zero rtmp */ 1254 /* L part */ 1255 nz = bi[i+1] - bi[i]; 1256 bjtmp = bj + bi[i]; 1257 for (j=0; j<nz; j++) rtmp[bjtmp[j]] = 0.0; 1258 1259 /* U part */ 1260 nz = bdiag[i]-bdiag[i+1]; 1261 bjtmp = bj + bdiag[i+1]+1; 1262 for (j=0; j<nz; j++) rtmp[bjtmp[j]] = 0.0; 1263 1264 /* load in initial (unfactored row) */ 1265 nz = ai[r[i]+1] - ai[r[i]]; 1266 ajtmp = aj + ai[r[i]]; 1267 v = aa + ai[r[i]]; 1268 for (j=0; j<nz; j++) { 1269 rtmp[ics[ajtmp[j]]] = v[j]; 1270 } 1271 /* ZeropivotApply() */ 1272 rtmp[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1273 1274 /* elimination */ 1275 bjtmp = bj + bi[i]; 1276 row = *bjtmp++; 1277 nzL = bi[i+1] - bi[i]; 1278 for(k=0; k < nzL;k++) { 1279 pc = rtmp + row; 1280 if (*pc != 0.0) { 1281 pv = b->a + bdiag[row]; 1282 multiplier = *pc * (*pv); 1283 *pc = multiplier; 1284 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1285 pv = b->a + bdiag[row+1]+1; 1286 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1287 for (j=0; j<nz; j++) rtmp[pj[j]] -= multiplier * pv[j]; 1288 ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr); 1289 } 1290 row = *bjtmp++; 1291 } 1292 1293 /* finished row so stick it into b->a */ 1294 rs = 0.0; 1295 /* L part */ 1296 pv = b->a + bi[i] ; 1297 pj = b->j + bi[i] ; 1298 nz = bi[i+1] - bi[i]; 1299 for (j=0; j<nz; j++) { 1300 pv[j] = rtmp[pj[j]]; rs += PetscAbsScalar(pv[j]); 1301 } 1302 1303 /* U part */ 1304 pv = b->a + bdiag[i+1]+1; 1305 pj = b->j + bdiag[i+1]+1; 1306 nz = bdiag[i] - bdiag[i+1]-1; 1307 for (j=0; j<nz; j++) { 1308 pv[j] = rtmp[pj[j]]; rs += PetscAbsScalar(pv[j]); 1309 } 1310 1311 /* MatPivotCheck() */ 1312 sctx.rs = rs; 1313 sctx.pv = rtmp[i]; 1314 if (info->shifttype == MAT_SHIFT_NONZERO){ 1315 ierr = MatPivotCheck_nz(info,sctx,i);CHKERRQ(ierr); 1316 } else if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE){ 1317 ierr = MatPivotCheck_pd(info,sctx,i);CHKERRQ(ierr); 1318 } else if (info->shifttype == MAT_SHIFT_INBLOCKS){ 1319 ierr = MatPivotCheck_inblocks(info,sctx,i);CHKERRQ(ierr); 1320 } else { 1321 ierr = MatPivotCheck_none(info,sctx,i);CHKERRQ(ierr); 1322 } 1323 rtmp[i] = sctx.pv; 1324 1325 /* Mark diagonal and invert diagonal for simplier triangular solves */ 1326 pv = b->a + bdiag[i]; 1327 *pv = 1.0/rtmp[i]; 1328 1329 break; 1330 1331 case 2: 1332 /*----------*/ 1333 /* zero rtmp */ 1334 /* L part */ 1335 nz = bi[i+1] - bi[i]; 1336 bjtmp = bj + bi[i]; 1337 for (j=0; j<nz; j++) { 1338 rtmp11[bjtmp[j]] = 0.0; rtmp22[bjtmp[j]] = 0.0; 1339 } 1340 1341 /* U part */ 1342 nz = bdiag[i]-bdiag[i+1]; 1343 bjtmp = bj + bdiag[i+1]+1; 1344 for (j=0; j<nz; j++) { 1345 rtmp11[bjtmp[j]] = 0.0; rtmp22[bjtmp[j]] = 0.0; 1346 } 1347 1348 /* load in initial (unfactored row) */ 1349 nz = ai[r[i]+1] - ai[r[i]]; 1350 ajtmp = aj + ai[r[i]]; 1351 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; 1352 for (j=0; j<nz; j++) { 1353 rtmp11[ics[ajtmp[j]]] = v1[j]; rtmp22[ics[ajtmp[j]]] = v2[j]; 1354 } 1355 /* ZeropivotApply(): shift the diagonal of the matrix */ 1356 rtmp11[i] += sctx.shift_amount; rtmp22[i+1] += sctx.shift_amount; 1357 1358 /* elimination */ 1359 bjtmp = bj + bi[i]; 1360 row = *bjtmp++; /* pivot row */ 1361 nzL = bi[i+1] - bi[i]; 1362 for(k=0; k < nzL;k++) { 1363 1364 pc1 = rtmp11 + row; 1365 pc2 = rtmp22 + row; 1366 if (*pc1 != 0.0 || *pc2 != 0.0) { 1367 pv = b->a + bdiag[row]; 1368 multiplier1 = *pc1*(*pv); multiplier2 = *pc2*(*pv); 1369 *pc1 = multiplier1; *pc2 = multiplier2; 1370 1371 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1372 pv = b->a + bdiag[row+1]+1; 1373 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1374 for (j=0; j<nz; j++){ 1375 rtmp11[pj[j]] -= multiplier1 * pv[j]; 1376 rtmp22[pj[j]] -= multiplier2 * pv[j]; 1377 } 1378 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1379 } 1380 row = *bjtmp++; 1381 } 1382 1383 /* Now take care of diagonal 2x2 block. */ 1384 pc1 = rtmp11 + i; 1385 pc2 = rtmp22 + i; 1386 1387 if (*pc2 != 0.0){ 1388 multiplier = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 1389 *pc2 = multiplier; /* insert L entry */ 1390 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1391 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1392 for (j=0; j<nz; j++) { 1393 rtmp22[pj[j]] -= multiplier * rtmp11[pj[j]]; 1394 } 1395 ierr = PetscLogFlops(2*nz);CHKERRQ(ierr); 1396 } 1397 1398 1399 /* finished rows so stick it into b->a */ 1400 rs = 0.0; 1401 /* L part */ 1402 kk1 = kk2 =0; 1403 pc1 = b->a + bi[i]; pc2 = b->a + bi[i+1]; 1404 pj = b->j + bi[i] ; 1405 nz = bi[i+1] - bi[i]; 1406 for (j=0; j<nz; j++) { 1407 if (pj[j] < i) {pc1[kk1] = rtmp11[pj[j]]; kk1++; rs += PetscAbsScalar(pc1[j]);} 1408 if (pj[j] < i+1) pc2[kk2] = rtmp22[pj[j]]; kk2++; 1409 //printf("%d, rtmp11 %g, rtmp22 = %g\n",pj[j],rtmp11[pj[j]],rtmp22[pj[j]]); 1410 } 1411 1412 /* U part */ 1413 pc1 = b->a + bdiag[i+1]+1; pc2 = b->a + bdiag[i+2]+1; 1414 pj = b->j + bdiag[i+1]+1; 1415 nz = bdiag[i] - bdiag[i+1]; // inlcude diagonal 1416 1417 kk1 = kk2 =0; 1418 for (j=0; j<nz; j++) { 1419 if (pj[j]>i) { 1420 pc1[kk1] = rtmp11[pj[j]]; rs += PetscAbsScalar(pc1[j]); 1421 kk1++; 1422 } 1423 if (pj[j] > i+1){ 1424 pc2[kk2] = rtmp22[pj[j]]; kk2++; 1425 //printf("%d, rtmp11 %g, rtmp22 = %g\n",pj[j],rtmp11[pj[j]],rtmp22[pj[j]]); 1426 } 1427 } 1428 1429 #if defined(TMP) 1430 /* MatPivotCheck() */ 1431 sctx.rs = rs; 1432 sctx.pv = rtmp11[i]; 1433 if (info->shifttype == MAT_SHIFT_NONZERO){ 1434 ierr = MatPivotCheck_nz(info,sctx,i);CHKERRQ(ierr); 1435 } else if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE){ 1436 ierr = MatPivotCheck_pd(info,sctx,i);CHKERRQ(ierr); 1437 } else if (info->shifttype == MAT_SHIFT_INBLOCKS){ 1438 ierr = MatPivotCheck_inblocks(info,sctx,i);CHKERRQ(ierr); 1439 } else { 1440 ierr = MatPivotCheck_none(info,sctx,i);CHKERRQ(ierr); 1441 } 1442 rtmp11[i] = sctx.pv; 1443 #endif 1444 /* Mark diagonal and invert diagonal for simplier triangular solves */ 1445 pc1 = b->a + bdiag[i]; 1446 *pc1 = 1.0/rtmp11[i]; 1447 #if defined(TMP) 1448 sctx.rs = rs; 1449 sctx.pv = rtmp22[i+1]; 1450 if (info->shifttype == MAT_SHIFT_NONZERO){ 1451 ierr = MatPivotCheck_nz(info,sctx,i+1);CHKERRQ(ierr); 1452 } else if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE){ 1453 ierr = MatPivotCheck_pd(info,sctx,i+1);CHKERRQ(ierr); 1454 } else if (info->shifttype == MAT_SHIFT_INBLOCKS){ 1455 ierr = MatPivotCheck_inblocks(info,sctx,i+1);CHKERRQ(ierr); 1456 } else { 1457 ierr = MatPivotCheck_none(info,sctx,i+1);CHKERRQ(ierr); 1458 } 1459 rtmp22[i+1] = sctx.pv; 1460 #endif 1461 /* Mark diagonal and invert diagonal for simplier triangular solves */ 1462 pc2 = b->a + bdiag[i+1]; 1463 *pc2 = 1.0/rtmp22[i+1]; 1464 #if defined(MV) 1465 // L part 1466 printf("row %d: ",i+1); 1467 for (j=b->i[i+1]; j<b->i[i+1+1]; j++) { 1468 printf( "(%d, %g), ",b->j[j],b->a[j]); 1469 } 1470 /* diagonal */ 1471 j = b->diag[i+1]; 1472 printf( "(%d, %g), ",b->j[j],b->a[j]); 1473 // U 1474 for (j=b->diag[i+1+1]+1; j<b->diag[i+1]; j++) { 1475 printf( "(%d, %g), ",b->j[j],b->a[j]); 1476 } 1477 printf("\n"); 1478 #endif 1479 break; 1480 default: 1481 SETERRQ(PETSC_ERR_SUP,"Node size not yet supported \n"); 1482 } 1483 i += nodesz; /* Update the row */ 1484 1485 } /* endof for (i=0; i<n; i++){ */ 1486 1487 /* MatPivotRefine() */ 1488 if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE && !sctx.useshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max){ 1489 /* 1490 * if no shift in this attempt & shifting & started shifting & can refine, 1491 * then try lower shift 1492 */ 1493 sctx.shift_hi = sctx.shift_fraction; 1494 sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.; 1495 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1496 sctx.useshift = PETSC_TRUE; 1497 sctx.nshift++; 1498 } 1499 } while (sctx.useshift); 1500 1501 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1502 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1503 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1504 1505 C->ops->solve = MatSolve_SeqAIJ_Inode; 1506 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1507 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1508 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1509 C->ops->matsolve = MatMatSolve_SeqAIJ; 1510 C->assembled = PETSC_TRUE; 1511 C->preallocated = PETSC_TRUE; 1512 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 1513 1514 /* MatShiftView(A,info,&sctx) */ 1515 if (sctx.nshift){ 1516 if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) { 1517 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 1518 } else if (info->shifttype == MAT_SHIFT_NONZERO) { 1519 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 1520 } else if (info->shifttype == MAT_SHIFT_INBLOCKS){ 1521 ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr); 1522 } 1523 } 1524 PetscFunctionReturn(0); 1525 } 1526 1527 #undef __FUNCT__ 1528 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace" 1529 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info) 1530 { 1531 Mat C = B; 1532 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1533 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1534 PetscErrorCode ierr; 1535 const PetscInt *r,*ic,*c,*ics; 1536 PetscInt n = A->rmap->n,*bi = b->i; 1537 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1538 PetscInt i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz; 1539 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1540 PetscScalar mul1,mul2,mul3,tmp; 1541 MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; 1542 const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; 1543 PetscReal rs=0.0; 1544 FactorShiftCtx sctx; 1545 PetscInt newshift; 1546 1547 PetscFunctionBegin; 1548 sctx.shift_top = 0; 1549 sctx.nshift_max = 0; 1550 sctx.shift_lo = 0; 1551 sctx.shift_hi = 0; 1552 sctx.shift_fraction = 0; 1553 1554 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1555 if (info->shifttype==MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1556 sctx.shift_top = 0; 1557 for (i=0; i<n; i++) { 1558 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1559 rs = 0.0; 1560 ajtmp = aj + ai[i]; 1561 rtmp1 = aa + ai[i]; 1562 nz = ai[i+1] - ai[i]; 1563 for (j=0; j<nz; j++){ 1564 if (*ajtmp != i){ 1565 rs += PetscAbsScalar(*rtmp1++); 1566 } else { 1567 rs -= PetscRealPart(*rtmp1++); 1568 } 1569 ajtmp++; 1570 } 1571 if (rs>sctx.shift_top) sctx.shift_top = rs; 1572 } 1573 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1574 sctx.shift_top *= 1.1; 1575 sctx.nshift_max = 5; 1576 sctx.shift_lo = 0.; 1577 sctx.shift_hi = 1.; 1578 } 1579 sctx.shift_amount = 0; 1580 sctx.nshift = 0; 1581 1582 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1583 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1584 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1585 ierr = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr); 1586 ierr = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1587 ics = ic ; 1588 rtmp22 = rtmp11 + n; 1589 rtmp33 = rtmp22 + n; 1590 1591 node_max = a->inode.node_count; 1592 ns = a->inode.size; 1593 if (!ns){ 1594 SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information"); 1595 } 1596 1597 /* If max inode size > 3, split it into two inodes.*/ 1598 /* also map the inode sizes according to the ordering */ 1599 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1600 for (i=0,j=0; i<node_max; ++i,++j){ 1601 if (ns[i]>3) { 1602 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1603 ++j; 1604 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1605 } else { 1606 tmp_vec1[j] = ns[i]; 1607 } 1608 } 1609 /* Use the correct node_max */ 1610 node_max = j; 1611 1612 /* Now reorder the inode info based on mat re-ordering info */ 1613 /* First create a row -> inode_size_array_index map */ 1614 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1615 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1616 for (i=0,row=0; i<node_max; i++) { 1617 nodesz = tmp_vec1[i]; 1618 for (j=0; j<nodesz; j++,row++) { 1619 nsmap[row] = i; 1620 } 1621 } 1622 /* Using nsmap, create a reordered ns structure */ 1623 for (i=0,j=0; i< node_max; i++) { 1624 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1625 tmp_vec2[i] = nodesz; 1626 j += nodesz; 1627 } 1628 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1629 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1630 /* Now use the correct ns */ 1631 ns = tmp_vec2; 1632 1633 do { 1634 sctx.useshift = PETSC_FALSE; 1635 /* Now loop over each block-row, and do the factorization */ 1636 for (i=0,row=0; i<node_max; i++) { 1637 nodesz = ns[i]; 1638 nz = bi[row+1] - bi[row]; 1639 bjtmp = bj + bi[row]; 1640 1641 switch (nodesz){ 1642 case 1: 1643 for (j=0; j<nz; j++){ 1644 idx = bjtmp[j]; 1645 rtmp11[idx] = 0.0; 1646 } 1647 1648 /* load in initial (unfactored row) */ 1649 idx = r[row]; 1650 nz_tmp = ai[idx+1] - ai[idx]; 1651 ajtmp = aj + ai[idx]; 1652 v1 = aa + ai[idx]; 1653 1654 for (j=0; j<nz_tmp; j++) { 1655 idx = ics[ajtmp[j]]; 1656 rtmp11[idx] = v1[j]; 1657 } 1658 rtmp11[ics[r[row]]] += sctx.shift_amount; 1659 1660 prow = *bjtmp++ ; 1661 while (prow < row) { 1662 pc1 = rtmp11 + prow; 1663 if (*pc1 != 0.0){ 1664 pv = ba + bd[prow]; 1665 pj = nbj + bd[prow]; 1666 mul1 = *pc1 * *pv++; 1667 *pc1 = mul1; 1668 nz_tmp = bi[prow+1] - bd[prow] - 1; 1669 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 1670 for (j=0; j<nz_tmp; j++) { 1671 tmp = pv[j]; 1672 idx = pj[j]; 1673 rtmp11[idx] -= mul1 * tmp; 1674 } 1675 } 1676 prow = *bjtmp++ ; 1677 } 1678 pj = bj + bi[row]; 1679 pc1 = ba + bi[row]; 1680 1681 sctx.pv = rtmp11[row]; 1682 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 1683 rs = 0.0; 1684 for (j=0; j<nz; j++) { 1685 idx = pj[j]; 1686 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 1687 if (idx != row) rs += PetscAbsScalar(pc1[j]); 1688 } 1689 sctx.rs = rs; 1690 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1691 if (newshift == 1) goto endofwhile; 1692 break; 1693 1694 case 2: 1695 for (j=0; j<nz; j++) { 1696 idx = bjtmp[j]; 1697 rtmp11[idx] = 0.0; 1698 rtmp22[idx] = 0.0; 1699 } 1700 1701 /* load in initial (unfactored row) */ 1702 idx = r[row]; 1703 nz_tmp = ai[idx+1] - ai[idx]; 1704 ajtmp = aj + ai[idx]; 1705 v1 = aa + ai[idx]; 1706 v2 = aa + ai[idx+1]; 1707 for (j=0; j<nz_tmp; j++) { 1708 idx = ics[ajtmp[j]]; 1709 rtmp11[idx] = v1[j]; 1710 rtmp22[idx] = v2[j]; 1711 } 1712 rtmp11[ics[r[row]]] += sctx.shift_amount; 1713 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 1714 1715 prow = *bjtmp++ ; 1716 while (prow < row) { 1717 pc1 = rtmp11 + prow; 1718 pc2 = rtmp22 + prow; 1719 if (*pc1 != 0.0 || *pc2 != 0.0){ 1720 pv = ba + bd[prow]; 1721 pj = nbj + bd[prow]; 1722 mul1 = *pc1 * *pv; 1723 mul2 = *pc2 * *pv; 1724 ++pv; 1725 *pc1 = mul1; 1726 *pc2 = mul2; 1727 1728 nz_tmp = bi[prow+1] - bd[prow] - 1; 1729 for (j=0; j<nz_tmp; j++) { 1730 tmp = pv[j]; 1731 idx = pj[j]; 1732 rtmp11[idx] -= mul1 * tmp; 1733 rtmp22[idx] -= mul2 * tmp; 1734 } 1735 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 1736 } 1737 prow = *bjtmp++ ; 1738 } 1739 1740 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 1741 pc1 = rtmp11 + prow; 1742 pc2 = rtmp22 + prow; 1743 1744 sctx.pv = *pc1; 1745 pj = bj + bi[prow]; 1746 rs = 0.0; 1747 for (j=0; j<nz; j++){ 1748 idx = pj[j]; 1749 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 1750 } 1751 sctx.rs = rs; 1752 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1753 if (newshift == 1) goto endofwhile; 1754 1755 if (*pc2 != 0.0){ 1756 pj = nbj + bd[prow]; 1757 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 1758 *pc2 = mul2; 1759 nz_tmp = bi[prow+1] - bd[prow] - 1; 1760 for (j=0; j<nz_tmp; j++) { 1761 idx = pj[j] ; 1762 tmp = rtmp11[idx]; 1763 rtmp22[idx] -= mul2 * tmp; 1764 } 1765 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 1766 } 1767 1768 pj = bj + bi[row]; 1769 pc1 = ba + bi[row]; 1770 pc2 = ba + bi[row+1]; 1771 1772 sctx.pv = rtmp22[row+1]; 1773 rs = 0.0; 1774 rtmp11[row] = 1.0/rtmp11[row]; 1775 rtmp22[row+1] = 1.0/rtmp22[row+1]; 1776 /* copy row entries from dense representation to sparse */ 1777 for (j=0; j<nz; j++) { 1778 idx = pj[j]; 1779 pc1[j] = rtmp11[idx]; 1780 pc2[j] = rtmp22[idx]; 1781 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 1782 } 1783 sctx.rs = rs; 1784 ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr); 1785 if (newshift == 1) goto endofwhile; 1786 break; 1787 1788 case 3: 1789 for (j=0; j<nz; j++) { 1790 idx = bjtmp[j]; 1791 rtmp11[idx] = 0.0; 1792 rtmp22[idx] = 0.0; 1793 rtmp33[idx] = 0.0; 1794 } 1795 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 1796 idx = r[row]; 1797 nz_tmp = ai[idx+1] - ai[idx]; 1798 ajtmp = aj + ai[idx]; 1799 v1 = aa + ai[idx]; 1800 v2 = aa + ai[idx+1]; 1801 v3 = aa + ai[idx+2]; 1802 for (j=0; j<nz_tmp; j++) { 1803 idx = ics[ajtmp[j]]; 1804 rtmp11[idx] = v1[j]; 1805 rtmp22[idx] = v2[j]; 1806 rtmp33[idx] = v3[j]; 1807 } 1808 rtmp11[ics[r[row]]] += sctx.shift_amount; 1809 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 1810 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 1811 1812 /* loop over all pivot row blocks above this row block */ 1813 prow = *bjtmp++ ; 1814 while (prow < row) { 1815 pc1 = rtmp11 + prow; 1816 pc2 = rtmp22 + prow; 1817 pc3 = rtmp33 + prow; 1818 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){ 1819 pv = ba + bd[prow]; 1820 pj = nbj + bd[prow]; 1821 mul1 = *pc1 * *pv; 1822 mul2 = *pc2 * *pv; 1823 mul3 = *pc3 * *pv; 1824 ++pv; 1825 *pc1 = mul1; 1826 *pc2 = mul2; 1827 *pc3 = mul3; 1828 1829 nz_tmp = bi[prow+1] - bd[prow] - 1; 1830 /* update this row based on pivot row */ 1831 for (j=0; j<nz_tmp; j++) { 1832 tmp = pv[j]; 1833 idx = pj[j]; 1834 rtmp11[idx] -= mul1 * tmp; 1835 rtmp22[idx] -= mul2 * tmp; 1836 rtmp33[idx] -= mul3 * tmp; 1837 } 1838 ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr); 1839 } 1840 prow = *bjtmp++ ; 1841 } 1842 1843 /* Now take care of diagonal 3x3 block in this set of rows */ 1844 /* note: prow = row here */ 1845 pc1 = rtmp11 + prow; 1846 pc2 = rtmp22 + prow; 1847 pc3 = rtmp33 + prow; 1848 1849 sctx.pv = *pc1; 1850 pj = bj + bi[prow]; 1851 rs = 0.0; 1852 for (j=0; j<nz; j++){ 1853 idx = pj[j]; 1854 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 1855 } 1856 sctx.rs = rs; 1857 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1858 if (newshift == 1) goto endofwhile; 1859 1860 if (*pc2 != 0.0 || *pc3 != 0.0){ 1861 mul2 = (*pc2)/(*pc1); 1862 mul3 = (*pc3)/(*pc1); 1863 *pc2 = mul2; 1864 *pc3 = mul3; 1865 nz_tmp = bi[prow+1] - bd[prow] - 1; 1866 pj = nbj + bd[prow]; 1867 for (j=0; j<nz_tmp; j++) { 1868 idx = pj[j] ; 1869 tmp = rtmp11[idx]; 1870 rtmp22[idx] -= mul2 * tmp; 1871 rtmp33[idx] -= mul3 * tmp; 1872 } 1873 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 1874 } 1875 ++prow; 1876 1877 pc2 = rtmp22 + prow; 1878 pc3 = rtmp33 + prow; 1879 sctx.pv = *pc2; 1880 pj = bj + bi[prow]; 1881 rs = 0.0; 1882 for (j=0; j<nz; j++){ 1883 idx = pj[j]; 1884 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 1885 } 1886 sctx.rs = rs; 1887 ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr); 1888 if (newshift == 1) goto endofwhile; 1889 1890 if (*pc3 != 0.0){ 1891 mul3 = (*pc3)/(*pc2); 1892 *pc3 = mul3; 1893 pj = nbj + bd[prow]; 1894 nz_tmp = bi[prow+1] - bd[prow] - 1; 1895 for (j=0; j<nz_tmp; j++) { 1896 idx = pj[j] ; 1897 tmp = rtmp22[idx]; 1898 rtmp33[idx] -= mul3 * tmp; 1899 } 1900 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 1901 } 1902 1903 pj = bj + bi[row]; 1904 pc1 = ba + bi[row]; 1905 pc2 = ba + bi[row+1]; 1906 pc3 = ba + bi[row+2]; 1907 1908 sctx.pv = rtmp33[row+2]; 1909 rs = 0.0; 1910 rtmp11[row] = 1.0/rtmp11[row]; 1911 rtmp22[row+1] = 1.0/rtmp22[row+1]; 1912 rtmp33[row+2] = 1.0/rtmp33[row+2]; 1913 /* copy row entries from dense representation to sparse */ 1914 for (j=0; j<nz; j++) { 1915 idx = pj[j]; 1916 pc1[j] = rtmp11[idx]; 1917 pc2[j] = rtmp22[idx]; 1918 pc3[j] = rtmp33[idx]; 1919 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 1920 } 1921 1922 sctx.rs = rs; 1923 ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr); 1924 if (newshift == 1) goto endofwhile; 1925 break; 1926 1927 default: 1928 SETERRQ(PETSC_ERR_SUP,"Node size not yet supported \n"); 1929 } 1930 row += nodesz; /* Update the row */ 1931 } 1932 endofwhile:; 1933 } while (sctx.useshift); 1934 ierr = PetscFree(rtmp11);CHKERRQ(ierr); 1935 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1936 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1937 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1938 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 1939 (B)->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 1940 /* do not set solve add, since MatSolve_Inode + Add is faster */ 1941 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 1942 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 1943 C->assembled = PETSC_TRUE; 1944 C->preallocated = PETSC_TRUE; 1945 if (sctx.nshift) { 1946 if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) { 1947 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 1948 } else if (info->shifttype == MAT_SHIFT_NONZERO) { 1949 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 1950 } 1951 } 1952 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 1953 PetscFunctionReturn(0); 1954 } 1955 1956 1957 /* ----------------------------------------------------------- */ 1958 #undef __FUNCT__ 1959 #define __FUNCT__ "MatSolve_SeqAIJ_Inode" 1960 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 1961 { 1962 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1963 IS iscol = a->col,isrow = a->row; 1964 PetscErrorCode ierr; 1965 const PetscInt *r,*c,*rout,*cout; 1966 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 1967 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 1968 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 1969 PetscScalar sum1,sum2,sum3,sum4,sum5; 1970 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 1971 const PetscScalar *b; 1972 1973 PetscFunctionBegin; 1974 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 1975 node_max = a->inode.node_count; 1976 ns = a->inode.size; /* Node Size array */ 1977 1978 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1979 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1980 tmp = a->solve_work; 1981 1982 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1983 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1984 1985 /* forward solve the lower triangular */ 1986 tmps = tmp ; 1987 aa = a_a ; 1988 aj = a_j ; 1989 ad = a->diag; 1990 1991 for (i = 0,row = 0; i< node_max; ++i){ 1992 nsz = ns[i]; 1993 aii = ai[row]; 1994 v1 = aa + aii; 1995 vi = aj + aii; 1996 nz = ai[row+1]- ai[row]; 1997 1998 if (i < node_max-1) { 1999 /* Prefetch the indices for the next block */ 2000 PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,0); /* indices */ 2001 /* Prefetch the data for the next block */ 2002 PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,0); 2003 } 2004 2005 switch (nsz){ /* Each loop in 'case' is unrolled */ 2006 case 1 : 2007 sum1 = b[r[row]]; 2008 for(j=0; j<nz-1; j+=2){ 2009 i0 = vi[j]; 2010 i1 = vi[j+1]; 2011 tmp0 = tmps[i0]; 2012 tmp1 = tmps[i1]; 2013 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1; 2014 } 2015 if(j == nz-1){ 2016 tmp0 = tmps[vi[j]]; 2017 sum1 -= v1[j]*tmp0; 2018 } 2019 tmp[row++]=sum1; 2020 break; 2021 case 2: 2022 sum1 = b[r[row]]; 2023 sum2 = b[r[row+1]]; 2024 v2 = aa + ai[row+1]; 2025 2026 for(j=0; j<nz-1; j+=2){ 2027 i0 = vi[j]; 2028 i1 = vi[j+1]; 2029 tmp0 = tmps[i0]; 2030 tmp1 = tmps[i1]; 2031 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2032 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2033 } 2034 if(j == nz-1){ 2035 tmp0 = tmps[vi[j]]; 2036 sum1 -= v1[j] *tmp0; 2037 sum2 -= v2[j] *tmp0; 2038 } 2039 sum2 -= v2[nz] * sum1; 2040 tmp[row ++]=sum1; 2041 tmp[row ++]=sum2; 2042 break; 2043 case 3: 2044 sum1 = b[r[row]]; 2045 sum2 = b[r[row+1]]; 2046 sum3 = b[r[row+2]]; 2047 v2 = aa + ai[row+1]; 2048 v3 = aa + ai[row+2]; 2049 2050 for (j=0; j<nz-1; j+=2){ 2051 i0 = vi[j]; 2052 i1 = vi[j+1]; 2053 tmp0 = tmps[i0]; 2054 tmp1 = tmps[i1]; 2055 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2056 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2057 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2058 } 2059 if (j == nz-1){ 2060 tmp0 = tmps[vi[j]]; 2061 sum1 -= v1[j] *tmp0; 2062 sum2 -= v2[j] *tmp0; 2063 sum3 -= v3[j] *tmp0; 2064 } 2065 sum2 -= v2[nz] * sum1; 2066 sum3 -= v3[nz] * sum1; 2067 sum3 -= v3[nz+1] * sum2; 2068 tmp[row ++]=sum1; 2069 tmp[row ++]=sum2; 2070 tmp[row ++]=sum3; 2071 break; 2072 2073 case 4: 2074 sum1 = b[r[row]]; 2075 sum2 = b[r[row+1]]; 2076 sum3 = b[r[row+2]]; 2077 sum4 = b[r[row+3]]; 2078 v2 = aa + ai[row+1]; 2079 v3 = aa + ai[row+2]; 2080 v4 = aa + ai[row+3]; 2081 2082 for (j=0; j<nz-1; j+=2){ 2083 i0 = vi[j]; 2084 i1 = vi[j+1]; 2085 tmp0 = tmps[i0]; 2086 tmp1 = tmps[i1]; 2087 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2088 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2089 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2090 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2091 } 2092 if (j == nz-1){ 2093 tmp0 = tmps[vi[j]]; 2094 sum1 -= v1[j] *tmp0; 2095 sum2 -= v2[j] *tmp0; 2096 sum3 -= v3[j] *tmp0; 2097 sum4 -= v4[j] *tmp0; 2098 } 2099 sum2 -= v2[nz] * sum1; 2100 sum3 -= v3[nz] * sum1; 2101 sum4 -= v4[nz] * sum1; 2102 sum3 -= v3[nz+1] * sum2; 2103 sum4 -= v4[nz+1] * sum2; 2104 sum4 -= v4[nz+2] * sum3; 2105 2106 tmp[row ++]=sum1; 2107 tmp[row ++]=sum2; 2108 tmp[row ++]=sum3; 2109 tmp[row ++]=sum4; 2110 break; 2111 case 5: 2112 sum1 = b[r[row]]; 2113 sum2 = b[r[row+1]]; 2114 sum3 = b[r[row+2]]; 2115 sum4 = b[r[row+3]]; 2116 sum5 = b[r[row+4]]; 2117 v2 = aa + ai[row+1]; 2118 v3 = aa + ai[row+2]; 2119 v4 = aa + ai[row+3]; 2120 v5 = aa + ai[row+4]; 2121 2122 for (j=0; j<nz-1; j+=2){ 2123 i0 = vi[j]; 2124 i1 = vi[j+1]; 2125 tmp0 = tmps[i0]; 2126 tmp1 = tmps[i1]; 2127 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2128 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2129 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2130 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2131 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1; 2132 } 2133 if (j == nz-1){ 2134 tmp0 = tmps[vi[j]]; 2135 sum1 -= v1[j] *tmp0; 2136 sum2 -= v2[j] *tmp0; 2137 sum3 -= v3[j] *tmp0; 2138 sum4 -= v4[j] *tmp0; 2139 sum5 -= v5[j] *tmp0; 2140 } 2141 2142 sum2 -= v2[nz] * sum1; 2143 sum3 -= v3[nz] * sum1; 2144 sum4 -= v4[nz] * sum1; 2145 sum5 -= v5[nz] * sum1; 2146 sum3 -= v3[nz+1] * sum2; 2147 sum4 -= v4[nz+1] * sum2; 2148 sum5 -= v5[nz+1] * sum2; 2149 sum4 -= v4[nz+2] * sum3; 2150 sum5 -= v5[nz+2] * sum3; 2151 sum5 -= v5[nz+3] * sum4; 2152 2153 tmp[row ++]=sum1; 2154 tmp[row ++]=sum2; 2155 tmp[row ++]=sum3; 2156 tmp[row ++]=sum4; 2157 tmp[row ++]=sum5; 2158 break; 2159 default: 2160 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 2161 } 2162 } 2163 /* backward solve the upper triangular */ 2164 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 2165 nsz = ns[i]; 2166 aii = ad[row+1] + 1; 2167 v1 = aa + aii; 2168 vi = aj + aii; 2169 nz = ad[row]- ad[row+1] - 1; 2170 2171 if (i > 0) { 2172 /* Prefetch the indices for the next block */ 2173 PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,0); /* indices */ 2174 /* Prefetch the data for the next block */ 2175 PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,0); 2176 } 2177 2178 switch (nsz){ /* Each loop in 'case' is unrolled */ 2179 case 1 : 2180 sum1 = tmp[row]; 2181 2182 for(j=0 ; j<nz-1; j+=2){ 2183 i0 = vi[j]; 2184 i1 = vi[j+1]; 2185 tmp0 = tmps[i0]; 2186 tmp1 = tmps[i1]; 2187 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2188 } 2189 if (j == nz-1){ 2190 tmp0 = tmps[vi[j]]; 2191 sum1 -= v1[j]*tmp0; 2192 } 2193 x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2194 break; 2195 case 2 : 2196 sum1 = tmp[row]; 2197 sum2 = tmp[row-1]; 2198 v2 = aa + ad[row] + 1; 2199 for (j=0 ; j<nz-1; j+=2){ 2200 i0 = vi[j]; 2201 i1 = vi[j+1]; 2202 tmp0 = tmps[i0]; 2203 tmp1 = tmps[i1]; 2204 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2205 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2206 } 2207 if (j == nz-1){ 2208 tmp0 = tmps[vi[j]]; 2209 sum1 -= v1[j]* tmp0; 2210 sum2 -= v2[j+1]* tmp0; 2211 } 2212 2213 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2214 sum2 -= v2[0] * tmp0; 2215 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2216 break; 2217 case 3 : 2218 sum1 = tmp[row]; 2219 sum2 = tmp[row -1]; 2220 sum3 = tmp[row -2]; 2221 v2 = aa + ad[row] + 1; 2222 v3 = aa + ad[row -1] + 1; 2223 for (j=0 ; j<nz-1; j+=2){ 2224 i0 = vi[j]; 2225 i1 = vi[j+1]; 2226 tmp0 = tmps[i0]; 2227 tmp1 = tmps[i1]; 2228 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2229 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2230 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2231 } 2232 if (j== nz-1){ 2233 tmp0 = tmps[vi[j]]; 2234 sum1 -= v1[j] * tmp0; 2235 sum2 -= v2[j+1] * tmp0; 2236 sum3 -= v3[j+2] * tmp0; 2237 } 2238 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2239 sum2 -= v2[0]* tmp0; 2240 sum3 -= v3[1] * tmp0; 2241 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2242 sum3 -= v3[0]* tmp0; 2243 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2244 2245 break; 2246 case 4 : 2247 sum1 = tmp[row]; 2248 sum2 = tmp[row -1]; 2249 sum3 = tmp[row -2]; 2250 sum4 = tmp[row -3]; 2251 v2 = aa + ad[row]+1; 2252 v3 = aa + ad[row -1]+1; 2253 v4 = aa + ad[row -2]+1; 2254 2255 for (j=0 ; j<nz-1; j+=2){ 2256 i0 = vi[j]; 2257 i1 = vi[j+1]; 2258 tmp0 = tmps[i0]; 2259 tmp1 = tmps[i1]; 2260 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2261 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2262 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2263 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2264 } 2265 if (j== nz-1){ 2266 tmp0 = tmps[vi[j]]; 2267 sum1 -= v1[j] * tmp0; 2268 sum2 -= v2[j+1] * tmp0; 2269 sum3 -= v3[j+2] * tmp0; 2270 sum4 -= v4[j+3] * tmp0; 2271 } 2272 2273 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2274 sum2 -= v2[0] * tmp0; 2275 sum3 -= v3[1] * tmp0; 2276 sum4 -= v4[2] * tmp0; 2277 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2278 sum3 -= v3[0] * tmp0; 2279 sum4 -= v4[1] * tmp0; 2280 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2281 sum4 -= v4[0] * tmp0; 2282 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2283 break; 2284 case 5 : 2285 sum1 = tmp[row]; 2286 sum2 = tmp[row -1]; 2287 sum3 = tmp[row -2]; 2288 sum4 = tmp[row -3]; 2289 sum5 = tmp[row -4]; 2290 v2 = aa + ad[row]+1; 2291 v3 = aa + ad[row -1]+1; 2292 v4 = aa + ad[row -2]+1; 2293 v5 = aa + ad[row -3]+1; 2294 for (j=0 ; j<nz-1; j+=2){ 2295 i0 = vi[j]; 2296 i1 = vi[j+1]; 2297 tmp0 = tmps[i0]; 2298 tmp1 = tmps[i1]; 2299 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2300 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2301 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2302 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2303 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1; 2304 } 2305 if (j==nz-1){ 2306 tmp0 = tmps[vi[j]]; 2307 sum1 -= v1[j] * tmp0; 2308 sum2 -= v2[j+1] * tmp0; 2309 sum3 -= v3[j+2] * tmp0; 2310 sum4 -= v4[j+3] * tmp0; 2311 sum5 -= v5[j+4] * tmp0; 2312 } 2313 2314 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2315 sum2 -= v2[0] * tmp0; 2316 sum3 -= v3[1] * tmp0; 2317 sum4 -= v4[2] * tmp0; 2318 sum5 -= v5[3] * tmp0; 2319 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2320 sum3 -= v3[0] * tmp0; 2321 sum4 -= v4[1] * tmp0; 2322 sum5 -= v5[2] * tmp0; 2323 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2324 sum4 -= v4[0] * tmp0; 2325 sum5 -= v5[1] * tmp0; 2326 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2327 sum5 -= v5[0] * tmp0; 2328 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--; 2329 break; 2330 default: 2331 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 2332 } 2333 } 2334 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2335 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2336 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2337 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2338 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 2339 PetscFunctionReturn(0); 2340 } 2341 2342 2343 /* 2344 Makes a longer coloring[] array and calls the usual code with that 2345 */ 2346 #undef __FUNCT__ 2347 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode" 2348 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 2349 { 2350 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 2351 PetscErrorCode ierr; 2352 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 2353 PetscInt *colorused,i; 2354 ISColoringValue *newcolor; 2355 2356 PetscFunctionBegin; 2357 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr); 2358 /* loop over inodes, marking a color for each column*/ 2359 row = 0; 2360 for (i=0; i<m; i++){ 2361 for (j=0; j<ns[i]; j++) { 2362 newcolor[row++] = coloring[i] + j*ncolors; 2363 } 2364 } 2365 2366 /* eliminate unneeded colors */ 2367 ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr); 2368 ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr); 2369 for (i=0; i<n; i++) { 2370 colorused[newcolor[i]] = 1; 2371 } 2372 2373 for (i=1; i<5*ncolors; i++) { 2374 colorused[i] += colorused[i-1]; 2375 } 2376 ncolors = colorused[5*ncolors-1]; 2377 for (i=0; i<n; i++) { 2378 newcolor[i] = colorused[newcolor[i]]-1; 2379 } 2380 ierr = PetscFree(colorused);CHKERRQ(ierr); 2381 ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); 2382 ierr = PetscFree(coloring);CHKERRQ(ierr); 2383 PetscFunctionReturn(0); 2384 } 2385 2386 #include "../src/mat/blockinvert.h" 2387 2388 #undef __FUNCT__ 2389 #define __FUNCT__ "MatSOR_SeqAIJ_Inode" 2390 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 2391 { 2392 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2393 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3; 2394 MatScalar *ibdiag,*bdiag,work[25]; 2395 PetscScalar *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5; 2396 const MatScalar *v = a->a,*v1,*v2,*v3,*v4,*v5; 2397 PetscReal zeropivot = 1.0e-15, shift = 0.0; 2398 PetscErrorCode ierr; 2399 PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2; 2400 PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5]; 2401 2402 PetscFunctionBegin; 2403 if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 2404 if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 2405 if (its > 1) { 2406 /* switch to non-inode version */ 2407 ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr); 2408 PetscFunctionReturn(0); 2409 } 2410 2411 if (!a->inode.ibdiagvalid) { 2412 if (!a->inode.ibdiag) { 2413 /* calculate space needed for diagonal blocks */ 2414 for (i=0; i<m; i++) { 2415 cnt += sizes[i]*sizes[i]; 2416 } 2417 a->inode.bdiagsize = cnt; 2418 ierr = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr); 2419 } 2420 2421 /* copy over the diagonal blocks and invert them */ 2422 ibdiag = a->inode.ibdiag; 2423 bdiag = a->inode.bdiag; 2424 cnt = 0; 2425 for (i=0, row = 0; i<m; i++) { 2426 for (j=0; j<sizes[i]; j++) { 2427 for (k=0; k<sizes[i]; k++) { 2428 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 2429 } 2430 } 2431 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 2432 2433 switch(sizes[i]) { 2434 case 1: 2435 /* Create matrix data structure */ 2436 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 2437 ibdiag[cnt] = 1.0/ibdiag[cnt]; 2438 break; 2439 case 2: 2440 ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr); 2441 break; 2442 case 3: 2443 ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr); 2444 break; 2445 case 4: 2446 ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr); 2447 break; 2448 case 5: 2449 ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr); 2450 break; 2451 default: 2452 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2453 } 2454 cnt += sizes[i]*sizes[i]; 2455 row += sizes[i]; 2456 } 2457 a->inode.ibdiagvalid = PETSC_TRUE; 2458 } 2459 ibdiag = a->inode.ibdiag; 2460 bdiag = a->inode.bdiag; 2461 2462 2463 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2464 if (flag & SOR_ZERO_INITIAL_GUESS) { 2465 PetscScalar *b; 2466 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2467 if (xx != bb) { 2468 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2469 } else { 2470 b = x; 2471 } 2472 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 2473 2474 for (i=0, row=0; i<m; i++) { 2475 sz = diag[row] - ii[row]; 2476 v1 = a->a + ii[row]; 2477 idx = a->j + ii[row]; 2478 2479 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2480 switch (sizes[i]){ 2481 case 1: 2482 2483 sum1 = b[row]; 2484 for(n = 0; n<sz-1; n+=2) { 2485 i1 = idx[0]; 2486 i2 = idx[1]; 2487 idx += 2; 2488 tmp0 = x[i1]; 2489 tmp1 = x[i2]; 2490 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2491 } 2492 2493 if (n == sz-1){ 2494 tmp0 = x[*idx]; 2495 sum1 -= *v1 * tmp0; 2496 } 2497 x[row++] = sum1*(*ibdiag++); 2498 break; 2499 case 2: 2500 v2 = a->a + ii[row+1]; 2501 sum1 = b[row]; 2502 sum2 = b[row+1]; 2503 for(n = 0; n<sz-1; n+=2) { 2504 i1 = idx[0]; 2505 i2 = idx[1]; 2506 idx += 2; 2507 tmp0 = x[i1]; 2508 tmp1 = x[i2]; 2509 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2510 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2511 } 2512 2513 if (n == sz-1){ 2514 tmp0 = x[*idx]; 2515 sum1 -= v1[0] * tmp0; 2516 sum2 -= v2[0] * tmp0; 2517 } 2518 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2519 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2520 ibdiag += 4; 2521 break; 2522 case 3: 2523 v2 = a->a + ii[row+1]; 2524 v3 = a->a + ii[row+2]; 2525 sum1 = b[row]; 2526 sum2 = b[row+1]; 2527 sum3 = b[row+2]; 2528 for(n = 0; n<sz-1; n+=2) { 2529 i1 = idx[0]; 2530 i2 = idx[1]; 2531 idx += 2; 2532 tmp0 = x[i1]; 2533 tmp1 = x[i2]; 2534 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2535 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2536 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2537 } 2538 2539 if (n == sz-1){ 2540 tmp0 = x[*idx]; 2541 sum1 -= v1[0] * tmp0; 2542 sum2 -= v2[0] * tmp0; 2543 sum3 -= v3[0] * tmp0; 2544 } 2545 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2546 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2547 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2548 ibdiag += 9; 2549 break; 2550 case 4: 2551 v2 = a->a + ii[row+1]; 2552 v3 = a->a + ii[row+2]; 2553 v4 = a->a + ii[row+3]; 2554 sum1 = b[row]; 2555 sum2 = b[row+1]; 2556 sum3 = b[row+2]; 2557 sum4 = b[row+3]; 2558 for(n = 0; n<sz-1; n+=2) { 2559 i1 = idx[0]; 2560 i2 = idx[1]; 2561 idx += 2; 2562 tmp0 = x[i1]; 2563 tmp1 = x[i2]; 2564 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2565 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2566 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2567 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2568 } 2569 2570 if (n == sz-1){ 2571 tmp0 = x[*idx]; 2572 sum1 -= v1[0] * tmp0; 2573 sum2 -= v2[0] * tmp0; 2574 sum3 -= v3[0] * tmp0; 2575 sum4 -= v4[0] * tmp0; 2576 } 2577 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2578 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2579 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2580 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2581 ibdiag += 16; 2582 break; 2583 case 5: 2584 v2 = a->a + ii[row+1]; 2585 v3 = a->a + ii[row+2]; 2586 v4 = a->a + ii[row+3]; 2587 v5 = a->a + ii[row+4]; 2588 sum1 = b[row]; 2589 sum2 = b[row+1]; 2590 sum3 = b[row+2]; 2591 sum4 = b[row+3]; 2592 sum5 = b[row+4]; 2593 for(n = 0; n<sz-1; n+=2) { 2594 i1 = idx[0]; 2595 i2 = idx[1]; 2596 idx += 2; 2597 tmp0 = x[i1]; 2598 tmp1 = x[i2]; 2599 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2600 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2601 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2602 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2603 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2604 } 2605 2606 if (n == sz-1){ 2607 tmp0 = x[*idx]; 2608 sum1 -= v1[0] * tmp0; 2609 sum2 -= v2[0] * tmp0; 2610 sum3 -= v3[0] * tmp0; 2611 sum4 -= v4[0] * tmp0; 2612 sum5 -= v5[0] * tmp0; 2613 } 2614 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2615 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2616 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2617 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2618 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2619 ibdiag += 25; 2620 break; 2621 default: 2622 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2623 } 2624 } 2625 2626 xb = x; 2627 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2628 } else xb = b; 2629 if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) && 2630 (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) { 2631 cnt = 0; 2632 for (i=0, row=0; i<m; i++) { 2633 2634 switch (sizes[i]){ 2635 case 1: 2636 x[row++] *= bdiag[cnt++]; 2637 break; 2638 case 2: 2639 x1 = x[row]; x2 = x[row+1]; 2640 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 2641 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 2642 x[row++] = tmp1; 2643 x[row++] = tmp2; 2644 cnt += 4; 2645 break; 2646 case 3: 2647 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 2648 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 2649 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 2650 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 2651 x[row++] = tmp1; 2652 x[row++] = tmp2; 2653 x[row++] = tmp3; 2654 cnt += 9; 2655 break; 2656 case 4: 2657 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 2658 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 2659 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 2660 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 2661 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 2662 x[row++] = tmp1; 2663 x[row++] = tmp2; 2664 x[row++] = tmp3; 2665 x[row++] = tmp4; 2666 cnt += 16; 2667 break; 2668 case 5: 2669 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 2670 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 2671 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 2672 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 2673 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 2674 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 2675 x[row++] = tmp1; 2676 x[row++] = tmp2; 2677 x[row++] = tmp3; 2678 x[row++] = tmp4; 2679 x[row++] = tmp5; 2680 cnt += 25; 2681 break; 2682 default: 2683 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2684 } 2685 } 2686 ierr = PetscLogFlops(m);CHKERRQ(ierr); 2687 } 2688 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 2689 2690 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 2691 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 2692 ibdiag -= sizes[i]*sizes[i]; 2693 sz = ii[row+1] - diag[row] - 1; 2694 v1 = a->a + diag[row] + 1; 2695 idx = a->j + diag[row] + 1; 2696 2697 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2698 switch (sizes[i]){ 2699 case 1: 2700 2701 sum1 = xb[row]; 2702 for(n = 0; n<sz-1; n+=2) { 2703 i1 = idx[0]; 2704 i2 = idx[1]; 2705 idx += 2; 2706 tmp0 = x[i1]; 2707 tmp1 = x[i2]; 2708 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2709 } 2710 2711 if (n == sz-1){ 2712 tmp0 = x[*idx]; 2713 sum1 -= *v1*tmp0; 2714 } 2715 x[row--] = sum1*(*ibdiag); 2716 break; 2717 2718 case 2: 2719 2720 sum1 = xb[row]; 2721 sum2 = xb[row-1]; 2722 /* note that sum1 is associated with the second of the two rows */ 2723 v2 = a->a + diag[row-1] + 2; 2724 for(n = 0; n<sz-1; n+=2) { 2725 i1 = idx[0]; 2726 i2 = idx[1]; 2727 idx += 2; 2728 tmp0 = x[i1]; 2729 tmp1 = x[i2]; 2730 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2731 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2732 } 2733 2734 if (n == sz-1){ 2735 tmp0 = x[*idx]; 2736 sum1 -= *v1*tmp0; 2737 sum2 -= *v2*tmp0; 2738 } 2739 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 2740 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 2741 break; 2742 case 3: 2743 2744 sum1 = xb[row]; 2745 sum2 = xb[row-1]; 2746 sum3 = xb[row-2]; 2747 v2 = a->a + diag[row-1] + 2; 2748 v3 = a->a + diag[row-2] + 3; 2749 for(n = 0; n<sz-1; n+=2) { 2750 i1 = idx[0]; 2751 i2 = idx[1]; 2752 idx += 2; 2753 tmp0 = x[i1]; 2754 tmp1 = x[i2]; 2755 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2756 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2757 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2758 } 2759 2760 if (n == sz-1){ 2761 tmp0 = x[*idx]; 2762 sum1 -= *v1*tmp0; 2763 sum2 -= *v2*tmp0; 2764 sum3 -= *v3*tmp0; 2765 } 2766 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 2767 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 2768 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 2769 break; 2770 case 4: 2771 2772 sum1 = xb[row]; 2773 sum2 = xb[row-1]; 2774 sum3 = xb[row-2]; 2775 sum4 = xb[row-3]; 2776 v2 = a->a + diag[row-1] + 2; 2777 v3 = a->a + diag[row-2] + 3; 2778 v4 = a->a + diag[row-3] + 4; 2779 for(n = 0; n<sz-1; n+=2) { 2780 i1 = idx[0]; 2781 i2 = idx[1]; 2782 idx += 2; 2783 tmp0 = x[i1]; 2784 tmp1 = x[i2]; 2785 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2786 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2787 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2788 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2789 } 2790 2791 if (n == sz-1){ 2792 tmp0 = x[*idx]; 2793 sum1 -= *v1*tmp0; 2794 sum2 -= *v2*tmp0; 2795 sum3 -= *v3*tmp0; 2796 sum4 -= *v4*tmp0; 2797 } 2798 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 2799 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 2800 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 2801 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 2802 break; 2803 case 5: 2804 2805 sum1 = xb[row]; 2806 sum2 = xb[row-1]; 2807 sum3 = xb[row-2]; 2808 sum4 = xb[row-3]; 2809 sum5 = xb[row-4]; 2810 v2 = a->a + diag[row-1] + 2; 2811 v3 = a->a + diag[row-2] + 3; 2812 v4 = a->a + diag[row-3] + 4; 2813 v5 = a->a + diag[row-4] + 5; 2814 for(n = 0; n<sz-1; n+=2) { 2815 i1 = idx[0]; 2816 i2 = idx[1]; 2817 idx += 2; 2818 tmp0 = x[i1]; 2819 tmp1 = x[i2]; 2820 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2821 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2822 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2823 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2824 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2825 } 2826 2827 if (n == sz-1){ 2828 tmp0 = x[*idx]; 2829 sum1 -= *v1*tmp0; 2830 sum2 -= *v2*tmp0; 2831 sum3 -= *v3*tmp0; 2832 sum4 -= *v4*tmp0; 2833 sum5 -= *v5*tmp0; 2834 } 2835 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 2836 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 2837 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 2838 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 2839 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 2840 break; 2841 default: 2842 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2843 } 2844 } 2845 2846 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2847 } 2848 its--; 2849 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2850 if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);} 2851 } 2852 if (flag & SOR_EISENSTAT) { 2853 const PetscScalar *b; 2854 MatScalar *t = a->inode.ssor_work; 2855 2856 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2857 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2858 /* 2859 Apply (U + D)^-1 where D is now the block diagonal 2860 */ 2861 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 2862 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 2863 ibdiag -= sizes[i]*sizes[i]; 2864 sz = ii[row+1] - diag[row] - 1; 2865 v1 = a->a + diag[row] + 1; 2866 idx = a->j + diag[row] + 1; 2867 CHKMEMQ; 2868 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2869 switch (sizes[i]){ 2870 case 1: 2871 2872 sum1 = b[row]; 2873 for(n = 0; n<sz-1; n+=2) { 2874 i1 = idx[0]; 2875 i2 = idx[1]; 2876 idx += 2; 2877 tmp0 = x[i1]; 2878 tmp1 = x[i2]; 2879 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2880 } 2881 2882 if (n == sz-1){ 2883 tmp0 = x[*idx]; 2884 sum1 -= *v1*tmp0; 2885 } 2886 x[row] = sum1*(*ibdiag);row--; 2887 break; 2888 2889 case 2: 2890 2891 sum1 = b[row]; 2892 sum2 = b[row-1]; 2893 /* note that sum1 is associated with the second of the two rows */ 2894 v2 = a->a + diag[row-1] + 2; 2895 for(n = 0; n<sz-1; n+=2) { 2896 i1 = idx[0]; 2897 i2 = idx[1]; 2898 idx += 2; 2899 tmp0 = x[i1]; 2900 tmp1 = x[i2]; 2901 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2902 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2903 } 2904 2905 if (n == sz-1){ 2906 tmp0 = x[*idx]; 2907 sum1 -= *v1*tmp0; 2908 sum2 -= *v2*tmp0; 2909 } 2910 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 2911 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 2912 row -= 2; 2913 break; 2914 case 3: 2915 2916 sum1 = b[row]; 2917 sum2 = b[row-1]; 2918 sum3 = b[row-2]; 2919 v2 = a->a + diag[row-1] + 2; 2920 v3 = a->a + diag[row-2] + 3; 2921 for(n = 0; n<sz-1; n+=2) { 2922 i1 = idx[0]; 2923 i2 = idx[1]; 2924 idx += 2; 2925 tmp0 = x[i1]; 2926 tmp1 = x[i2]; 2927 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2928 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2929 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2930 } 2931 2932 if (n == sz-1){ 2933 tmp0 = x[*idx]; 2934 sum1 -= *v1*tmp0; 2935 sum2 -= *v2*tmp0; 2936 sum3 -= *v3*tmp0; 2937 } 2938 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 2939 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 2940 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 2941 row -= 3; 2942 break; 2943 case 4: 2944 2945 sum1 = b[row]; 2946 sum2 = b[row-1]; 2947 sum3 = b[row-2]; 2948 sum4 = b[row-3]; 2949 v2 = a->a + diag[row-1] + 2; 2950 v3 = a->a + diag[row-2] + 3; 2951 v4 = a->a + diag[row-3] + 4; 2952 for(n = 0; n<sz-1; n+=2) { 2953 i1 = idx[0]; 2954 i2 = idx[1]; 2955 idx += 2; 2956 tmp0 = x[i1]; 2957 tmp1 = x[i2]; 2958 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2959 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2960 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2961 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2962 } 2963 2964 if (n == sz-1){ 2965 tmp0 = x[*idx]; 2966 sum1 -= *v1*tmp0; 2967 sum2 -= *v2*tmp0; 2968 sum3 -= *v3*tmp0; 2969 sum4 -= *v4*tmp0; 2970 } 2971 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 2972 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 2973 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 2974 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 2975 row -= 4; 2976 break; 2977 case 5: 2978 2979 sum1 = b[row]; 2980 sum2 = b[row-1]; 2981 sum3 = b[row-2]; 2982 sum4 = b[row-3]; 2983 sum5 = b[row-4]; 2984 v2 = a->a + diag[row-1] + 2; 2985 v3 = a->a + diag[row-2] + 3; 2986 v4 = a->a + diag[row-3] + 4; 2987 v5 = a->a + diag[row-4] + 5; 2988 for(n = 0; n<sz-1; n+=2) { 2989 i1 = idx[0]; 2990 i2 = idx[1]; 2991 idx += 2; 2992 tmp0 = x[i1]; 2993 tmp1 = x[i2]; 2994 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2995 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2996 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2997 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2998 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2999 } 3000 3001 if (n == sz-1){ 3002 tmp0 = x[*idx]; 3003 sum1 -= *v1*tmp0; 3004 sum2 -= *v2*tmp0; 3005 sum3 -= *v3*tmp0; 3006 sum4 -= *v4*tmp0; 3007 sum5 -= *v5*tmp0; 3008 } 3009 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3010 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3011 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3012 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3013 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3014 row -= 5; 3015 break; 3016 default: 3017 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3018 } 3019 CHKMEMQ; 3020 } 3021 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3022 3023 /* 3024 t = b - D x where D is the block diagonal 3025 */ 3026 cnt = 0; 3027 for (i=0, row=0; i<m; i++) { 3028 CHKMEMQ; 3029 switch (sizes[i]){ 3030 case 1: 3031 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 3032 break; 3033 case 2: 3034 x1 = x[row]; x2 = x[row+1]; 3035 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3036 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3037 t[row] = b[row] - tmp1; 3038 t[row+1] = b[row+1] - tmp2; row += 2; 3039 cnt += 4; 3040 break; 3041 case 3: 3042 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3043 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3044 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3045 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3046 t[row] = b[row] - tmp1; 3047 t[row+1] = b[row+1] - tmp2; 3048 t[row+2] = b[row+2] - tmp3; row += 3; 3049 cnt += 9; 3050 break; 3051 case 4: 3052 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3053 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3054 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3055 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3056 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3057 t[row] = b[row] - tmp1; 3058 t[row+1] = b[row+1] - tmp2; 3059 t[row+2] = b[row+2] - tmp3; 3060 t[row+3] = b[row+3] - tmp4; row += 4; 3061 cnt += 16; 3062 break; 3063 case 5: 3064 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3065 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3066 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3067 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3068 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3069 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3070 t[row] = b[row] - tmp1; 3071 t[row+1] = b[row+1] - tmp2; 3072 t[row+2] = b[row+2] - tmp3; 3073 t[row+3] = b[row+3] - tmp4; 3074 t[row+4] = b[row+4] - tmp5;row += 5; 3075 cnt += 25; 3076 break; 3077 default: 3078 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3079 } 3080 CHKMEMQ; 3081 } 3082 ierr = PetscLogFlops(m);CHKERRQ(ierr); 3083 3084 3085 3086 /* 3087 Apply (L + D)^-1 where D is the block diagonal 3088 */ 3089 for (i=0, row=0; i<m; i++) { 3090 sz = diag[row] - ii[row]; 3091 v1 = a->a + ii[row]; 3092 idx = a->j + ii[row]; 3093 CHKMEMQ; 3094 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3095 switch (sizes[i]){ 3096 case 1: 3097 3098 sum1 = t[row]; 3099 for(n = 0; n<sz-1; n+=2) { 3100 i1 = idx[0]; 3101 i2 = idx[1]; 3102 idx += 2; 3103 tmp0 = t[i1]; 3104 tmp1 = t[i2]; 3105 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3106 } 3107 3108 if (n == sz-1){ 3109 tmp0 = t[*idx]; 3110 sum1 -= *v1 * tmp0; 3111 } 3112 x[row] += t[row] = sum1*(*ibdiag++); row++; 3113 break; 3114 case 2: 3115 v2 = a->a + ii[row+1]; 3116 sum1 = t[row]; 3117 sum2 = t[row+1]; 3118 for(n = 0; n<sz-1; n+=2) { 3119 i1 = idx[0]; 3120 i2 = idx[1]; 3121 idx += 2; 3122 tmp0 = t[i1]; 3123 tmp1 = t[i2]; 3124 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3125 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3126 } 3127 3128 if (n == sz-1){ 3129 tmp0 = t[*idx]; 3130 sum1 -= v1[0] * tmp0; 3131 sum2 -= v2[0] * tmp0; 3132 } 3133 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3134 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3135 ibdiag += 4; row += 2; 3136 break; 3137 case 3: 3138 v2 = a->a + ii[row+1]; 3139 v3 = a->a + ii[row+2]; 3140 sum1 = t[row]; 3141 sum2 = t[row+1]; 3142 sum3 = t[row+2]; 3143 for(n = 0; n<sz-1; n+=2) { 3144 i1 = idx[0]; 3145 i2 = idx[1]; 3146 idx += 2; 3147 tmp0 = t[i1]; 3148 tmp1 = t[i2]; 3149 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3150 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3151 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3152 } 3153 3154 if (n == sz-1){ 3155 tmp0 = t[*idx]; 3156 sum1 -= v1[0] * tmp0; 3157 sum2 -= v2[0] * tmp0; 3158 sum3 -= v3[0] * tmp0; 3159 } 3160 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3161 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3162 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3163 ibdiag += 9; row += 3; 3164 break; 3165 case 4: 3166 v2 = a->a + ii[row+1]; 3167 v3 = a->a + ii[row+2]; 3168 v4 = a->a + ii[row+3]; 3169 sum1 = t[row]; 3170 sum2 = t[row+1]; 3171 sum3 = t[row+2]; 3172 sum4 = t[row+3]; 3173 for(n = 0; n<sz-1; n+=2) { 3174 i1 = idx[0]; 3175 i2 = idx[1]; 3176 idx += 2; 3177 tmp0 = t[i1]; 3178 tmp1 = t[i2]; 3179 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3180 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3181 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3182 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3183 } 3184 3185 if (n == sz-1){ 3186 tmp0 = t[*idx]; 3187 sum1 -= v1[0] * tmp0; 3188 sum2 -= v2[0] * tmp0; 3189 sum3 -= v3[0] * tmp0; 3190 sum4 -= v4[0] * tmp0; 3191 } 3192 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3193 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3194 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3195 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3196 ibdiag += 16; row += 4; 3197 break; 3198 case 5: 3199 v2 = a->a + ii[row+1]; 3200 v3 = a->a + ii[row+2]; 3201 v4 = a->a + ii[row+3]; 3202 v5 = a->a + ii[row+4]; 3203 sum1 = t[row]; 3204 sum2 = t[row+1]; 3205 sum3 = t[row+2]; 3206 sum4 = t[row+3]; 3207 sum5 = t[row+4]; 3208 for(n = 0; n<sz-1; n+=2) { 3209 i1 = idx[0]; 3210 i2 = idx[1]; 3211 idx += 2; 3212 tmp0 = t[i1]; 3213 tmp1 = t[i2]; 3214 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3215 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3216 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3217 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3218 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3219 } 3220 3221 if (n == sz-1){ 3222 tmp0 = t[*idx]; 3223 sum1 -= v1[0] * tmp0; 3224 sum2 -= v2[0] * tmp0; 3225 sum3 -= v3[0] * tmp0; 3226 sum4 -= v4[0] * tmp0; 3227 sum5 -= v5[0] * tmp0; 3228 } 3229 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3230 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3231 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3232 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3233 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3234 ibdiag += 25; row += 5; 3235 break; 3236 default: 3237 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3238 } 3239 CHKMEMQ; 3240 } 3241 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3242 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3243 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3244 } 3245 PetscFunctionReturn(0); 3246 } 3247 3248 #undef __FUNCT__ 3249 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode" 3250 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 3251 { 3252 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3253 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 3254 const MatScalar *bdiag = a->inode.bdiag; 3255 const PetscScalar *b; 3256 PetscErrorCode ierr; 3257 PetscInt m = a->inode.node_count,cnt = 0,i,row; 3258 const PetscInt *sizes = a->inode.size; 3259 3260 PetscFunctionBegin; 3261 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3262 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3263 cnt = 0; 3264 for (i=0, row=0; i<m; i++) { 3265 switch (sizes[i]){ 3266 case 1: 3267 x[row] = b[row]*bdiag[cnt++];row++; 3268 break; 3269 case 2: 3270 x1 = b[row]; x2 = b[row+1]; 3271 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3272 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3273 x[row++] = tmp1; 3274 x[row++] = tmp2; 3275 cnt += 4; 3276 break; 3277 case 3: 3278 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 3279 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3280 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3281 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3282 x[row++] = tmp1; 3283 x[row++] = tmp2; 3284 x[row++] = tmp3; 3285 cnt += 9; 3286 break; 3287 case 4: 3288 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 3289 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3290 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3291 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3292 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3293 x[row++] = tmp1; 3294 x[row++] = tmp2; 3295 x[row++] = tmp3; 3296 x[row++] = tmp4; 3297 cnt += 16; 3298 break; 3299 case 5: 3300 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 3301 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3302 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3303 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3304 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3305 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3306 x[row++] = tmp1; 3307 x[row++] = tmp2; 3308 x[row++] = tmp3; 3309 x[row++] = tmp4; 3310 x[row++] = tmp5; 3311 cnt += 25; 3312 break; 3313 default: 3314 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3315 } 3316 } 3317 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 3318 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3319 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3320 PetscFunctionReturn(0); 3321 } 3322 3323 /* 3324 samestructure indicates that the matrix has not changed its nonzero structure so we 3325 do not need to recompute the inodes 3326 */ 3327 #undef __FUNCT__ 3328 #define __FUNCT__ "Mat_CheckInode" 3329 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure) 3330 { 3331 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3332 PetscErrorCode ierr; 3333 PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size; 3334 PetscTruth flag; 3335 3336 PetscFunctionBegin; 3337 if (!a->inode.use) PetscFunctionReturn(0); 3338 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 3339 3340 3341 m = A->rmap->n; 3342 if (a->inode.size) {ns = a->inode.size;} 3343 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 3344 3345 i = 0; 3346 node_count = 0; 3347 idx = a->j; 3348 ii = a->i; 3349 while (i < m){ /* For each row */ 3350 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 3351 /* Limits the number of elements in a node to 'a->inode.limit' */ 3352 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 3353 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 3354 if (nzy != nzx) break; 3355 idy += nzx; /* Same nonzero pattern */ 3356 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 3357 if (!flag) break; 3358 } 3359 ns[node_count++] = blk_size; 3360 idx += blk_size*nzx; 3361 i = j; 3362 } 3363 /* If not enough inodes found,, do not use inode version of the routines */ 3364 if (!a->inode.size && m && node_count > .9*m) { 3365 ierr = PetscFree(ns);CHKERRQ(ierr); 3366 a->inode.node_count = 0; 3367 a->inode.size = PETSC_NULL; 3368 a->inode.use = PETSC_FALSE; 3369 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 3370 } else { 3371 A->ops->mult = MatMult_SeqAIJ_Inode; 3372 A->ops->sor = MatSOR_SeqAIJ_Inode; 3373 A->ops->multadd = MatMultAdd_SeqAIJ_Inode; 3374 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 3375 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 3376 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 3377 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 3378 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 3379 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 3380 A->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJ_Inode_inplace; 3381 a->inode.node_count = node_count; 3382 a->inode.size = ns; 3383 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 3384 } 3385 PetscFunctionReturn(0); 3386 } 3387 3388 #define MatGetRow_FactoredLU(cols,nzl,nzu,nz,ai,aj,adiag,row) { \ 3389 PetscInt __k, *__vi; \ 3390 __vi = aj + ai[row]; \ 3391 for(__k=0;__k<nzl;__k++) cols[__k] = __vi[__k]; \ 3392 __vi = aj + adiag[row]; \ 3393 cols[nzl] = __vi[0];\ 3394 __vi = aj + adiag[row+1]+1;\ 3395 for(__k=0;__k<nzu;__k++) cols[nzl+1+__k] = __vi[__k];} 3396 3397 3398 /* 3399 Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix. 3400 Modified from Mat_CheckInode(). 3401 3402 Input Parameters: 3403 + Mat A - ILU or LU matrix factor 3404 - samestructure - TURE indicates that the matrix has not changed its nonzero structure so we 3405 do not need to recompute the inodes 3406 */ 3407 #undef __FUNCT__ 3408 #define __FUNCT__ "Mat_CheckInode_FactorLU" 3409 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscTruth samestructure) 3410 { 3411 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3412 PetscErrorCode ierr; 3413 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; 3414 PetscInt *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag; 3415 PetscTruth flag; 3416 3417 PetscFunctionBegin; 3418 if (!a->inode.use) PetscFunctionReturn(0); 3419 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 3420 3421 m = A->rmap->n; 3422 if (a->inode.size) {ns = a->inode.size;} 3423 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 3424 3425 i = 0; 3426 node_count = 0; 3427 while (i < m){ /* For each row */ 3428 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ 3429 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ 3430 nzx = nzl1 + nzu1 + 1; 3431 ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr); 3432 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); 3433 3434 /* Limits the number of elements in a node to 'a->inode.limit' */ 3435 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 3436 nzl2 = ai[j+1] - ai[j]; 3437 nzu2 = adiag[j] - adiag[j+1] - 1; 3438 nzy = nzl2 + nzu2 + 1; 3439 if( nzy != nzx) break; 3440 ierr = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr); 3441 MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j); 3442 ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 3443 if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;} 3444 ierr = PetscFree(cols2);CHKERRQ(ierr); 3445 } 3446 ns[node_count++] = blk_size; 3447 ierr = PetscFree(cols1);CHKERRQ(ierr); 3448 i = j; 3449 } 3450 /* If not enough inodes found,, do not use inode version of the routines */ 3451 if (!a->inode.size && m && node_count > .9*m) { 3452 ierr = PetscFree(ns);CHKERRQ(ierr); 3453 a->inode.node_count = 0; 3454 a->inode.size = PETSC_NULL; 3455 a->inode.use = PETSC_FALSE; 3456 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 3457 } else { 3458 A->ops->mult = 0; 3459 A->ops->sor = 0; 3460 A->ops->multadd = 0; 3461 A->ops->getrowij = 0; 3462 A->ops->restorerowij = 0; 3463 A->ops->getcolumnij = 0; 3464 A->ops->restorecolumnij = 0; 3465 A->ops->coloringpatch = 0; 3466 A->ops->multdiagonalblock = 0; 3467 /* A->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJ_Inode; not done yet */ 3468 a->inode.node_count = node_count; 3469 a->inode.size = ns; 3470 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 3471 } 3472 PetscFunctionReturn(0); 3473 } 3474 3475 /* 3476 This is really ugly. if inodes are used this replaces the 3477 permutations with ones that correspond to rows/cols of the matrix 3478 rather then inode blocks 3479 */ 3480 #undef __FUNCT__ 3481 #define __FUNCT__ "MatInodeAdjustForInodes" 3482 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 3483 { 3484 PetscErrorCode ierr,(*f)(Mat,IS*,IS*); 3485 3486 PetscFunctionBegin; 3487 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr); 3488 if (f) { 3489 ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr); 3490 } 3491 PetscFunctionReturn(0); 3492 } 3493 3494 EXTERN_C_BEGIN 3495 #undef __FUNCT__ 3496 #define __FUNCT__ "MatAdjustForInodes_SeqAIJ_Inode" 3497 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 3498 { 3499 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 3500 PetscErrorCode ierr; 3501 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 3502 const PetscInt *ridx,*cidx; 3503 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 3504 PetscInt nslim_col,*ns_col; 3505 IS ris = *rperm,cis = *cperm; 3506 3507 PetscFunctionBegin; 3508 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 3509 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 3510 3511 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 3512 ierr = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 3513 ierr = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr); 3514 3515 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 3516 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 3517 3518 /* Form the inode structure for the rows of permuted matric using inv perm*/ 3519 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 3520 3521 /* Construct the permutations for rows*/ 3522 for (i=0,row = 0; i<nslim_row; ++i){ 3523 indx = ridx[i]; 3524 start_val = tns[indx]; 3525 end_val = tns[indx + 1]; 3526 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 3527 } 3528 3529 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 3530 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 3531 3532 /* Construct permutations for columns */ 3533 for (i=0,col=0; i<nslim_col; ++i){ 3534 indx = cidx[i]; 3535 start_val = tns[indx]; 3536 end_val = tns[indx + 1]; 3537 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 3538 } 3539 3540 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr); 3541 ierr = ISSetPermutation(*rperm);CHKERRQ(ierr); 3542 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr); 3543 ierr = ISSetPermutation(*cperm);CHKERRQ(ierr); 3544 3545 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 3546 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 3547 3548 ierr = PetscFree(ns_col);CHKERRQ(ierr); 3549 ierr = PetscFree2(permr,permc);CHKERRQ(ierr); 3550 ierr = ISDestroy(cis);CHKERRQ(ierr); 3551 ierr = ISDestroy(ris);CHKERRQ(ierr); 3552 ierr = PetscFree(tns);CHKERRQ(ierr); 3553 PetscFunctionReturn(0); 3554 } 3555 EXTERN_C_END 3556 3557 #undef __FUNCT__ 3558 #define __FUNCT__ "MatInodeGetInodeSizes" 3559 /*@C 3560 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 3561 3562 Collective on Mat 3563 3564 Input Parameter: 3565 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 3566 3567 Output Parameter: 3568 + node_count - no of inodes present in the matrix. 3569 . sizes - an array of size node_count,with sizes of each inode. 3570 - limit - the max size used to generate the inodes. 3571 3572 Level: advanced 3573 3574 Notes: This routine returns some internal storage information 3575 of the matrix, it is intended to be used by advanced users. 3576 It should be called after the matrix is assembled. 3577 The contents of the sizes[] array should not be changed. 3578 PETSC_NULL may be passed for information not requested. 3579 3580 .keywords: matrix, seqaij, get, inode 3581 3582 .seealso: MatGetInfo() 3583 @*/ 3584 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 3585 { 3586 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 3587 3588 PetscFunctionBegin; 3589 if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 3590 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr); 3591 if (f) { 3592 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 3593 } 3594 PetscFunctionReturn(0); 3595 } 3596 3597 EXTERN_C_BEGIN 3598 #undef __FUNCT__ 3599 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode" 3600 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 3601 { 3602 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3603 3604 PetscFunctionBegin; 3605 if (node_count) *node_count = a->inode.node_count; 3606 if (sizes) *sizes = a->inode.size; 3607 if (limit) *limit = a->inode.limit; 3608 PetscFunctionReturn(0); 3609 } 3610 EXTERN_C_END 3611