1 #define PETSCMAT_DLL 2 3 /* 4 This file provides high performance routines for the Inode format (compressed sparse row) 5 by taking advantage of rows with identical nonzero structure (I-nodes). 6 */ 7 #include "../src/mat/impls/aij/seq/aij.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "Mat_CreateColInode" 11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns) 12 { 13 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 16 17 PetscFunctionBegin; 18 n = A->cmap->n; 19 m = A->rmap->n; 20 ns_row = a->inode.size; 21 22 min_mn = (m < n) ? m : n; 23 if (!ns) { 24 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++); 25 for(; count+1 < n; count++,i++); 26 if (count < n) { 27 i++; 28 } 29 *size = i; 30 PetscFunctionReturn(0); 31 } 32 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr); 33 34 /* Use the same row structure wherever feasible. */ 35 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 36 ns_col[i] = ns_row[i]; 37 } 38 39 /* if m < n; pad up the remainder with inode_limit */ 40 for(; count+1 < n; count++,i++) { 41 ns_col[i] = 1; 42 } 43 /* The last node is the odd ball. padd it up with the remaining rows; */ 44 if (count < n) { 45 ns_col[i] = n - count; 46 i++; 47 } else if (count > n) { 48 /* Adjust for the over estimation */ 49 ns_col[i-1] += n - count; 50 } 51 *size = i; 52 *ns = ns_col; 53 PetscFunctionReturn(0); 54 } 55 56 57 /* 58 This builds symmetric version of nonzero structure, 59 */ 60 #undef __FUNCT__ 61 #define __FUNCT__ "MatGetRowIJ_Inode_Symmetric" 62 static PetscErrorCode MatGetRowIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 63 { 64 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 65 PetscErrorCode ierr; 66 PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n; 67 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j; 68 69 PetscFunctionBegin; 70 nslim_row = a->inode.node_count; 71 m = A->rmap->n; 72 n = A->cmap->n; 73 if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_Inode_Symmetric: Matrix should be square"); 74 75 /* Use the row_inode as column_inode */ 76 nslim_col = nslim_row; 77 ns_col = ns_row; 78 79 /* allocate space for reformated inode structure */ 80 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 81 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 82 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 83 84 for (i1=0,col=0; i1<nslim_col; ++i1){ 85 nsz = ns_col[i1]; 86 for (i2=0; i2<nsz; ++i2,++col) 87 tvc[col] = i1; 88 } 89 /* allocate space for row pointers */ 90 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 91 *iia = ia; 92 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 93 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 94 95 /* determine the number of columns in each row */ 96 ia[0] = oshift; 97 for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) { 98 99 j = aj + ai[row] + ishift; 100 jmax = aj + ai[row+1] + ishift; 101 i2 = 0; 102 col = *j++ + ishift; 103 i2 = tvc[col]; 104 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 105 ia[i1+1]++; 106 ia[i2+1]++; 107 i2++; /* Start col of next node */ 108 while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j; 109 i2 = tvc[col]; 110 } 111 if(i2 == i1) ia[i2+1]++; /* now the diagonal element */ 112 } 113 114 /* shift ia[i] to point to next row */ 115 for (i1=1; i1<nslim_row+1; i1++) { 116 row = ia[i1-1]; 117 ia[i1] += row; 118 work[i1-1] = row - oshift; 119 } 120 121 /* allocate space for column pointers */ 122 nz = ia[nslim_row] + (!ishift); 123 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 124 *jja = ja; 125 126 /* loop over lower triangular part putting into ja */ 127 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 128 j = aj + ai[row] + ishift; 129 jmax = aj + ai[row+1] + ishift; 130 i2 = 0; /* Col inode index */ 131 col = *j++ + ishift; 132 i2 = tvc[col]; 133 while (i2<i1 && j<jmax) { 134 ja[work[i2]++] = i1 + oshift; 135 ja[work[i1]++] = i2 + oshift; 136 ++i2; 137 while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */ 138 i2 = tvc[col]; 139 } 140 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 141 142 } 143 ierr = PetscFree(work);CHKERRQ(ierr); 144 ierr = PetscFree(tns);CHKERRQ(ierr); 145 ierr = PetscFree(tvc);CHKERRQ(ierr); 146 PetscFunctionReturn(0); 147 } 148 149 /* 150 This builds nonsymmetric version of nonzero structure, 151 */ 152 #undef __FUNCT__ 153 #define __FUNCT__ "MatGetRowIJ_Inode_Nonsymmetric" 154 static PetscErrorCode MatGetRowIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 155 { 156 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 157 PetscErrorCode ierr; 158 PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col; 159 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 160 161 PetscFunctionBegin; 162 nslim_row = a->inode.node_count; 163 n = A->cmap->n; 164 165 /* Create The column_inode for this matrix */ 166 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 167 168 /* allocate space for reformated column_inode structure */ 169 ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 170 ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 171 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 172 173 for (i1=0,col=0; i1<nslim_col; ++i1){ 174 nsz = ns_col[i1]; 175 for (i2=0; i2<nsz; ++i2,++col) 176 tvc[col] = i1; 177 } 178 /* allocate space for row pointers */ 179 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 180 *iia = ia; 181 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 182 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 183 184 /* determine the number of columns in each row */ 185 ia[0] = oshift; 186 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 187 j = aj + ai[row] + ishift; 188 col = *j++ + ishift; 189 i2 = tvc[col]; 190 nz = ai[row+1] - ai[row]; 191 while (nz-- > 0) { /* off-diagonal elemets */ 192 ia[i1+1]++; 193 i2++; /* Start col of next node */ 194 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 195 if (nz > 0) i2 = tvc[col]; 196 } 197 } 198 199 /* shift ia[i] to point to next row */ 200 for (i1=1; i1<nslim_row+1; i1++) { 201 row = ia[i1-1]; 202 ia[i1] += row; 203 work[i1-1] = row - oshift; 204 } 205 206 /* allocate space for column pointers */ 207 nz = ia[nslim_row] + (!ishift); 208 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 209 *jja = ja; 210 211 /* loop over matrix putting into ja */ 212 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 213 j = aj + ai[row] + ishift; 214 i2 = 0; /* Col inode index */ 215 col = *j++ + ishift; 216 i2 = tvc[col]; 217 nz = ai[row+1] - ai[row]; 218 while (nz-- > 0) { 219 ja[work[i1]++] = i2 + oshift; 220 ++i2; 221 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 222 if (nz > 0) i2 = tvc[col]; 223 } 224 } 225 ierr = PetscFree(ns_col);CHKERRQ(ierr); 226 ierr = PetscFree(work);CHKERRQ(ierr); 227 ierr = PetscFree(tns);CHKERRQ(ierr); 228 ierr = PetscFree(tvc);CHKERRQ(ierr); 229 PetscFunctionReturn(0); 230 } 231 232 #undef __FUNCT__ 233 #define __FUNCT__ "MatGetRowIJ_Inode" 234 static PetscErrorCode MatGetRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 235 { 236 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 237 PetscErrorCode ierr; 238 239 PetscFunctionBegin; 240 *n = a->inode.node_count; 241 if (!ia) PetscFunctionReturn(0); 242 if (!blockcompressed) { 243 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 244 } else if (symmetric) { 245 ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 246 } else { 247 ierr = MatGetRowIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 248 } 249 PetscFunctionReturn(0); 250 } 251 252 #undef __FUNCT__ 253 #define __FUNCT__ "MatRestoreRowIJ_Inode" 254 static PetscErrorCode MatRestoreRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 255 { 256 PetscErrorCode ierr; 257 258 PetscFunctionBegin; 259 if (!ia) PetscFunctionReturn(0); 260 261 if (!blockcompressed) { 262 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 263 } else { 264 ierr = PetscFree(*ia);CHKERRQ(ierr); 265 ierr = PetscFree(*ja);CHKERRQ(ierr); 266 } 267 268 PetscFunctionReturn(0); 269 } 270 271 /* ----------------------------------------------------------- */ 272 273 #undef __FUNCT__ 274 #define __FUNCT__ "MatGetColumnIJ_Inode_Nonsymmetric" 275 static PetscErrorCode MatGetColumnIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 276 { 277 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 278 PetscErrorCode ierr; 279 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 280 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 281 282 PetscFunctionBegin; 283 nslim_row = a->inode.node_count; 284 n = A->cmap->n; 285 286 /* Create The column_inode for this matrix */ 287 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 288 289 /* allocate space for reformated column_inode structure */ 290 ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 291 ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 292 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 293 294 for (i1=0,col=0; i1<nslim_col; ++i1){ 295 nsz = ns_col[i1]; 296 for (i2=0; i2<nsz; ++i2,++col) 297 tvc[col] = i1; 298 } 299 /* allocate space for column pointers */ 300 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 301 *iia = ia; 302 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 303 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 304 305 /* determine the number of columns in each row */ 306 ia[0] = oshift; 307 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 308 j = aj + ai[row] + ishift; 309 col = *j++ + ishift; 310 i2 = tvc[col]; 311 nz = ai[row+1] - ai[row]; 312 while (nz-- > 0) { /* off-diagonal elemets */ 313 /* ia[i1+1]++; */ 314 ia[i2+1]++; 315 i2++; 316 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 317 if (nz > 0) i2 = tvc[col]; 318 } 319 } 320 321 /* shift ia[i] to point to next col */ 322 for (i1=1; i1<nslim_col+1; i1++) { 323 col = ia[i1-1]; 324 ia[i1] += col; 325 work[i1-1] = col - oshift; 326 } 327 328 /* allocate space for column pointers */ 329 nz = ia[nslim_col] + (!ishift); 330 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 331 *jja = ja; 332 333 /* loop over matrix putting into ja */ 334 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 335 j = aj + ai[row] + ishift; 336 i2 = 0; /* Col inode index */ 337 col = *j++ + ishift; 338 i2 = tvc[col]; 339 nz = ai[row+1] - ai[row]; 340 while (nz-- > 0) { 341 /* ja[work[i1]++] = i2 + oshift; */ 342 ja[work[i2]++] = i1 + oshift; 343 i2++; 344 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 345 if (nz > 0) i2 = tvc[col]; 346 } 347 } 348 ierr = PetscFree(ns_col);CHKERRQ(ierr); 349 ierr = PetscFree(work);CHKERRQ(ierr); 350 ierr = PetscFree(tns);CHKERRQ(ierr); 351 ierr = PetscFree(tvc);CHKERRQ(ierr); 352 PetscFunctionReturn(0); 353 } 354 355 #undef __FUNCT__ 356 #define __FUNCT__ "MatGetColumnIJ_Inode" 357 static PetscErrorCode MatGetColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 358 { 359 PetscErrorCode ierr; 360 361 PetscFunctionBegin; 362 ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr); 363 if (!ia) PetscFunctionReturn(0); 364 365 if (!blockcompressed) { 366 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 367 } else if (symmetric) { 368 /* Since the indices are symmetric it does'nt matter */ 369 ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 370 } else { 371 ierr = MatGetColumnIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 372 } 373 PetscFunctionReturn(0); 374 } 375 376 #undef __FUNCT__ 377 #define __FUNCT__ "MatRestoreColumnIJ_Inode" 378 static PetscErrorCode MatRestoreColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 379 { 380 PetscErrorCode ierr; 381 382 PetscFunctionBegin; 383 if (!ia) PetscFunctionReturn(0); 384 if (!blockcompressed) { 385 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 386 } else { 387 ierr = PetscFree(*ia);CHKERRQ(ierr); 388 ierr = PetscFree(*ja);CHKERRQ(ierr); 389 } 390 PetscFunctionReturn(0); 391 } 392 393 /* ----------------------------------------------------------- */ 394 395 #undef __FUNCT__ 396 #define __FUNCT__ "MatMult_Inode" 397 static PetscErrorCode MatMult_Inode(Mat A,Vec xx,Vec yy) 398 { 399 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 400 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 401 PetscScalar *y; 402 const PetscScalar *x; 403 const MatScalar *v1,*v2,*v3,*v4,*v5; 404 PetscErrorCode ierr; 405 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0; 406 407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 409 #endif 410 411 PetscFunctionBegin; 412 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 413 node_max = a->inode.node_count; 414 ns = a->inode.size; /* Node Size array */ 415 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 416 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 417 idx = a->j; 418 v1 = a->a; 419 ii = a->i; 420 421 for (i = 0,row = 0; i< node_max; ++i){ 422 nsz = ns[i]; 423 n = ii[1] - ii[0]; 424 nonzerorow += (n>0)*nsz; 425 ii += nsz; 426 sz = n; /* No of non zeros in this row */ 427 /* Switch on the size of Node */ 428 switch (nsz){ /* Each loop in 'case' is unrolled */ 429 case 1 : 430 sum1 = 0; 431 432 for(n = 0; n< sz-1; n+=2) { 433 i1 = idx[0]; /* The instructions are ordered to */ 434 i2 = idx[1]; /* make the compiler's job easy */ 435 idx += 2; 436 tmp0 = x[i1]; 437 tmp1 = x[i2]; 438 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 439 } 440 441 if (n == sz-1){ /* Take care of the last nonzero */ 442 tmp0 = x[*idx++]; 443 sum1 += *v1++ * tmp0; 444 } 445 y[row++]=sum1; 446 break; 447 case 2: 448 sum1 = 0; 449 sum2 = 0; 450 v2 = v1 + n; 451 452 for (n = 0; n< sz-1; n+=2) { 453 i1 = idx[0]; 454 i2 = idx[1]; 455 idx += 2; 456 tmp0 = x[i1]; 457 tmp1 = x[i2]; 458 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 459 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 460 } 461 if (n == sz-1){ 462 tmp0 = x[*idx++]; 463 sum1 += *v1++ * tmp0; 464 sum2 += *v2++ * tmp0; 465 } 466 y[row++]=sum1; 467 y[row++]=sum2; 468 v1 =v2; /* Since the next block to be processed starts there*/ 469 idx +=sz; 470 break; 471 case 3: 472 sum1 = 0; 473 sum2 = 0; 474 sum3 = 0; 475 v2 = v1 + n; 476 v3 = v2 + n; 477 478 for (n = 0; n< sz-1; n+=2) { 479 i1 = idx[0]; 480 i2 = idx[1]; 481 idx += 2; 482 tmp0 = x[i1]; 483 tmp1 = x[i2]; 484 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 485 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 486 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 487 } 488 if (n == sz-1){ 489 tmp0 = x[*idx++]; 490 sum1 += *v1++ * tmp0; 491 sum2 += *v2++ * tmp0; 492 sum3 += *v3++ * tmp0; 493 } 494 y[row++]=sum1; 495 y[row++]=sum2; 496 y[row++]=sum3; 497 v1 =v3; /* Since the next block to be processed starts there*/ 498 idx +=2*sz; 499 break; 500 case 4: 501 sum1 = 0; 502 sum2 = 0; 503 sum3 = 0; 504 sum4 = 0; 505 v2 = v1 + n; 506 v3 = v2 + n; 507 v4 = v3 + n; 508 509 for (n = 0; n< sz-1; n+=2) { 510 i1 = idx[0]; 511 i2 = idx[1]; 512 idx += 2; 513 tmp0 = x[i1]; 514 tmp1 = x[i2]; 515 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 516 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 517 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 518 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 519 } 520 if (n == sz-1){ 521 tmp0 = x[*idx++]; 522 sum1 += *v1++ * tmp0; 523 sum2 += *v2++ * tmp0; 524 sum3 += *v3++ * tmp0; 525 sum4 += *v4++ * tmp0; 526 } 527 y[row++]=sum1; 528 y[row++]=sum2; 529 y[row++]=sum3; 530 y[row++]=sum4; 531 v1 =v4; /* Since the next block to be processed starts there*/ 532 idx +=3*sz; 533 break; 534 case 5: 535 sum1 = 0; 536 sum2 = 0; 537 sum3 = 0; 538 sum4 = 0; 539 sum5 = 0; 540 v2 = v1 + n; 541 v3 = v2 + n; 542 v4 = v3 + n; 543 v5 = v4 + n; 544 545 for (n = 0; n<sz-1; n+=2) { 546 i1 = idx[0]; 547 i2 = idx[1]; 548 idx += 2; 549 tmp0 = x[i1]; 550 tmp1 = x[i2]; 551 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 552 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 553 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 554 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 555 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 556 } 557 if (n == sz-1){ 558 tmp0 = x[*idx++]; 559 sum1 += *v1++ * tmp0; 560 sum2 += *v2++ * tmp0; 561 sum3 += *v3++ * tmp0; 562 sum4 += *v4++ * tmp0; 563 sum5 += *v5++ * tmp0; 564 } 565 y[row++]=sum1; 566 y[row++]=sum2; 567 y[row++]=sum3; 568 y[row++]=sum4; 569 y[row++]=sum5; 570 v1 =v5; /* Since the next block to be processed starts there */ 571 idx +=4*sz; 572 break; 573 default : 574 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 575 } 576 } 577 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 578 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 579 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 580 PetscFunctionReturn(0); 581 } 582 /* ----------------------------------------------------------- */ 583 /* Almost same code as the MatMult_Inode() */ 584 #undef __FUNCT__ 585 #define __FUNCT__ "MatMultAdd_Inode" 586 static PetscErrorCode MatMultAdd_Inode(Mat A,Vec xx,Vec zz,Vec yy) 587 { 588 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 589 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 590 MatScalar *v1,*v2,*v3,*v4,*v5; 591 PetscScalar *x,*y,*z,*zt; 592 PetscErrorCode ierr; 593 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; 594 595 PetscFunctionBegin; 596 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 597 node_max = a->inode.node_count; 598 ns = a->inode.size; /* Node Size array */ 599 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 600 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 601 if (zz != yy) { 602 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 603 } else { 604 z = y; 605 } 606 zt = z; 607 608 idx = a->j; 609 v1 = a->a; 610 ii = a->i; 611 612 for (i = 0,row = 0; i< node_max; ++i){ 613 nsz = ns[i]; 614 n = ii[1] - ii[0]; 615 ii += nsz; 616 sz = n; /* No of non zeros in this row */ 617 /* Switch on the size of Node */ 618 switch (nsz){ /* Each loop in 'case' is unrolled */ 619 case 1 : 620 sum1 = *zt++; 621 622 for(n = 0; n< sz-1; n+=2) { 623 i1 = idx[0]; /* The instructions are ordered to */ 624 i2 = idx[1]; /* make the compiler's job easy */ 625 idx += 2; 626 tmp0 = x[i1]; 627 tmp1 = x[i2]; 628 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 629 } 630 631 if(n == sz-1){ /* Take care of the last nonzero */ 632 tmp0 = x[*idx++]; 633 sum1 += *v1++ * tmp0; 634 } 635 y[row++]=sum1; 636 break; 637 case 2: 638 sum1 = *zt++; 639 sum2 = *zt++; 640 v2 = v1 + n; 641 642 for(n = 0; n< sz-1; n+=2) { 643 i1 = idx[0]; 644 i2 = idx[1]; 645 idx += 2; 646 tmp0 = x[i1]; 647 tmp1 = x[i2]; 648 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 649 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 650 } 651 if(n == sz-1){ 652 tmp0 = x[*idx++]; 653 sum1 += *v1++ * tmp0; 654 sum2 += *v2++ * tmp0; 655 } 656 y[row++]=sum1; 657 y[row++]=sum2; 658 v1 =v2; /* Since the next block to be processed starts there*/ 659 idx +=sz; 660 break; 661 case 3: 662 sum1 = *zt++; 663 sum2 = *zt++; 664 sum3 = *zt++; 665 v2 = v1 + n; 666 v3 = v2 + n; 667 668 for (n = 0; n< sz-1; n+=2) { 669 i1 = idx[0]; 670 i2 = idx[1]; 671 idx += 2; 672 tmp0 = x[i1]; 673 tmp1 = x[i2]; 674 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 675 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 676 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 677 } 678 if (n == sz-1){ 679 tmp0 = x[*idx++]; 680 sum1 += *v1++ * tmp0; 681 sum2 += *v2++ * tmp0; 682 sum3 += *v3++ * tmp0; 683 } 684 y[row++]=sum1; 685 y[row++]=sum2; 686 y[row++]=sum3; 687 v1 =v3; /* Since the next block to be processed starts there*/ 688 idx +=2*sz; 689 break; 690 case 4: 691 sum1 = *zt++; 692 sum2 = *zt++; 693 sum3 = *zt++; 694 sum4 = *zt++; 695 v2 = v1 + n; 696 v3 = v2 + n; 697 v4 = v3 + n; 698 699 for (n = 0; n< sz-1; n+=2) { 700 i1 = idx[0]; 701 i2 = idx[1]; 702 idx += 2; 703 tmp0 = x[i1]; 704 tmp1 = x[i2]; 705 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 706 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 707 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 708 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 709 } 710 if (n == sz-1){ 711 tmp0 = x[*idx++]; 712 sum1 += *v1++ * tmp0; 713 sum2 += *v2++ * tmp0; 714 sum3 += *v3++ * tmp0; 715 sum4 += *v4++ * tmp0; 716 } 717 y[row++]=sum1; 718 y[row++]=sum2; 719 y[row++]=sum3; 720 y[row++]=sum4; 721 v1 =v4; /* Since the next block to be processed starts there*/ 722 idx +=3*sz; 723 break; 724 case 5: 725 sum1 = *zt++; 726 sum2 = *zt++; 727 sum3 = *zt++; 728 sum4 = *zt++; 729 sum5 = *zt++; 730 v2 = v1 + n; 731 v3 = v2 + n; 732 v4 = v3 + n; 733 v5 = v4 + n; 734 735 for (n = 0; n<sz-1; n+=2) { 736 i1 = idx[0]; 737 i2 = idx[1]; 738 idx += 2; 739 tmp0 = x[i1]; 740 tmp1 = x[i2]; 741 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 742 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 743 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 744 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 745 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 746 } 747 if(n == sz-1){ 748 tmp0 = x[*idx++]; 749 sum1 += *v1++ * tmp0; 750 sum2 += *v2++ * tmp0; 751 sum3 += *v3++ * tmp0; 752 sum4 += *v4++ * tmp0; 753 sum5 += *v5++ * tmp0; 754 } 755 y[row++]=sum1; 756 y[row++]=sum2; 757 y[row++]=sum3; 758 y[row++]=sum4; 759 y[row++]=sum5; 760 v1 =v5; /* Since the next block to be processed starts there */ 761 idx +=4*sz; 762 break; 763 default : 764 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 765 } 766 } 767 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 768 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 769 if (zz != yy) { 770 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 771 } 772 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 773 PetscFunctionReturn(0); 774 } 775 776 /* ----------------------------------------------------------- */ 777 #undef __FUNCT__ 778 #define __FUNCT__ "MatSolve_Inode" 779 PetscErrorCode MatSolve_Inode(Mat A,Vec bb,Vec xx) 780 { 781 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 782 IS iscol = a->col,isrow = a->row; 783 PetscErrorCode ierr; 784 const PetscInt *r,*c,*rout,*cout; 785 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 786 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 787 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 788 PetscScalar sum1,sum2,sum3,sum4,sum5; 789 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 790 const PetscScalar *b; 791 792 PetscFunctionBegin; 793 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 794 node_max = a->inode.node_count; 795 ns = a->inode.size; /* Node Size array */ 796 797 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 798 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 799 tmp = a->solve_work; 800 801 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 802 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 803 804 /* forward solve the lower triangular */ 805 tmps = tmp ; 806 aa = a_a ; 807 aj = a_j ; 808 ad = a->diag; 809 810 for (i = 0,row = 0; i< node_max; ++i){ 811 nsz = ns[i]; 812 aii = ai[row]; 813 v1 = aa + aii; 814 vi = aj + aii; 815 nz = ad[row]- aii; 816 817 switch (nsz){ /* Each loop in 'case' is unrolled */ 818 case 1 : 819 sum1 = b[*r++]; 820 /* while (nz--) sum1 -= *v1++ *tmps[*vi++];*/ 821 for(j=0; j<nz-1; j+=2){ 822 i0 = vi[0]; 823 i1 = vi[1]; 824 vi +=2; 825 tmp0 = tmps[i0]; 826 tmp1 = tmps[i1]; 827 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 828 } 829 if(j == nz-1){ 830 tmp0 = tmps[*vi++]; 831 sum1 -= *v1++ *tmp0; 832 } 833 tmp[row ++]=sum1; 834 break; 835 case 2: 836 sum1 = b[*r++]; 837 sum2 = b[*r++]; 838 v2 = aa + ai[row+1]; 839 840 for(j=0; j<nz-1; j+=2){ 841 i0 = vi[0]; 842 i1 = vi[1]; 843 vi +=2; 844 tmp0 = tmps[i0]; 845 tmp1 = tmps[i1]; 846 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 847 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 848 } 849 if(j == nz-1){ 850 tmp0 = tmps[*vi++]; 851 sum1 -= *v1++ *tmp0; 852 sum2 -= *v2++ *tmp0; 853 } 854 sum2 -= *v2++ * sum1; 855 tmp[row ++]=sum1; 856 tmp[row ++]=sum2; 857 break; 858 case 3: 859 sum1 = b[*r++]; 860 sum2 = b[*r++]; 861 sum3 = b[*r++]; 862 v2 = aa + ai[row+1]; 863 v3 = aa + ai[row+2]; 864 865 for (j=0; j<nz-1; j+=2){ 866 i0 = vi[0]; 867 i1 = vi[1]; 868 vi +=2; 869 tmp0 = tmps[i0]; 870 tmp1 = tmps[i1]; 871 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 872 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 873 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 874 } 875 if (j == nz-1){ 876 tmp0 = tmps[*vi++]; 877 sum1 -= *v1++ *tmp0; 878 sum2 -= *v2++ *tmp0; 879 sum3 -= *v3++ *tmp0; 880 } 881 sum2 -= *v2++ * sum1; 882 sum3 -= *v3++ * sum1; 883 sum3 -= *v3++ * sum2; 884 tmp[row ++]=sum1; 885 tmp[row ++]=sum2; 886 tmp[row ++]=sum3; 887 break; 888 889 case 4: 890 sum1 = b[*r++]; 891 sum2 = b[*r++]; 892 sum3 = b[*r++]; 893 sum4 = b[*r++]; 894 v2 = aa + ai[row+1]; 895 v3 = aa + ai[row+2]; 896 v4 = aa + ai[row+3]; 897 898 for (j=0; j<nz-1; j+=2){ 899 i0 = vi[0]; 900 i1 = vi[1]; 901 vi +=2; 902 tmp0 = tmps[i0]; 903 tmp1 = tmps[i1]; 904 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 905 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 906 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 907 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 908 } 909 if (j == nz-1){ 910 tmp0 = tmps[*vi++]; 911 sum1 -= *v1++ *tmp0; 912 sum2 -= *v2++ *tmp0; 913 sum3 -= *v3++ *tmp0; 914 sum4 -= *v4++ *tmp0; 915 } 916 sum2 -= *v2++ * sum1; 917 sum3 -= *v3++ * sum1; 918 sum4 -= *v4++ * sum1; 919 sum3 -= *v3++ * sum2; 920 sum4 -= *v4++ * sum2; 921 sum4 -= *v4++ * sum3; 922 923 tmp[row ++]=sum1; 924 tmp[row ++]=sum2; 925 tmp[row ++]=sum3; 926 tmp[row ++]=sum4; 927 break; 928 case 5: 929 sum1 = b[*r++]; 930 sum2 = b[*r++]; 931 sum3 = b[*r++]; 932 sum4 = b[*r++]; 933 sum5 = b[*r++]; 934 v2 = aa + ai[row+1]; 935 v3 = aa + ai[row+2]; 936 v4 = aa + ai[row+3]; 937 v5 = aa + ai[row+4]; 938 939 for (j=0; j<nz-1; j+=2){ 940 i0 = vi[0]; 941 i1 = vi[1]; 942 vi +=2; 943 tmp0 = tmps[i0]; 944 tmp1 = tmps[i1]; 945 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 946 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 947 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 948 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 949 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 950 } 951 if (j == nz-1){ 952 tmp0 = tmps[*vi++]; 953 sum1 -= *v1++ *tmp0; 954 sum2 -= *v2++ *tmp0; 955 sum3 -= *v3++ *tmp0; 956 sum4 -= *v4++ *tmp0; 957 sum5 -= *v5++ *tmp0; 958 } 959 960 sum2 -= *v2++ * sum1; 961 sum3 -= *v3++ * sum1; 962 sum4 -= *v4++ * sum1; 963 sum5 -= *v5++ * sum1; 964 sum3 -= *v3++ * sum2; 965 sum4 -= *v4++ * sum2; 966 sum5 -= *v5++ * sum2; 967 sum4 -= *v4++ * sum3; 968 sum5 -= *v5++ * sum3; 969 sum5 -= *v5++ * sum4; 970 971 tmp[row ++]=sum1; 972 tmp[row ++]=sum2; 973 tmp[row ++]=sum3; 974 tmp[row ++]=sum4; 975 tmp[row ++]=sum5; 976 break; 977 default: 978 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 979 } 980 } 981 /* backward solve the upper triangular */ 982 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 983 nsz = ns[i]; 984 aii = ai[row+1] -1; 985 v1 = aa + aii; 986 vi = aj + aii; 987 nz = aii- ad[row]; 988 switch (nsz){ /* Each loop in 'case' is unrolled */ 989 case 1 : 990 sum1 = tmp[row]; 991 992 for(j=nz ; j>1; j-=2){ 993 vi -=2; 994 i0 = vi[2]; 995 i1 = vi[1]; 996 tmp0 = tmps[i0]; 997 tmp1 = tmps[i1]; 998 v1 -= 2; 999 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1000 } 1001 if (j==1){ 1002 tmp0 = tmps[*vi--]; 1003 sum1 -= *v1-- * tmp0; 1004 } 1005 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1006 break; 1007 case 2 : 1008 sum1 = tmp[row]; 1009 sum2 = tmp[row -1]; 1010 v2 = aa + ai[row]-1; 1011 for (j=nz ; j>1; j-=2){ 1012 vi -=2; 1013 i0 = vi[2]; 1014 i1 = vi[1]; 1015 tmp0 = tmps[i0]; 1016 tmp1 = tmps[i1]; 1017 v1 -= 2; 1018 v2 -= 2; 1019 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1020 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1021 } 1022 if (j==1){ 1023 tmp0 = tmps[*vi--]; 1024 sum1 -= *v1-- * tmp0; 1025 sum2 -= *v2-- * tmp0; 1026 } 1027 1028 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1029 sum2 -= *v2-- * tmp0; 1030 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1031 break; 1032 case 3 : 1033 sum1 = tmp[row]; 1034 sum2 = tmp[row -1]; 1035 sum3 = tmp[row -2]; 1036 v2 = aa + ai[row]-1; 1037 v3 = aa + ai[row -1]-1; 1038 for (j=nz ; j>1; j-=2){ 1039 vi -=2; 1040 i0 = vi[2]; 1041 i1 = vi[1]; 1042 tmp0 = tmps[i0]; 1043 tmp1 = tmps[i1]; 1044 v1 -= 2; 1045 v2 -= 2; 1046 v3 -= 2; 1047 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1048 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1049 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1050 } 1051 if (j==1){ 1052 tmp0 = tmps[*vi--]; 1053 sum1 -= *v1-- * tmp0; 1054 sum2 -= *v2-- * tmp0; 1055 sum3 -= *v3-- * tmp0; 1056 } 1057 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1058 sum2 -= *v2-- * tmp0; 1059 sum3 -= *v3-- * tmp0; 1060 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1061 sum3 -= *v3-- * tmp0; 1062 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1063 1064 break; 1065 case 4 : 1066 sum1 = tmp[row]; 1067 sum2 = tmp[row -1]; 1068 sum3 = tmp[row -2]; 1069 sum4 = tmp[row -3]; 1070 v2 = aa + ai[row]-1; 1071 v3 = aa + ai[row -1]-1; 1072 v4 = aa + ai[row -2]-1; 1073 1074 for (j=nz ; j>1; j-=2){ 1075 vi -=2; 1076 i0 = vi[2]; 1077 i1 = vi[1]; 1078 tmp0 = tmps[i0]; 1079 tmp1 = tmps[i1]; 1080 v1 -= 2; 1081 v2 -= 2; 1082 v3 -= 2; 1083 v4 -= 2; 1084 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1085 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1086 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1087 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1088 } 1089 if (j==1){ 1090 tmp0 = tmps[*vi--]; 1091 sum1 -= *v1-- * tmp0; 1092 sum2 -= *v2-- * tmp0; 1093 sum3 -= *v3-- * tmp0; 1094 sum4 -= *v4-- * tmp0; 1095 } 1096 1097 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1098 sum2 -= *v2-- * tmp0; 1099 sum3 -= *v3-- * tmp0; 1100 sum4 -= *v4-- * tmp0; 1101 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1102 sum3 -= *v3-- * tmp0; 1103 sum4 -= *v4-- * tmp0; 1104 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1105 sum4 -= *v4-- * tmp0; 1106 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1107 break; 1108 case 5 : 1109 sum1 = tmp[row]; 1110 sum2 = tmp[row -1]; 1111 sum3 = tmp[row -2]; 1112 sum4 = tmp[row -3]; 1113 sum5 = tmp[row -4]; 1114 v2 = aa + ai[row]-1; 1115 v3 = aa + ai[row -1]-1; 1116 v4 = aa + ai[row -2]-1; 1117 v5 = aa + ai[row -3]-1; 1118 for (j=nz ; j>1; j-=2){ 1119 vi -= 2; 1120 i0 = vi[2]; 1121 i1 = vi[1]; 1122 tmp0 = tmps[i0]; 1123 tmp1 = tmps[i1]; 1124 v1 -= 2; 1125 v2 -= 2; 1126 v3 -= 2; 1127 v4 -= 2; 1128 v5 -= 2; 1129 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1130 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1131 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1132 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1133 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1134 } 1135 if (j==1){ 1136 tmp0 = tmps[*vi--]; 1137 sum1 -= *v1-- * tmp0; 1138 sum2 -= *v2-- * tmp0; 1139 sum3 -= *v3-- * tmp0; 1140 sum4 -= *v4-- * tmp0; 1141 sum5 -= *v5-- * tmp0; 1142 } 1143 1144 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1145 sum2 -= *v2-- * tmp0; 1146 sum3 -= *v3-- * tmp0; 1147 sum4 -= *v4-- * tmp0; 1148 sum5 -= *v5-- * tmp0; 1149 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1150 sum3 -= *v3-- * tmp0; 1151 sum4 -= *v4-- * tmp0; 1152 sum5 -= *v5-- * tmp0; 1153 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1154 sum4 -= *v4-- * tmp0; 1155 sum5 -= *v5-- * tmp0; 1156 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1157 sum5 -= *v5-- * tmp0; 1158 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1159 break; 1160 default: 1161 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1162 } 1163 } 1164 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1165 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1166 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1167 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1168 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1169 PetscFunctionReturn(0); 1170 } 1171 1172 #undef __FUNCT__ 1173 #define __FUNCT__ "MatLUFactorNumeric_Inode" 1174 /* 1175 Not currently used because the basic version is often a good amount faster 1176 */ 1177 PetscErrorCode MatLUFactorNumeric_Inode(Mat B,Mat A,const MatFactorInfo *info) 1178 { 1179 Mat C = B; 1180 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1181 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1182 PetscErrorCode ierr; 1183 const PetscInt *ns,*r,*ic,*c,*ics,*bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,*bi = b->i,*ai = a->i,*aj = a->j,*bd = b->diag,*pj; 1184 PetscInt n = A->rmap->n; 1185 PetscInt nz,nz_tmp,row,prow; 1186 PetscInt i,j,idx,node_max,nodesz; 1187 PetscInt *tmp_vec1,*tmp_vec2,*nsmap; 1188 PetscScalar mul1,mul2,mul3,tmp; 1189 const MatScalar *pv,*v1,*v2,*v3,*aa = a->a,*rtmp1; 1190 MatScalar *rtmp11,*pc1,*pc2,*pc3,*ba = b->a,*rtmp22,*rtmp33;; 1191 PetscReal rs=0.0; 1192 LUShift_Ctx sctx; 1193 PetscInt newshift; 1194 1195 PetscFunctionBegin; 1196 sctx.shift_top = 0; 1197 sctx.nshift_max = 0; 1198 sctx.shift_lo = 0; 1199 sctx.shift_hi = 0; 1200 sctx.shift_fraction = 0; 1201 1202 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1203 if (info->shiftpd) { /* set sctx.shift_top=max{rs} */ 1204 sctx.shift_top = 0; 1205 for (i=0; i<n; i++) { 1206 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1207 rs = 0.0; 1208 ajtmp = aj + ai[i]; 1209 rtmp1 = aa + ai[i]; 1210 nz = ai[i+1] - ai[i]; 1211 for (j=0; j<nz; j++){ 1212 if (*ajtmp != i){ 1213 rs += PetscAbsScalar(*rtmp1++); 1214 } else { 1215 rs -= PetscRealPart(*rtmp1++); 1216 } 1217 ajtmp++; 1218 } 1219 if (rs>sctx.shift_top) sctx.shift_top = rs; 1220 } 1221 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1222 sctx.shift_top *= 1.1; 1223 sctx.nshift_max = 5; 1224 sctx.shift_lo = 0.; 1225 sctx.shift_hi = 1.; 1226 } 1227 sctx.shift_amount = 0; 1228 sctx.nshift = 0; 1229 1230 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1231 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1232 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1233 ierr = PetscMalloc3(n,PetscScalar,&rtmp11,n,PetscScalar,&rtmp22,n,PetscScalar,&rtmp33);CHKERRQ(ierr); 1234 ics = ic ; 1235 1236 node_max = a->inode.node_count; 1237 ns = a->inode.size; 1238 if (!ns){ 1239 SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information"); 1240 } 1241 1242 /* If max inode size > 3, split it into two inodes.*/ 1243 /* also map the inode sizes according to the ordering */ 1244 ierr = PetscMalloc2(n,PetscInt,&tmp_vec1,n,PetscInt,&nsmap);CHKERRQ(ierr); 1245 for (i=0,j=0; i<node_max; ++i,++j){ 1246 if (ns[i]>3) { 1247 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1248 ++j; 1249 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1250 } else { 1251 tmp_vec1[j] = ns[i]; 1252 } 1253 } 1254 /* Use the correct node_max */ 1255 node_max = j; 1256 1257 /* Now reorder the inode info based on mat re-ordering info */ 1258 /* First create a row -> inode_size_array_index map */ 1259 ierr = PetscMalloc(node_max*sizeof(PetscInt),&tmp_vec2);CHKERRQ(ierr); 1260 for (i=0,row=0; i<node_max; i++) { 1261 nodesz = tmp_vec1[i]; 1262 for (j=0; j<nodesz; j++,row++) { 1263 nsmap[row] = i; 1264 } 1265 } 1266 /* Using nsmap, create a reordered ns structure */ 1267 for (i=0,j=0; i< node_max; i++) { 1268 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1269 tmp_vec2[i] = nodesz; 1270 j += nodesz; 1271 } 1272 ierr = PetscFree2(tmp_vec1,nsmap);CHKERRQ(ierr); 1273 /* Now use the correct ns */ 1274 ns = tmp_vec2; 1275 1276 do { 1277 sctx.lushift = PETSC_FALSE; 1278 /* Now loop over each block-row, and do the factorization */ 1279 for (i=0,row=0; i<node_max; i++) { 1280 nodesz = ns[i]; 1281 nz = bi[row+1] - bi[row]; 1282 bjtmp = bj + bi[row]; 1283 1284 switch (nodesz){ 1285 case 1: 1286 for (j=0; j<nz; j++){ 1287 rtmp11[bjtmp[j]] = 0.0; 1288 } 1289 1290 /* load in initial (unfactored row) */ 1291 idx = r[row]; 1292 nz_tmp = ai[idx+1] - ai[idx]; 1293 ajtmp = aj + ai[idx]; 1294 v1 = aa + ai[idx]; 1295 1296 for (j=0; j<nz_tmp; j++) { 1297 idx = ics[ajtmp[j]]; 1298 rtmp11[idx] = v1[j]; 1299 } 1300 rtmp11[ics[r[row]]] += sctx.shift_amount; 1301 1302 prow = *bjtmp++ ; 1303 while (prow < row) { 1304 pc1 = rtmp11 + prow; 1305 if (*pc1 != 0.0){ 1306 pv = ba + bd[prow]; 1307 pj = nbj + bd[prow]; 1308 mul1 = *pc1 * *pv++; 1309 *pc1 = mul1; 1310 nz_tmp = bi[prow+1] - bd[prow] - 1; 1311 ierr = PetscLogFlops(2.0*nz_tmp);CHKERRQ(ierr); 1312 for (j=0; j<nz_tmp; j++) { 1313 rtmp11[pj[i]] -= mul1 * pv[j]; 1314 } 1315 } 1316 prow = *bjtmp++ ; 1317 } 1318 pj = bj + bi[row]; 1319 pc1 = ba + bi[row]; 1320 1321 sctx.pv = rtmp11[row]; 1322 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 1323 rs = 0.0; 1324 for (j=0; j<nz; j++) { 1325 idx = pj[j]; 1326 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 1327 if (idx != row) rs += PetscAbsScalar(pc1[j]); 1328 } 1329 sctx.rs = rs; 1330 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1331 if (newshift == 1) goto endofwhile; 1332 break; 1333 1334 case 2: 1335 for (j=0; j<nz; j++) { 1336 idx = bjtmp[j]; 1337 rtmp11[idx] = 0.0; 1338 rtmp22[idx] = 0.0; 1339 } 1340 1341 /* load in initial (unfactored row) */ 1342 idx = r[row]; 1343 nz_tmp = ai[idx+1] - ai[idx]; 1344 ajtmp = aj + ai[idx]; 1345 v1 = aa + ai[idx]; 1346 v2 = aa + ai[idx+1]; 1347 for (j=0; j<nz_tmp; j++) { 1348 idx = ics[ajtmp[j]]; 1349 rtmp11[idx] = v1[j]; 1350 rtmp22[idx] = v2[j]; 1351 } 1352 rtmp11[ics[r[row]]] += sctx.shift_amount; 1353 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 1354 1355 prow = *bjtmp++ ; 1356 while (prow < row) { 1357 pc1 = rtmp11 + prow; 1358 pc2 = rtmp22 + prow; 1359 if (*pc1 != 0.0 || *pc2 != 0.0){ 1360 pv = ba + bd[prow]; 1361 pj = nbj + bd[prow]; 1362 mul1 = *pc1 * *pv; 1363 mul2 = *pc2 * *pv; 1364 ++pv; 1365 *pc1 = mul1; 1366 *pc2 = mul2; 1367 1368 nz_tmp = bi[prow+1] - bd[prow] - 1; 1369 for (j=0; j<nz_tmp; j++) { 1370 tmp = pv[j]; 1371 idx = pj[j]; 1372 rtmp11[idx] -= mul1 * tmp; 1373 rtmp22[idx] -= mul2 * tmp; 1374 } 1375 ierr = PetscLogFlops(4.0*nz_tmp);CHKERRQ(ierr); 1376 } 1377 prow = *bjtmp++ ; 1378 } 1379 1380 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 1381 pc1 = rtmp11 + prow; 1382 pc2 = rtmp22 + prow; 1383 1384 sctx.pv = *pc1; 1385 pj = bj + bi[prow]; 1386 rs = 0.0; 1387 for (j=0; j<nz; j++){ 1388 idx = pj[j]; 1389 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 1390 } 1391 sctx.rs = rs; 1392 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1393 if (newshift == 1) goto endofwhile; 1394 1395 if (*pc2 != 0.0){ 1396 pj = nbj + bd[prow]; 1397 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 1398 *pc2 = mul2; 1399 nz_tmp = bi[prow+1] - bd[prow] - 1; 1400 for (j=0; j<nz_tmp; j++) { 1401 idx = pj[j] ; 1402 tmp = rtmp11[idx]; 1403 rtmp22[idx] -= mul2 * tmp; 1404 } 1405 ierr = PetscLogFlops(2.0*nz_tmp);CHKERRQ(ierr); 1406 } 1407 1408 pj = bj + bi[row]; 1409 pc1 = ba + bi[row]; 1410 pc2 = ba + bi[row+1]; 1411 1412 sctx.pv = rtmp22[row+1]; 1413 rs = 0.0; 1414 rtmp11[row] = 1.0/rtmp11[row]; 1415 rtmp22[row+1] = 1.0/rtmp22[row+1]; 1416 /* copy row entries from dense representation to sparse */ 1417 for (j=0; j<nz; j++) { 1418 idx = pj[j]; 1419 pc1[j] = rtmp11[idx]; 1420 pc2[j] = rtmp22[idx]; 1421 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 1422 } 1423 sctx.rs = rs; 1424 ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr); 1425 if (newshift == 1) goto endofwhile; 1426 break; 1427 1428 case 3: 1429 for (j=0; j<nz; j++) { 1430 idx = bjtmp[j]; 1431 rtmp11[idx] = 0.0; 1432 rtmp22[idx] = 0.0; 1433 rtmp33[idx] = 0.0; 1434 } 1435 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 1436 idx = r[row]; 1437 nz_tmp = ai[idx+1] - ai[idx]; 1438 ajtmp = aj + ai[idx]; 1439 v1 = aa + ai[idx]; 1440 v2 = aa + ai[idx+1]; 1441 v3 = aa + ai[idx+2]; 1442 for (j=0; j<nz_tmp; j++) { 1443 idx = ics[ajtmp[j]]; 1444 rtmp11[idx] = v1[j]; 1445 rtmp22[idx] = v2[j]; 1446 rtmp33[idx] = v3[j]; 1447 } 1448 rtmp11[ics[r[row]]] += sctx.shift_amount; 1449 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 1450 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 1451 1452 /* loop over all pivot row blocks above this row block */ 1453 prow = *bjtmp++ ; 1454 while (prow < row) { 1455 pc1 = rtmp11 + prow; 1456 pc2 = rtmp22 + prow; 1457 pc3 = rtmp33 + prow; 1458 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){ 1459 pv = ba + bd[prow]; 1460 pj = nbj + bd[prow]; 1461 mul1 = *pc1 * *pv; 1462 mul2 = *pc2 * *pv; 1463 mul3 = *pc3 * *pv; 1464 ++pv; 1465 *pc1 = mul1; 1466 *pc2 = mul2; 1467 *pc3 = mul3; 1468 1469 nz_tmp = bi[prow+1] - bd[prow] - 1; 1470 /* update this row based on pivot row */ 1471 for (j=0; j<nz_tmp; j++) { 1472 tmp = pv[j]; 1473 idx = pj[j]; 1474 rtmp11[idx] -= mul1 * tmp; 1475 rtmp22[idx] -= mul2 * tmp; 1476 rtmp33[idx] -= mul3 * tmp; 1477 } 1478 ierr = PetscLogFlops(6.0*nz_tmp);CHKERRQ(ierr); 1479 } 1480 prow = *bjtmp++ ; 1481 } 1482 1483 /* Now take care of diagonal 3x3 block in this set of rows */ 1484 /* note: prow = row here */ 1485 pc1 = rtmp11 + prow; 1486 pc2 = rtmp22 + prow; 1487 pc3 = rtmp33 + prow; 1488 1489 sctx.pv = *pc1; 1490 pj = bj + bi[prow]; 1491 rs = 0.0; 1492 for (j=0; j<nz; j++){ 1493 idx = pj[j]; 1494 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 1495 } 1496 sctx.rs = rs; 1497 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1498 if (newshift == 1) goto endofwhile; 1499 1500 if (*pc2 != 0.0 || *pc3 != 0.0){ 1501 mul2 = (*pc2)/(*pc1); 1502 mul3 = (*pc3)/(*pc1); 1503 *pc2 = mul2; 1504 *pc3 = mul3; 1505 nz_tmp = bi[prow+1] - bd[prow] - 1; 1506 pj = nbj + bd[prow]; 1507 for (j=0; j<nz_tmp; j++) { 1508 idx = pj[j] ; 1509 tmp = rtmp11[idx]; 1510 rtmp22[idx] -= mul2 * tmp; 1511 rtmp33[idx] -= mul3 * tmp; 1512 } 1513 ierr = PetscLogFlops(4.0*nz_tmp);CHKERRQ(ierr); 1514 } 1515 ++prow; 1516 1517 pc2 = rtmp22 + prow; 1518 pc3 = rtmp33 + prow; 1519 sctx.pv = *pc2; 1520 pj = bj + bi[prow]; 1521 rs = 0.0; 1522 for (j=0; j<nz; j++){ 1523 idx = pj[j]; 1524 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 1525 } 1526 sctx.rs = rs; 1527 ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr); 1528 if (newshift == 1) goto endofwhile; 1529 1530 if (*pc3 != 0.0){ 1531 mul3 = (*pc3)/(*pc2); 1532 *pc3 = mul3; 1533 pj = nbj + bd[prow]; 1534 nz_tmp = bi[prow+1] - bd[prow] - 1; 1535 for (j=0; j<nz_tmp; j++) { 1536 idx = pj[j] ; 1537 tmp = rtmp22[idx]; 1538 rtmp33[idx] -= mul3 * tmp; 1539 } 1540 ierr = PetscLogFlops(4.0*nz_tmp);CHKERRQ(ierr); 1541 } 1542 1543 pj = bj + bi[row]; 1544 pc1 = ba + bi[row]; 1545 pc2 = ba + bi[row+1]; 1546 pc3 = ba + bi[row+2]; 1547 1548 sctx.pv = rtmp33[row+2]; 1549 rs = 0.0; 1550 rtmp11[row] = 1.0/rtmp11[row]; 1551 rtmp22[row+1] = 1.0/rtmp22[row+1]; 1552 rtmp33[row+2] = 1.0/rtmp33[row+2]; 1553 /* copy row entries from dense representation to sparse */ 1554 for (j=0; j<nz; j++) { 1555 idx = pj[j]; 1556 pc1[j] = rtmp11[idx]; 1557 pc2[j] = rtmp22[idx]; 1558 pc3[j] = rtmp33[idx]; 1559 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 1560 } 1561 1562 sctx.rs = rs; 1563 ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr); 1564 if (newshift == 1) goto endofwhile; 1565 break; 1566 1567 default: 1568 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1569 } 1570 row += nodesz; /* Update the row */ 1571 } 1572 endofwhile:; 1573 } while (sctx.lushift); 1574 ierr = PetscFree3(rtmp11,rtmp22,rtmp33);CHKERRQ(ierr); 1575 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1576 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1577 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1578 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 1579 (B)->ops->solve = MatSolve_Inode; 1580 /* do not set solve add, since MatSolve_Inode + Add is faster */ 1581 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1582 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1583 C->assembled = PETSC_TRUE; 1584 C->preallocated = PETSC_TRUE; 1585 if (sctx.nshift) { 1586 if (info->shiftpd) { 1587 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 1588 } else if (info->shiftnz) { 1589 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 1590 } 1591 } 1592 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 1593 PetscFunctionReturn(0); 1594 } 1595 1596 /* 1597 Makes a longer coloring[] array and calls the usual code with that 1598 */ 1599 #undef __FUNCT__ 1600 #define __FUNCT__ "MatColoringPatch_Inode" 1601 PetscErrorCode MatColoringPatch_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 1602 { 1603 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 1604 PetscErrorCode ierr; 1605 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 1606 PetscInt *colorused,i; 1607 ISColoringValue *newcolor; 1608 1609 PetscFunctionBegin; 1610 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr); 1611 /* loop over inodes, marking a color for each column*/ 1612 row = 0; 1613 for (i=0; i<m; i++){ 1614 for (j=0; j<ns[i]; j++) { 1615 newcolor[row++] = coloring[i] + j*ncolors; 1616 } 1617 } 1618 1619 /* eliminate unneeded colors */ 1620 ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr); 1621 ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr); 1622 for (i=0; i<n; i++) { 1623 colorused[newcolor[i]] = 1; 1624 } 1625 1626 for (i=1; i<5*ncolors; i++) { 1627 colorused[i] += colorused[i-1]; 1628 } 1629 ncolors = colorused[5*ncolors-1]; 1630 for (i=0; i<n; i++) { 1631 newcolor[i] = colorused[newcolor[i]]-1; 1632 } 1633 ierr = PetscFree(colorused);CHKERRQ(ierr); 1634 ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); 1635 ierr = PetscFree(coloring);CHKERRQ(ierr); 1636 PetscFunctionReturn(0); 1637 } 1638 1639 #include "../src/mat/blockinvert.h" 1640 1641 #undef __FUNCT__ 1642 #define __FUNCT__ "MatRelax_Inode" 1643 PetscErrorCode MatRelax_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 1644 { 1645 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1646 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3; 1647 MatScalar *ibdiag,*bdiag; 1648 PetscScalar *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5; 1649 const MatScalar *v = a->a,*v1,*v2,*v3,*v4,*v5; 1650 PetscReal zeropivot = 1.0e-15, shift = 0.0; 1651 PetscErrorCode ierr; 1652 PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2; 1653 PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k; 1654 1655 PetscFunctionBegin; 1656 if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 1657 if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 1658 if (its > 1) { 1659 /* switch to non-inode version */ 1660 ierr = MatRelax_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr); 1661 PetscFunctionReturn(0); 1662 } 1663 1664 if (!a->inode.ibdiagvalid) { 1665 if (!a->inode.ibdiag) { 1666 /* calculate space needed for diagonal blocks */ 1667 for (i=0; i<m; i++) { 1668 cnt += sizes[i]*sizes[i]; 1669 } 1670 a->inode.bdiagsize = cnt; 1671 ierr = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr); 1672 } 1673 1674 /* copy over the diagonal blocks and invert them */ 1675 ibdiag = a->inode.ibdiag; 1676 bdiag = a->inode.bdiag; 1677 cnt = 0; 1678 for (i=0, row = 0; i<m; i++) { 1679 for (j=0; j<sizes[i]; j++) { 1680 for (k=0; k<sizes[i]; k++) { 1681 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 1682 } 1683 } 1684 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 1685 1686 switch(sizes[i]) { 1687 case 1: 1688 /* Create matrix data structure */ 1689 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 1690 ibdiag[cnt] = 1.0/ibdiag[cnt]; 1691 break; 1692 case 2: 1693 ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr); 1694 break; 1695 case 3: 1696 ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr); 1697 break; 1698 case 4: 1699 ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr); 1700 break; 1701 case 5: 1702 ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,shift);CHKERRQ(ierr); 1703 break; 1704 default: 1705 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1706 } 1707 cnt += sizes[i]*sizes[i]; 1708 row += sizes[i]; 1709 } 1710 a->inode.ibdiagvalid = PETSC_TRUE; 1711 } 1712 ibdiag = a->inode.ibdiag; 1713 bdiag = a->inode.bdiag; 1714 1715 1716 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1717 if (flag & SOR_ZERO_INITIAL_GUESS) { 1718 PetscScalar *b; 1719 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1720 if (xx != bb) { 1721 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1722 } else { 1723 b = x; 1724 } 1725 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 1726 1727 for (i=0, row=0; i<m; i++) { 1728 sz = diag[row] - ii[row]; 1729 v1 = a->a + ii[row]; 1730 idx = a->j + ii[row]; 1731 1732 /* see comments for MatMult_Inode() for how this is coded */ 1733 switch (sizes[i]){ 1734 case 1: 1735 1736 sum1 = b[row]; 1737 for(n = 0; n<sz-1; n+=2) { 1738 i1 = idx[0]; 1739 i2 = idx[1]; 1740 idx += 2; 1741 tmp0 = x[i1]; 1742 tmp1 = x[i2]; 1743 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1744 } 1745 1746 if (n == sz-1){ 1747 tmp0 = x[*idx]; 1748 sum1 -= *v1 * tmp0; 1749 } 1750 x[row++] = sum1*(*ibdiag++); 1751 break; 1752 case 2: 1753 v2 = a->a + ii[row+1]; 1754 sum1 = b[row]; 1755 sum2 = b[row+1]; 1756 for(n = 0; n<sz-1; n+=2) { 1757 i1 = idx[0]; 1758 i2 = idx[1]; 1759 idx += 2; 1760 tmp0 = x[i1]; 1761 tmp1 = x[i2]; 1762 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1763 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1764 } 1765 1766 if (n == sz-1){ 1767 tmp0 = x[*idx]; 1768 sum1 -= v1[0] * tmp0; 1769 sum2 -= v2[0] * tmp0; 1770 } 1771 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 1772 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 1773 ibdiag += 4; 1774 break; 1775 case 3: 1776 v2 = a->a + ii[row+1]; 1777 v3 = a->a + ii[row+2]; 1778 sum1 = b[row]; 1779 sum2 = b[row+1]; 1780 sum3 = b[row+2]; 1781 for(n = 0; n<sz-1; n+=2) { 1782 i1 = idx[0]; 1783 i2 = idx[1]; 1784 idx += 2; 1785 tmp0 = x[i1]; 1786 tmp1 = x[i2]; 1787 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1788 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1789 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1790 } 1791 1792 if (n == sz-1){ 1793 tmp0 = x[*idx]; 1794 sum1 -= v1[0] * tmp0; 1795 sum2 -= v2[0] * tmp0; 1796 sum3 -= v3[0] * tmp0; 1797 } 1798 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 1799 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 1800 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 1801 ibdiag += 9; 1802 break; 1803 case 4: 1804 v2 = a->a + ii[row+1]; 1805 v3 = a->a + ii[row+2]; 1806 v4 = a->a + ii[row+3]; 1807 sum1 = b[row]; 1808 sum2 = b[row+1]; 1809 sum3 = b[row+2]; 1810 sum4 = b[row+3]; 1811 for(n = 0; n<sz-1; n+=2) { 1812 i1 = idx[0]; 1813 i2 = idx[1]; 1814 idx += 2; 1815 tmp0 = x[i1]; 1816 tmp1 = x[i2]; 1817 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1818 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1819 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1820 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1821 } 1822 1823 if (n == sz-1){ 1824 tmp0 = x[*idx]; 1825 sum1 -= v1[0] * tmp0; 1826 sum2 -= v2[0] * tmp0; 1827 sum3 -= v3[0] * tmp0; 1828 sum4 -= v4[0] * tmp0; 1829 } 1830 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 1831 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 1832 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 1833 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 1834 ibdiag += 16; 1835 break; 1836 case 5: 1837 v2 = a->a + ii[row+1]; 1838 v3 = a->a + ii[row+2]; 1839 v4 = a->a + ii[row+3]; 1840 v5 = a->a + ii[row+4]; 1841 sum1 = b[row]; 1842 sum2 = b[row+1]; 1843 sum3 = b[row+2]; 1844 sum4 = b[row+3]; 1845 sum5 = b[row+4]; 1846 for(n = 0; n<sz-1; n+=2) { 1847 i1 = idx[0]; 1848 i2 = idx[1]; 1849 idx += 2; 1850 tmp0 = x[i1]; 1851 tmp1 = x[i2]; 1852 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1853 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1854 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1855 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1856 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 1857 } 1858 1859 if (n == sz-1){ 1860 tmp0 = x[*idx]; 1861 sum1 -= v1[0] * tmp0; 1862 sum2 -= v2[0] * tmp0; 1863 sum3 -= v3[0] * tmp0; 1864 sum4 -= v4[0] * tmp0; 1865 sum5 -= v5[0] * tmp0; 1866 } 1867 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 1868 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 1869 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 1870 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 1871 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 1872 ibdiag += 25; 1873 break; 1874 default: 1875 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1876 } 1877 } 1878 1879 xb = x; 1880 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 1881 } else xb = b; 1882 if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) && 1883 (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1884 cnt = 0; 1885 for (i=0, row=0; i<m; i++) { 1886 1887 switch (sizes[i]){ 1888 case 1: 1889 x[row++] *= bdiag[cnt++]; 1890 break; 1891 case 2: 1892 x1 = x[row]; x2 = x[row+1]; 1893 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 1894 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 1895 x[row++] = tmp1; 1896 x[row++] = tmp2; 1897 cnt += 4; 1898 break; 1899 case 3: 1900 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 1901 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 1902 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 1903 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 1904 x[row++] = tmp1; 1905 x[row++] = tmp2; 1906 x[row++] = tmp3; 1907 cnt += 9; 1908 break; 1909 case 4: 1910 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 1911 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 1912 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 1913 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 1914 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 1915 x[row++] = tmp1; 1916 x[row++] = tmp2; 1917 x[row++] = tmp3; 1918 x[row++] = tmp4; 1919 cnt += 16; 1920 break; 1921 case 5: 1922 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 1923 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 1924 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 1925 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 1926 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 1927 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 1928 x[row++] = tmp1; 1929 x[row++] = tmp2; 1930 x[row++] = tmp3; 1931 x[row++] = tmp4; 1932 x[row++] = tmp5; 1933 cnt += 25; 1934 break; 1935 default: 1936 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1937 } 1938 } 1939 ierr = PetscLogFlops(m);CHKERRQ(ierr); 1940 } 1941 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 1942 1943 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 1944 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 1945 ibdiag -= sizes[i]*sizes[i]; 1946 sz = ii[row+1] - diag[row] - 1; 1947 v1 = a->a + diag[row] + 1; 1948 idx = a->j + diag[row] + 1; 1949 1950 /* see comments for MatMult_Inode() for how this is coded */ 1951 switch (sizes[i]){ 1952 case 1: 1953 1954 sum1 = xb[row]; 1955 for(n = 0; n<sz-1; n+=2) { 1956 i1 = idx[0]; 1957 i2 = idx[1]; 1958 idx += 2; 1959 tmp0 = x[i1]; 1960 tmp1 = x[i2]; 1961 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1962 } 1963 1964 if (n == sz-1){ 1965 tmp0 = x[*idx]; 1966 sum1 -= *v1*tmp0; 1967 } 1968 x[row--] = sum1*(*ibdiag); 1969 break; 1970 1971 case 2: 1972 1973 sum1 = xb[row]; 1974 sum2 = xb[row-1]; 1975 /* note that sum1 is associated with the second of the two rows */ 1976 v2 = a->a + diag[row-1] + 2; 1977 for(n = 0; n<sz-1; n+=2) { 1978 i1 = idx[0]; 1979 i2 = idx[1]; 1980 idx += 2; 1981 tmp0 = x[i1]; 1982 tmp1 = x[i2]; 1983 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1984 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1985 } 1986 1987 if (n == sz-1){ 1988 tmp0 = x[*idx]; 1989 sum1 -= *v1*tmp0; 1990 sum2 -= *v2*tmp0; 1991 } 1992 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 1993 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 1994 break; 1995 case 3: 1996 1997 sum1 = xb[row]; 1998 sum2 = xb[row-1]; 1999 sum3 = xb[row-2]; 2000 v2 = a->a + diag[row-1] + 2; 2001 v3 = a->a + diag[row-2] + 3; 2002 for(n = 0; n<sz-1; n+=2) { 2003 i1 = idx[0]; 2004 i2 = idx[1]; 2005 idx += 2; 2006 tmp0 = x[i1]; 2007 tmp1 = x[i2]; 2008 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2009 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2010 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2011 } 2012 2013 if (n == sz-1){ 2014 tmp0 = x[*idx]; 2015 sum1 -= *v1*tmp0; 2016 sum2 -= *v2*tmp0; 2017 sum3 -= *v3*tmp0; 2018 } 2019 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 2020 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 2021 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 2022 break; 2023 case 4: 2024 2025 sum1 = xb[row]; 2026 sum2 = xb[row-1]; 2027 sum3 = xb[row-2]; 2028 sum4 = xb[row-3]; 2029 v2 = a->a + diag[row-1] + 2; 2030 v3 = a->a + diag[row-2] + 3; 2031 v4 = a->a + diag[row-3] + 4; 2032 for(n = 0; n<sz-1; n+=2) { 2033 i1 = idx[0]; 2034 i2 = idx[1]; 2035 idx += 2; 2036 tmp0 = x[i1]; 2037 tmp1 = x[i2]; 2038 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2039 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2040 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2041 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2042 } 2043 2044 if (n == sz-1){ 2045 tmp0 = x[*idx]; 2046 sum1 -= *v1*tmp0; 2047 sum2 -= *v2*tmp0; 2048 sum3 -= *v3*tmp0; 2049 sum4 -= *v4*tmp0; 2050 } 2051 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 2052 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 2053 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 2054 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 2055 break; 2056 case 5: 2057 2058 sum1 = xb[row]; 2059 sum2 = xb[row-1]; 2060 sum3 = xb[row-2]; 2061 sum4 = xb[row-3]; 2062 sum5 = xb[row-4]; 2063 v2 = a->a + diag[row-1] + 2; 2064 v3 = a->a + diag[row-2] + 3; 2065 v4 = a->a + diag[row-3] + 4; 2066 v5 = a->a + diag[row-4] + 5; 2067 for(n = 0; n<sz-1; n+=2) { 2068 i1 = idx[0]; 2069 i2 = idx[1]; 2070 idx += 2; 2071 tmp0 = x[i1]; 2072 tmp1 = x[i2]; 2073 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2074 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2075 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2076 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2077 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2078 } 2079 2080 if (n == sz-1){ 2081 tmp0 = x[*idx]; 2082 sum1 -= *v1*tmp0; 2083 sum2 -= *v2*tmp0; 2084 sum3 -= *v3*tmp0; 2085 sum4 -= *v4*tmp0; 2086 sum5 -= *v5*tmp0; 2087 } 2088 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 2089 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 2090 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 2091 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 2092 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 2093 break; 2094 default: 2095 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2096 } 2097 } 2098 2099 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2100 } 2101 its--; 2102 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2103 if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);} 2104 } 2105 if (flag & SOR_EISENSTAT) { 2106 const PetscScalar *b; 2107 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2108 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2109 MatScalar *t = a->inode.ssor_work; 2110 /* 2111 Apply (U + D)^-1 where D is now the block diagonal 2112 */ 2113 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 2114 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 2115 ibdiag -= sizes[i]*sizes[i]; 2116 sz = ii[row+1] - diag[row] - 1; 2117 v1 = a->a + diag[row] + 1; 2118 idx = a->j + diag[row] + 1; 2119 CHKMEMQ; 2120 /* see comments for MatMult_Inode() for how this is coded */ 2121 switch (sizes[i]){ 2122 case 1: 2123 2124 sum1 = b[row]; 2125 for(n = 0; n<sz-1; n+=2) { 2126 i1 = idx[0]; 2127 i2 = idx[1]; 2128 idx += 2; 2129 tmp0 = x[i1]; 2130 tmp1 = x[i2]; 2131 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2132 } 2133 2134 if (n == sz-1){ 2135 tmp0 = x[*idx]; 2136 sum1 -= *v1*tmp0; 2137 } 2138 x[row] = sum1*(*ibdiag);row--; 2139 break; 2140 2141 case 2: 2142 2143 sum1 = b[row]; 2144 sum2 = b[row-1]; 2145 /* note that sum1 is associated with the second of the two rows */ 2146 v2 = a->a + diag[row-1] + 2; 2147 for(n = 0; n<sz-1; n+=2) { 2148 i1 = idx[0]; 2149 i2 = idx[1]; 2150 idx += 2; 2151 tmp0 = x[i1]; 2152 tmp1 = x[i2]; 2153 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2154 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2155 } 2156 2157 if (n == sz-1){ 2158 tmp0 = x[*idx]; 2159 sum1 -= *v1*tmp0; 2160 sum2 -= *v2*tmp0; 2161 } 2162 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 2163 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 2164 row -= 2; 2165 break; 2166 case 3: 2167 2168 sum1 = b[row]; 2169 sum2 = b[row-1]; 2170 sum3 = b[row-2]; 2171 v2 = a->a + diag[row-1] + 2; 2172 v3 = a->a + diag[row-2] + 3; 2173 for(n = 0; n<sz-1; n+=2) { 2174 i1 = idx[0]; 2175 i2 = idx[1]; 2176 idx += 2; 2177 tmp0 = x[i1]; 2178 tmp1 = x[i2]; 2179 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2180 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2181 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2182 } 2183 2184 if (n == sz-1){ 2185 tmp0 = x[*idx]; 2186 sum1 -= *v1*tmp0; 2187 sum2 -= *v2*tmp0; 2188 sum3 -= *v3*tmp0; 2189 } 2190 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 2191 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 2192 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 2193 row -= 3; 2194 break; 2195 case 4: 2196 2197 sum1 = b[row]; 2198 sum2 = b[row-1]; 2199 sum3 = b[row-2]; 2200 sum4 = b[row-3]; 2201 v2 = a->a + diag[row-1] + 2; 2202 v3 = a->a + diag[row-2] + 3; 2203 v4 = a->a + diag[row-3] + 4; 2204 for(n = 0; n<sz-1; n+=2) { 2205 i1 = idx[0]; 2206 i2 = idx[1]; 2207 idx += 2; 2208 tmp0 = x[i1]; 2209 tmp1 = x[i2]; 2210 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2211 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2212 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2213 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2214 } 2215 2216 if (n == sz-1){ 2217 tmp0 = x[*idx]; 2218 sum1 -= *v1*tmp0; 2219 sum2 -= *v2*tmp0; 2220 sum3 -= *v3*tmp0; 2221 sum4 -= *v4*tmp0; 2222 } 2223 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 2224 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 2225 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 2226 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 2227 row -= 4; 2228 break; 2229 case 5: 2230 2231 sum1 = b[row]; 2232 sum2 = b[row-1]; 2233 sum3 = b[row-2]; 2234 sum4 = b[row-3]; 2235 sum5 = b[row-4]; 2236 v2 = a->a + diag[row-1] + 2; 2237 v3 = a->a + diag[row-2] + 3; 2238 v4 = a->a + diag[row-3] + 4; 2239 v5 = a->a + diag[row-4] + 5; 2240 for(n = 0; n<sz-1; n+=2) { 2241 i1 = idx[0]; 2242 i2 = idx[1]; 2243 idx += 2; 2244 tmp0 = x[i1]; 2245 tmp1 = x[i2]; 2246 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2247 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2248 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2249 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2250 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2251 } 2252 2253 if (n == sz-1){ 2254 tmp0 = x[*idx]; 2255 sum1 -= *v1*tmp0; 2256 sum2 -= *v2*tmp0; 2257 sum3 -= *v3*tmp0; 2258 sum4 -= *v4*tmp0; 2259 sum5 -= *v5*tmp0; 2260 } 2261 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 2262 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 2263 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 2264 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 2265 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 2266 row -= 5; 2267 break; 2268 default: 2269 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2270 } 2271 CHKMEMQ; 2272 } 2273 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2274 2275 /* 2276 t = b - D x where D is the block diagonal 2277 */ 2278 cnt = 0; 2279 for (i=0, row=0; i<m; i++) { 2280 CHKMEMQ; 2281 switch (sizes[i]){ 2282 case 1: 2283 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 2284 break; 2285 case 2: 2286 x1 = x[row]; x2 = x[row+1]; 2287 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 2288 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 2289 t[row] = b[row] - tmp1; 2290 t[row+1] = b[row+1] - tmp2; row += 2; 2291 cnt += 4; 2292 break; 2293 case 3: 2294 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 2295 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 2296 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 2297 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 2298 t[row] = b[row] - tmp1; 2299 t[row+1] = b[row+1] - tmp2; 2300 t[row+2] = b[row+2] - tmp3; row += 3; 2301 cnt += 9; 2302 break; 2303 case 4: 2304 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 2305 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 2306 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 2307 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 2308 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 2309 t[row] = b[row] - tmp1; 2310 t[row+1] = b[row+1] - tmp2; 2311 t[row+2] = b[row+2] - tmp3; 2312 t[row+3] = b[row+3] - tmp4; row += 4; 2313 cnt += 16; 2314 break; 2315 case 5: 2316 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 2317 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 2318 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 2319 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 2320 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 2321 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 2322 t[row] = b[row] - tmp1; 2323 t[row+1] = b[row+1] - tmp2; 2324 t[row+2] = b[row+2] - tmp3; 2325 t[row+3] = b[row+3] - tmp4; 2326 t[row+4] = b[row+4] - tmp5;row += 5; 2327 cnt += 25; 2328 break; 2329 default: 2330 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2331 } 2332 CHKMEMQ; 2333 } 2334 ierr = PetscLogFlops(m);CHKERRQ(ierr); 2335 2336 2337 2338 /* 2339 Apply (L + D)^-1 where D is the block diagonal 2340 */ 2341 for (i=0, row=0; i<m; i++) { 2342 sz = diag[row] - ii[row]; 2343 v1 = a->a + ii[row]; 2344 idx = a->j + ii[row]; 2345 CHKMEMQ; 2346 /* see comments for MatMult_Inode() for how this is coded */ 2347 switch (sizes[i]){ 2348 case 1: 2349 2350 sum1 = t[row]; 2351 for(n = 0; n<sz-1; n+=2) { 2352 i1 = idx[0]; 2353 i2 = idx[1]; 2354 idx += 2; 2355 tmp0 = t[i1]; 2356 tmp1 = t[i2]; 2357 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2358 } 2359 2360 if (n == sz-1){ 2361 tmp0 = t[*idx]; 2362 sum1 -= *v1 * tmp0; 2363 } 2364 x[row] += t[row] = sum1*(*ibdiag++); row++; 2365 break; 2366 case 2: 2367 v2 = a->a + ii[row+1]; 2368 sum1 = t[row]; 2369 sum2 = t[row+1]; 2370 for(n = 0; n<sz-1; n+=2) { 2371 i1 = idx[0]; 2372 i2 = idx[1]; 2373 idx += 2; 2374 tmp0 = t[i1]; 2375 tmp1 = t[i2]; 2376 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2377 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2378 } 2379 2380 if (n == sz-1){ 2381 tmp0 = t[*idx]; 2382 sum1 -= v1[0] * tmp0; 2383 sum2 -= v2[0] * tmp0; 2384 } 2385 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2386 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2387 ibdiag += 4; row += 2; 2388 break; 2389 case 3: 2390 v2 = a->a + ii[row+1]; 2391 v3 = a->a + ii[row+2]; 2392 sum1 = t[row]; 2393 sum2 = t[row+1]; 2394 sum3 = t[row+2]; 2395 for(n = 0; n<sz-1; n+=2) { 2396 i1 = idx[0]; 2397 i2 = idx[1]; 2398 idx += 2; 2399 tmp0 = t[i1]; 2400 tmp1 = t[i2]; 2401 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2402 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2403 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2404 } 2405 2406 if (n == sz-1){ 2407 tmp0 = t[*idx]; 2408 sum1 -= v1[0] * tmp0; 2409 sum2 -= v2[0] * tmp0; 2410 sum3 -= v3[0] * tmp0; 2411 } 2412 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2413 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2414 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2415 ibdiag += 9; row += 3; 2416 break; 2417 case 4: 2418 v2 = a->a + ii[row+1]; 2419 v3 = a->a + ii[row+2]; 2420 v4 = a->a + ii[row+3]; 2421 sum1 = t[row]; 2422 sum2 = t[row+1]; 2423 sum3 = t[row+2]; 2424 sum4 = t[row+3]; 2425 for(n = 0; n<sz-1; n+=2) { 2426 i1 = idx[0]; 2427 i2 = idx[1]; 2428 idx += 2; 2429 tmp0 = t[i1]; 2430 tmp1 = t[i2]; 2431 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2432 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2433 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2434 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2435 } 2436 2437 if (n == sz-1){ 2438 tmp0 = t[*idx]; 2439 sum1 -= v1[0] * tmp0; 2440 sum2 -= v2[0] * tmp0; 2441 sum3 -= v3[0] * tmp0; 2442 sum4 -= v4[0] * tmp0; 2443 } 2444 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2445 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2446 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2447 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2448 ibdiag += 16; row += 4; 2449 break; 2450 case 5: 2451 v2 = a->a + ii[row+1]; 2452 v3 = a->a + ii[row+2]; 2453 v4 = a->a + ii[row+3]; 2454 v5 = a->a + ii[row+4]; 2455 sum1 = t[row]; 2456 sum2 = t[row+1]; 2457 sum3 = t[row+2]; 2458 sum4 = t[row+3]; 2459 sum5 = t[row+4]; 2460 for(n = 0; n<sz-1; n+=2) { 2461 i1 = idx[0]; 2462 i2 = idx[1]; 2463 idx += 2; 2464 tmp0 = t[i1]; 2465 tmp1 = t[i2]; 2466 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2467 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2468 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2469 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2470 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2471 } 2472 2473 if (n == sz-1){ 2474 tmp0 = t[*idx]; 2475 sum1 -= v1[0] * tmp0; 2476 sum2 -= v2[0] * tmp0; 2477 sum3 -= v3[0] * tmp0; 2478 sum4 -= v4[0] * tmp0; 2479 sum5 -= v5[0] * tmp0; 2480 } 2481 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2482 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2483 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2484 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2485 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2486 ibdiag += 25; row += 5; 2487 break; 2488 default: 2489 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2490 } 2491 CHKMEMQ; 2492 } 2493 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2494 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2495 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2496 } 2497 PetscFunctionReturn(0); 2498 } 2499 2500 #undef __FUNCT__ 2501 #define __FUNCT__ "MatMultDiagonalBlock_Inode" 2502 PetscErrorCode MatMultDiagonalBlock_Inode(Mat A,Vec bb,Vec xx) 2503 { 2504 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2505 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 2506 const MatScalar *bdiag = a->inode.bdiag; 2507 const PetscScalar *b; 2508 PetscErrorCode ierr; 2509 PetscInt m = a->inode.node_count,cnt = 0,i,row; 2510 const PetscInt *sizes = a->inode.size; 2511 2512 PetscFunctionBegin; 2513 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2514 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2515 cnt = 0; 2516 for (i=0, row=0; i<m; i++) { 2517 switch (sizes[i]){ 2518 case 1: 2519 x[row] = b[row]*bdiag[cnt++];row++; 2520 break; 2521 case 2: 2522 x1 = b[row]; x2 = b[row+1]; 2523 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 2524 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 2525 x[row++] = tmp1; 2526 x[row++] = tmp2; 2527 cnt += 4; 2528 break; 2529 case 3: 2530 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 2531 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 2532 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 2533 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 2534 x[row++] = tmp1; 2535 x[row++] = tmp2; 2536 x[row++] = tmp3; 2537 cnt += 9; 2538 break; 2539 case 4: 2540 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 2541 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 2542 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 2543 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 2544 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 2545 x[row++] = tmp1; 2546 x[row++] = tmp2; 2547 x[row++] = tmp3; 2548 x[row++] = tmp4; 2549 cnt += 16; 2550 break; 2551 case 5: 2552 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 2553 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 2554 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 2555 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 2556 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 2557 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 2558 x[row++] = tmp1; 2559 x[row++] = tmp2; 2560 x[row++] = tmp3; 2561 x[row++] = tmp4; 2562 x[row++] = tmp5; 2563 cnt += 25; 2564 break; 2565 default: 2566 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2567 } 2568 } 2569 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 2570 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2571 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2572 PetscFunctionReturn(0); 2573 } 2574 2575 extern PetscErrorCode MatMultDiagonalBlock_Inode(Mat,Vec,Vec); 2576 /* 2577 samestructure indicates that the matrix has not changed its nonzero structure so we 2578 do not need to recompute the inodes 2579 */ 2580 #undef __FUNCT__ 2581 #define __FUNCT__ "Mat_CheckInode" 2582 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure) 2583 { 2584 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2585 PetscErrorCode ierr; 2586 PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size; 2587 PetscTruth flag; 2588 2589 PetscFunctionBegin; 2590 if (!a->inode.use) PetscFunctionReturn(0); 2591 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 2592 2593 2594 m = A->rmap->n; 2595 if (a->inode.size) {ns = a->inode.size;} 2596 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 2597 2598 i = 0; 2599 node_count = 0; 2600 idx = a->j; 2601 ii = a->i; 2602 while (i < m){ /* For each row */ 2603 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 2604 /* Limits the number of elements in a node to 'a->inode.limit' */ 2605 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 2606 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 2607 if (nzy != nzx) break; 2608 idy += nzx; /* Same nonzero pattern */ 2609 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 2610 if (!flag) break; 2611 } 2612 ns[node_count++] = blk_size; 2613 idx += blk_size*nzx; 2614 i = j; 2615 } 2616 /* If not enough inodes found,, do not use inode version of the routines */ 2617 if (!a->inode.size && m && node_count > .9*m) { 2618 ierr = PetscFree(ns);CHKERRQ(ierr); 2619 a->inode.node_count = 0; 2620 a->inode.size = PETSC_NULL; 2621 a->inode.use = PETSC_FALSE; 2622 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 2623 } else { 2624 A->ops->mult = MatMult_Inode; 2625 A->ops->relax = MatRelax_Inode; 2626 A->ops->multadd = MatMultAdd_Inode; 2627 A->ops->getrowij = MatGetRowIJ_Inode; 2628 A->ops->restorerowij = MatRestoreRowIJ_Inode; 2629 A->ops->getcolumnij = MatGetColumnIJ_Inode; 2630 A->ops->restorecolumnij = MatRestoreColumnIJ_Inode; 2631 A->ops->coloringpatch = MatColoringPatch_Inode; 2632 A->ops->multdiagonalblock = MatMultDiagonalBlock_Inode; 2633 a->inode.node_count = node_count; 2634 a->inode.size = ns; 2635 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 2636 } 2637 PetscFunctionReturn(0); 2638 } 2639 2640 /* 2641 This is really ugly. if inodes are used this replaces the 2642 permutations with ones that correspond to rows/cols of the matrix 2643 rather then inode blocks 2644 */ 2645 #undef __FUNCT__ 2646 #define __FUNCT__ "MatInodeAdjustForInodes" 2647 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 2648 { 2649 PetscErrorCode ierr,(*f)(Mat,IS*,IS*); 2650 2651 PetscFunctionBegin; 2652 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr); 2653 if (f) { 2654 ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr); 2655 } 2656 PetscFunctionReturn(0); 2657 } 2658 2659 EXTERN_C_BEGIN 2660 #undef __FUNCT__ 2661 #define __FUNCT__ "MatAdjustForInodes_Inode" 2662 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_Inode(Mat A,IS *rperm,IS *cperm) 2663 { 2664 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 2665 PetscErrorCode ierr; 2666 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 2667 const PetscInt *ridx,*cidx; 2668 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 2669 PetscInt nslim_col,*ns_col; 2670 IS ris = *rperm,cis = *cperm; 2671 2672 PetscFunctionBegin; 2673 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 2674 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 2675 2676 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 2677 ierr = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 2678 ierr = PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);CHKERRQ(ierr); 2679 permc = permr + m; 2680 2681 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 2682 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 2683 2684 /* Form the inode structure for the rows of permuted matric using inv perm*/ 2685 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 2686 2687 /* Construct the permutations for rows*/ 2688 for (i=0,row = 0; i<nslim_row; ++i){ 2689 indx = ridx[i]; 2690 start_val = tns[indx]; 2691 end_val = tns[indx + 1]; 2692 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 2693 } 2694 2695 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 2696 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 2697 2698 /* Construct permutations for columns */ 2699 for (i=0,col=0; i<nslim_col; ++i){ 2700 indx = cidx[i]; 2701 start_val = tns[indx]; 2702 end_val = tns[indx + 1]; 2703 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 2704 } 2705 2706 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr); 2707 ISSetPermutation(*rperm); 2708 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr); 2709 ISSetPermutation(*cperm); 2710 2711 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 2712 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 2713 2714 ierr = PetscFree(ns_col);CHKERRQ(ierr); 2715 ierr = PetscFree(permr);CHKERRQ(ierr); 2716 ierr = ISDestroy(cis);CHKERRQ(ierr); 2717 ierr = ISDestroy(ris);CHKERRQ(ierr); 2718 ierr = PetscFree(tns);CHKERRQ(ierr); 2719 PetscFunctionReturn(0); 2720 } 2721 EXTERN_C_END 2722 2723 #undef __FUNCT__ 2724 #define __FUNCT__ "MatInodeGetInodeSizes" 2725 /*@C 2726 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 2727 2728 Collective on Mat 2729 2730 Input Parameter: 2731 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 2732 2733 Output Parameter: 2734 + node_count - no of inodes present in the matrix. 2735 . sizes - an array of size node_count,with sizes of each inode. 2736 - limit - the max size used to generate the inodes. 2737 2738 Level: advanced 2739 2740 Notes: This routine returns some internal storage information 2741 of the matrix, it is intended to be used by advanced users. 2742 It should be called after the matrix is assembled. 2743 The contents of the sizes[] array should not be changed. 2744 PETSC_NULL may be passed for information not requested. 2745 2746 .keywords: matrix, seqaij, get, inode 2747 2748 .seealso: MatGetInfo() 2749 @*/ 2750 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 2751 { 2752 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 2753 2754 PetscFunctionBegin; 2755 if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 2756 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr); 2757 if (f) { 2758 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 2759 } 2760 PetscFunctionReturn(0); 2761 } 2762 2763 EXTERN_C_BEGIN 2764 #undef __FUNCT__ 2765 #define __FUNCT__ "MatInodeGetInodeSizes_Inode" 2766 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 2767 { 2768 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2769 2770 PetscFunctionBegin; 2771 if (node_count) *node_count = a->inode.node_count; 2772 if (sizes) *sizes = a->inode.size; 2773 if (limit) *limit = a->inode.limit; 2774 PetscFunctionReturn(0); 2775 } 2776 EXTERN_C_END 2777