1 #define PETSCMAT_DLL 2 3 /* 4 This file provides high performance routines for the Inode format (compressed sparse row) 5 by taking advantage of rows with identical nonzero structure (I-nodes). 6 */ 7 #include "../src/mat/impls/aij/seq/aij.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "Mat_CreateColInode" 11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns) 12 { 13 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 16 17 PetscFunctionBegin; 18 n = A->cmap->n; 19 m = A->rmap->n; 20 ns_row = a->inode.size; 21 22 min_mn = (m < n) ? m : n; 23 if (!ns) { 24 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++); 25 for(; count+1 < n; count++,i++); 26 if (count < n) { 27 i++; 28 } 29 *size = i; 30 PetscFunctionReturn(0); 31 } 32 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr); 33 34 /* Use the same row structure wherever feasible. */ 35 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 36 ns_col[i] = ns_row[i]; 37 } 38 39 /* if m < n; pad up the remainder with inode_limit */ 40 for(; count+1 < n; count++,i++) { 41 ns_col[i] = 1; 42 } 43 /* The last node is the odd ball. padd it up with the remaining rows; */ 44 if (count < n) { 45 ns_col[i] = n - count; 46 i++; 47 } else if (count > n) { 48 /* Adjust for the over estimation */ 49 ns_col[i-1] += n - count; 50 } 51 *size = i; 52 *ns = ns_col; 53 PetscFunctionReturn(0); 54 } 55 56 57 /* 58 This builds symmetric version of nonzero structure, 59 */ 60 #undef __FUNCT__ 61 #define __FUNCT__ "MatGetRowIJ_Inode_Symmetric" 62 static PetscErrorCode MatGetRowIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 63 { 64 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 65 PetscErrorCode ierr; 66 PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n; 67 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j; 68 69 PetscFunctionBegin; 70 nslim_row = a->inode.node_count; 71 m = A->rmap->n; 72 n = A->cmap->n; 73 if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_Inode_Symmetric: Matrix should be square"); 74 75 /* Use the row_inode as column_inode */ 76 nslim_col = nslim_row; 77 ns_col = ns_row; 78 79 /* allocate space for reformated inode structure */ 80 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 81 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 82 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 83 84 for (i1=0,col=0; i1<nslim_col; ++i1){ 85 nsz = ns_col[i1]; 86 for (i2=0; i2<nsz; ++i2,++col) 87 tvc[col] = i1; 88 } 89 /* allocate space for row pointers */ 90 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 91 *iia = ia; 92 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 93 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 94 95 /* determine the number of columns in each row */ 96 ia[0] = oshift; 97 for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) { 98 99 j = aj + ai[row] + ishift; 100 jmax = aj + ai[row+1] + ishift; 101 i2 = 0; 102 col = *j++ + ishift; 103 i2 = tvc[col]; 104 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 105 ia[i1+1]++; 106 ia[i2+1]++; 107 i2++; /* Start col of next node */ 108 while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j; 109 i2 = tvc[col]; 110 } 111 if(i2 == i1) ia[i2+1]++; /* now the diagonal element */ 112 } 113 114 /* shift ia[i] to point to next row */ 115 for (i1=1; i1<nslim_row+1; i1++) { 116 row = ia[i1-1]; 117 ia[i1] += row; 118 work[i1-1] = row - oshift; 119 } 120 121 /* allocate space for column pointers */ 122 nz = ia[nslim_row] + (!ishift); 123 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 124 *jja = ja; 125 126 /* loop over lower triangular part putting into ja */ 127 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 128 j = aj + ai[row] + ishift; 129 jmax = aj + ai[row+1] + ishift; 130 i2 = 0; /* Col inode index */ 131 col = *j++ + ishift; 132 i2 = tvc[col]; 133 while (i2<i1 && j<jmax) { 134 ja[work[i2]++] = i1 + oshift; 135 ja[work[i1]++] = i2 + oshift; 136 ++i2; 137 while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */ 138 i2 = tvc[col]; 139 } 140 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 141 142 } 143 ierr = PetscFree(work);CHKERRQ(ierr); 144 ierr = PetscFree(tns);CHKERRQ(ierr); 145 ierr = PetscFree(tvc);CHKERRQ(ierr); 146 PetscFunctionReturn(0); 147 } 148 149 /* 150 This builds nonsymmetric version of nonzero structure, 151 */ 152 #undef __FUNCT__ 153 #define __FUNCT__ "MatGetRowIJ_Inode_Nonsymmetric" 154 static PetscErrorCode MatGetRowIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 155 { 156 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 157 PetscErrorCode ierr; 158 PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col; 159 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 160 161 PetscFunctionBegin; 162 nslim_row = a->inode.node_count; 163 n = A->cmap->n; 164 165 /* Create The column_inode for this matrix */ 166 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 167 168 /* allocate space for reformated column_inode structure */ 169 ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 170 ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 171 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 172 173 for (i1=0,col=0; i1<nslim_col; ++i1){ 174 nsz = ns_col[i1]; 175 for (i2=0; i2<nsz; ++i2,++col) 176 tvc[col] = i1; 177 } 178 /* allocate space for row pointers */ 179 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 180 *iia = ia; 181 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 182 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 183 184 /* determine the number of columns in each row */ 185 ia[0] = oshift; 186 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 187 j = aj + ai[row] + ishift; 188 col = *j++ + ishift; 189 i2 = tvc[col]; 190 nz = ai[row+1] - ai[row]; 191 while (nz-- > 0) { /* off-diagonal elemets */ 192 ia[i1+1]++; 193 i2++; /* Start col of next node */ 194 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 195 if (nz > 0) i2 = tvc[col]; 196 } 197 } 198 199 /* shift ia[i] to point to next row */ 200 for (i1=1; i1<nslim_row+1; i1++) { 201 row = ia[i1-1]; 202 ia[i1] += row; 203 work[i1-1] = row - oshift; 204 } 205 206 /* allocate space for column pointers */ 207 nz = ia[nslim_row] + (!ishift); 208 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 209 *jja = ja; 210 211 /* loop over matrix putting into ja */ 212 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 213 j = aj + ai[row] + ishift; 214 i2 = 0; /* Col inode index */ 215 col = *j++ + ishift; 216 i2 = tvc[col]; 217 nz = ai[row+1] - ai[row]; 218 while (nz-- > 0) { 219 ja[work[i1]++] = i2 + oshift; 220 ++i2; 221 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 222 if (nz > 0) i2 = tvc[col]; 223 } 224 } 225 ierr = PetscFree(ns_col);CHKERRQ(ierr); 226 ierr = PetscFree(work);CHKERRQ(ierr); 227 ierr = PetscFree(tns);CHKERRQ(ierr); 228 ierr = PetscFree(tvc);CHKERRQ(ierr); 229 PetscFunctionReturn(0); 230 } 231 232 #undef __FUNCT__ 233 #define __FUNCT__ "MatGetRowIJ_Inode" 234 static PetscErrorCode MatGetRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 235 { 236 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 237 PetscErrorCode ierr; 238 239 PetscFunctionBegin; 240 *n = a->inode.node_count; 241 if (!ia) PetscFunctionReturn(0); 242 if (!blockcompressed) { 243 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 244 } else if (symmetric) { 245 ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 246 } else { 247 ierr = MatGetRowIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 248 } 249 PetscFunctionReturn(0); 250 } 251 252 #undef __FUNCT__ 253 #define __FUNCT__ "MatRestoreRowIJ_Inode" 254 static PetscErrorCode MatRestoreRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 255 { 256 PetscErrorCode ierr; 257 258 PetscFunctionBegin; 259 if (!ia) PetscFunctionReturn(0); 260 261 if (!blockcompressed) { 262 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 263 } else { 264 ierr = PetscFree(*ia);CHKERRQ(ierr); 265 ierr = PetscFree(*ja);CHKERRQ(ierr); 266 } 267 268 PetscFunctionReturn(0); 269 } 270 271 /* ----------------------------------------------------------- */ 272 273 #undef __FUNCT__ 274 #define __FUNCT__ "MatGetColumnIJ_Inode_Nonsymmetric" 275 static PetscErrorCode MatGetColumnIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 276 { 277 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 278 PetscErrorCode ierr; 279 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 280 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 281 282 PetscFunctionBegin; 283 nslim_row = a->inode.node_count; 284 n = A->cmap->n; 285 286 /* Create The column_inode for this matrix */ 287 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 288 289 /* allocate space for reformated column_inode structure */ 290 ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 291 ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 292 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 293 294 for (i1=0,col=0; i1<nslim_col; ++i1){ 295 nsz = ns_col[i1]; 296 for (i2=0; i2<nsz; ++i2,++col) 297 tvc[col] = i1; 298 } 299 /* allocate space for column pointers */ 300 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 301 *iia = ia; 302 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 303 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 304 305 /* determine the number of columns in each row */ 306 ia[0] = oshift; 307 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 308 j = aj + ai[row] + ishift; 309 col = *j++ + ishift; 310 i2 = tvc[col]; 311 nz = ai[row+1] - ai[row]; 312 while (nz-- > 0) { /* off-diagonal elemets */ 313 /* ia[i1+1]++; */ 314 ia[i2+1]++; 315 i2++; 316 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 317 if (nz > 0) i2 = tvc[col]; 318 } 319 } 320 321 /* shift ia[i] to point to next col */ 322 for (i1=1; i1<nslim_col+1; i1++) { 323 col = ia[i1-1]; 324 ia[i1] += col; 325 work[i1-1] = col - oshift; 326 } 327 328 /* allocate space for column pointers */ 329 nz = ia[nslim_col] + (!ishift); 330 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 331 *jja = ja; 332 333 /* loop over matrix putting into ja */ 334 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 335 j = aj + ai[row] + ishift; 336 i2 = 0; /* Col inode index */ 337 col = *j++ + ishift; 338 i2 = tvc[col]; 339 nz = ai[row+1] - ai[row]; 340 while (nz-- > 0) { 341 /* ja[work[i1]++] = i2 + oshift; */ 342 ja[work[i2]++] = i1 + oshift; 343 i2++; 344 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 345 if (nz > 0) i2 = tvc[col]; 346 } 347 } 348 ierr = PetscFree(ns_col);CHKERRQ(ierr); 349 ierr = PetscFree(work);CHKERRQ(ierr); 350 ierr = PetscFree(tns);CHKERRQ(ierr); 351 ierr = PetscFree(tvc);CHKERRQ(ierr); 352 PetscFunctionReturn(0); 353 } 354 355 #undef __FUNCT__ 356 #define __FUNCT__ "MatGetColumnIJ_Inode" 357 static PetscErrorCode MatGetColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 358 { 359 PetscErrorCode ierr; 360 361 PetscFunctionBegin; 362 ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr); 363 if (!ia) PetscFunctionReturn(0); 364 365 if (!blockcompressed) { 366 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 367 } else if (symmetric) { 368 /* Since the indices are symmetric it does'nt matter */ 369 ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 370 } else { 371 ierr = MatGetColumnIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 372 } 373 PetscFunctionReturn(0); 374 } 375 376 #undef __FUNCT__ 377 #define __FUNCT__ "MatRestoreColumnIJ_Inode" 378 static PetscErrorCode MatRestoreColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 379 { 380 PetscErrorCode ierr; 381 382 PetscFunctionBegin; 383 if (!ia) PetscFunctionReturn(0); 384 if (!blockcompressed) { 385 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 386 } else { 387 ierr = PetscFree(*ia);CHKERRQ(ierr); 388 ierr = PetscFree(*ja);CHKERRQ(ierr); 389 } 390 PetscFunctionReturn(0); 391 } 392 393 /* ----------------------------------------------------------- */ 394 395 #undef __FUNCT__ 396 #define __FUNCT__ "MatMult_Inode" 397 static PetscErrorCode MatMult_Inode(Mat A,Vec xx,Vec yy) 398 { 399 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 400 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 401 PetscScalar *y; 402 const PetscScalar *x; 403 const MatScalar *v1,*v2,*v3,*v4,*v5; 404 PetscErrorCode ierr; 405 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0; 406 407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 409 #endif 410 411 PetscFunctionBegin; 412 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 413 node_max = a->inode.node_count; 414 ns = a->inode.size; /* Node Size array */ 415 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 416 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 417 idx = a->j; 418 v1 = a->a; 419 ii = a->i; 420 421 for (i = 0,row = 0; i< node_max; ++i){ 422 nsz = ns[i]; 423 n = ii[1] - ii[0]; 424 nonzerorow += (n>0)*nsz; 425 ii += nsz; 426 sz = n; /* No of non zeros in this row */ 427 /* Switch on the size of Node */ 428 switch (nsz){ /* Each loop in 'case' is unrolled */ 429 case 1 : 430 sum1 = 0; 431 432 for(n = 0; n< sz-1; n+=2) { 433 i1 = idx[0]; /* The instructions are ordered to */ 434 i2 = idx[1]; /* make the compiler's job easy */ 435 idx += 2; 436 tmp0 = x[i1]; 437 tmp1 = x[i2]; 438 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 439 } 440 441 if (n == sz-1){ /* Take care of the last nonzero */ 442 tmp0 = x[*idx++]; 443 sum1 += *v1++ * tmp0; 444 } 445 y[row++]=sum1; 446 break; 447 case 2: 448 sum1 = 0; 449 sum2 = 0; 450 v2 = v1 + n; 451 452 for (n = 0; n< sz-1; n+=2) { 453 i1 = idx[0]; 454 i2 = idx[1]; 455 idx += 2; 456 tmp0 = x[i1]; 457 tmp1 = x[i2]; 458 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 459 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 460 } 461 if (n == sz-1){ 462 tmp0 = x[*idx++]; 463 sum1 += *v1++ * tmp0; 464 sum2 += *v2++ * tmp0; 465 } 466 y[row++]=sum1; 467 y[row++]=sum2; 468 v1 =v2; /* Since the next block to be processed starts there*/ 469 idx +=sz; 470 break; 471 case 3: 472 sum1 = 0; 473 sum2 = 0; 474 sum3 = 0; 475 v2 = v1 + n; 476 v3 = v2 + n; 477 478 for (n = 0; n< sz-1; n+=2) { 479 i1 = idx[0]; 480 i2 = idx[1]; 481 idx += 2; 482 tmp0 = x[i1]; 483 tmp1 = x[i2]; 484 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 485 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 486 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 487 } 488 if (n == sz-1){ 489 tmp0 = x[*idx++]; 490 sum1 += *v1++ * tmp0; 491 sum2 += *v2++ * tmp0; 492 sum3 += *v3++ * tmp0; 493 } 494 y[row++]=sum1; 495 y[row++]=sum2; 496 y[row++]=sum3; 497 v1 =v3; /* Since the next block to be processed starts there*/ 498 idx +=2*sz; 499 break; 500 case 4: 501 sum1 = 0; 502 sum2 = 0; 503 sum3 = 0; 504 sum4 = 0; 505 v2 = v1 + n; 506 v3 = v2 + n; 507 v4 = v3 + n; 508 509 for (n = 0; n< sz-1; n+=2) { 510 i1 = idx[0]; 511 i2 = idx[1]; 512 idx += 2; 513 tmp0 = x[i1]; 514 tmp1 = x[i2]; 515 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 516 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 517 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 518 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 519 } 520 if (n == sz-1){ 521 tmp0 = x[*idx++]; 522 sum1 += *v1++ * tmp0; 523 sum2 += *v2++ * tmp0; 524 sum3 += *v3++ * tmp0; 525 sum4 += *v4++ * tmp0; 526 } 527 y[row++]=sum1; 528 y[row++]=sum2; 529 y[row++]=sum3; 530 y[row++]=sum4; 531 v1 =v4; /* Since the next block to be processed starts there*/ 532 idx +=3*sz; 533 break; 534 case 5: 535 sum1 = 0; 536 sum2 = 0; 537 sum3 = 0; 538 sum4 = 0; 539 sum5 = 0; 540 v2 = v1 + n; 541 v3 = v2 + n; 542 v4 = v3 + n; 543 v5 = v4 + n; 544 545 for (n = 0; n<sz-1; n+=2) { 546 i1 = idx[0]; 547 i2 = idx[1]; 548 idx += 2; 549 tmp0 = x[i1]; 550 tmp1 = x[i2]; 551 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 552 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 553 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 554 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 555 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 556 } 557 if (n == sz-1){ 558 tmp0 = x[*idx++]; 559 sum1 += *v1++ * tmp0; 560 sum2 += *v2++ * tmp0; 561 sum3 += *v3++ * tmp0; 562 sum4 += *v4++ * tmp0; 563 sum5 += *v5++ * tmp0; 564 } 565 y[row++]=sum1; 566 y[row++]=sum2; 567 y[row++]=sum3; 568 y[row++]=sum4; 569 y[row++]=sum5; 570 v1 =v5; /* Since the next block to be processed starts there */ 571 idx +=4*sz; 572 break; 573 default : 574 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 575 } 576 } 577 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 578 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 579 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 580 PetscFunctionReturn(0); 581 } 582 /* ----------------------------------------------------------- */ 583 /* Almost same code as the MatMult_Inode() */ 584 #undef __FUNCT__ 585 #define __FUNCT__ "MatMultAdd_Inode" 586 static PetscErrorCode MatMultAdd_Inode(Mat A,Vec xx,Vec zz,Vec yy) 587 { 588 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 589 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 590 MatScalar *v1,*v2,*v3,*v4,*v5; 591 PetscScalar *x,*y,*z,*zt; 592 PetscErrorCode ierr; 593 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; 594 595 PetscFunctionBegin; 596 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 597 node_max = a->inode.node_count; 598 ns = a->inode.size; /* Node Size array */ 599 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 600 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 601 if (zz != yy) { 602 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 603 } else { 604 z = y; 605 } 606 zt = z; 607 608 idx = a->j; 609 v1 = a->a; 610 ii = a->i; 611 612 for (i = 0,row = 0; i< node_max; ++i){ 613 nsz = ns[i]; 614 n = ii[1] - ii[0]; 615 ii += nsz; 616 sz = n; /* No of non zeros in this row */ 617 /* Switch on the size of Node */ 618 switch (nsz){ /* Each loop in 'case' is unrolled */ 619 case 1 : 620 sum1 = *zt++; 621 622 for(n = 0; n< sz-1; n+=2) { 623 i1 = idx[0]; /* The instructions are ordered to */ 624 i2 = idx[1]; /* make the compiler's job easy */ 625 idx += 2; 626 tmp0 = x[i1]; 627 tmp1 = x[i2]; 628 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 629 } 630 631 if(n == sz-1){ /* Take care of the last nonzero */ 632 tmp0 = x[*idx++]; 633 sum1 += *v1++ * tmp0; 634 } 635 y[row++]=sum1; 636 break; 637 case 2: 638 sum1 = *zt++; 639 sum2 = *zt++; 640 v2 = v1 + n; 641 642 for(n = 0; n< sz-1; n+=2) { 643 i1 = idx[0]; 644 i2 = idx[1]; 645 idx += 2; 646 tmp0 = x[i1]; 647 tmp1 = x[i2]; 648 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 649 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 650 } 651 if(n == sz-1){ 652 tmp0 = x[*idx++]; 653 sum1 += *v1++ * tmp0; 654 sum2 += *v2++ * tmp0; 655 } 656 y[row++]=sum1; 657 y[row++]=sum2; 658 v1 =v2; /* Since the next block to be processed starts there*/ 659 idx +=sz; 660 break; 661 case 3: 662 sum1 = *zt++; 663 sum2 = *zt++; 664 sum3 = *zt++; 665 v2 = v1 + n; 666 v3 = v2 + n; 667 668 for (n = 0; n< sz-1; n+=2) { 669 i1 = idx[0]; 670 i2 = idx[1]; 671 idx += 2; 672 tmp0 = x[i1]; 673 tmp1 = x[i2]; 674 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 675 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 676 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 677 } 678 if (n == sz-1){ 679 tmp0 = x[*idx++]; 680 sum1 += *v1++ * tmp0; 681 sum2 += *v2++ * tmp0; 682 sum3 += *v3++ * tmp0; 683 } 684 y[row++]=sum1; 685 y[row++]=sum2; 686 y[row++]=sum3; 687 v1 =v3; /* Since the next block to be processed starts there*/ 688 idx +=2*sz; 689 break; 690 case 4: 691 sum1 = *zt++; 692 sum2 = *zt++; 693 sum3 = *zt++; 694 sum4 = *zt++; 695 v2 = v1 + n; 696 v3 = v2 + n; 697 v4 = v3 + n; 698 699 for (n = 0; n< sz-1; n+=2) { 700 i1 = idx[0]; 701 i2 = idx[1]; 702 idx += 2; 703 tmp0 = x[i1]; 704 tmp1 = x[i2]; 705 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 706 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 707 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 708 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 709 } 710 if (n == sz-1){ 711 tmp0 = x[*idx++]; 712 sum1 += *v1++ * tmp0; 713 sum2 += *v2++ * tmp0; 714 sum3 += *v3++ * tmp0; 715 sum4 += *v4++ * tmp0; 716 } 717 y[row++]=sum1; 718 y[row++]=sum2; 719 y[row++]=sum3; 720 y[row++]=sum4; 721 v1 =v4; /* Since the next block to be processed starts there*/ 722 idx +=3*sz; 723 break; 724 case 5: 725 sum1 = *zt++; 726 sum2 = *zt++; 727 sum3 = *zt++; 728 sum4 = *zt++; 729 sum5 = *zt++; 730 v2 = v1 + n; 731 v3 = v2 + n; 732 v4 = v3 + n; 733 v5 = v4 + n; 734 735 for (n = 0; n<sz-1; n+=2) { 736 i1 = idx[0]; 737 i2 = idx[1]; 738 idx += 2; 739 tmp0 = x[i1]; 740 tmp1 = x[i2]; 741 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 742 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 743 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 744 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 745 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 746 } 747 if(n == sz-1){ 748 tmp0 = x[*idx++]; 749 sum1 += *v1++ * tmp0; 750 sum2 += *v2++ * tmp0; 751 sum3 += *v3++ * tmp0; 752 sum4 += *v4++ * tmp0; 753 sum5 += *v5++ * tmp0; 754 } 755 y[row++]=sum1; 756 y[row++]=sum2; 757 y[row++]=sum3; 758 y[row++]=sum4; 759 y[row++]=sum5; 760 v1 =v5; /* Since the next block to be processed starts there */ 761 idx +=4*sz; 762 break; 763 default : 764 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 765 } 766 } 767 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 768 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 769 if (zz != yy) { 770 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 771 } 772 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 773 PetscFunctionReturn(0); 774 } 775 776 /* ----------------------------------------------------------- */ 777 #undef __FUNCT__ 778 #define __FUNCT__ "MatSolve_Inode" 779 PetscErrorCode MatSolve_Inode(Mat A,Vec bb,Vec xx) 780 { 781 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 782 IS iscol = a->col,isrow = a->row; 783 PetscErrorCode ierr; 784 const PetscInt *r,*c,*rout,*cout; 785 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 786 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 787 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 788 PetscScalar sum1,sum2,sum3,sum4,sum5; 789 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 790 const PetscScalar *b; 791 792 PetscFunctionBegin; 793 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 794 node_max = a->inode.node_count; 795 ns = a->inode.size; /* Node Size array */ 796 797 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 798 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 799 tmp = a->solve_work; 800 801 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 802 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 803 804 /* forward solve the lower triangular */ 805 tmps = tmp ; 806 aa = a_a ; 807 aj = a_j ; 808 ad = a->diag; 809 810 for (i = 0,row = 0; i< node_max; ++i){ 811 nsz = ns[i]; 812 aii = ai[row]; 813 v1 = aa + aii; 814 vi = aj + aii; 815 nz = ad[row]- aii; 816 817 switch (nsz){ /* Each loop in 'case' is unrolled */ 818 case 1 : 819 sum1 = b[*r++]; 820 /* while (nz--) sum1 -= *v1++ *tmps[*vi++];*/ 821 for(j=0; j<nz-1; j+=2){ 822 i0 = vi[0]; 823 i1 = vi[1]; 824 vi +=2; 825 tmp0 = tmps[i0]; 826 tmp1 = tmps[i1]; 827 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 828 } 829 if(j == nz-1){ 830 tmp0 = tmps[*vi++]; 831 sum1 -= *v1++ *tmp0; 832 } 833 tmp[row ++]=sum1; 834 break; 835 case 2: 836 sum1 = b[*r++]; 837 sum2 = b[*r++]; 838 v2 = aa + ai[row+1]; 839 840 for(j=0; j<nz-1; j+=2){ 841 i0 = vi[0]; 842 i1 = vi[1]; 843 vi +=2; 844 tmp0 = tmps[i0]; 845 tmp1 = tmps[i1]; 846 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 847 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 848 } 849 if(j == nz-1){ 850 tmp0 = tmps[*vi++]; 851 sum1 -= *v1++ *tmp0; 852 sum2 -= *v2++ *tmp0; 853 } 854 sum2 -= *v2++ * sum1; 855 tmp[row ++]=sum1; 856 tmp[row ++]=sum2; 857 break; 858 case 3: 859 sum1 = b[*r++]; 860 sum2 = b[*r++]; 861 sum3 = b[*r++]; 862 v2 = aa + ai[row+1]; 863 v3 = aa + ai[row+2]; 864 865 for (j=0; j<nz-1; j+=2){ 866 i0 = vi[0]; 867 i1 = vi[1]; 868 vi +=2; 869 tmp0 = tmps[i0]; 870 tmp1 = tmps[i1]; 871 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 872 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 873 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 874 } 875 if (j == nz-1){ 876 tmp0 = tmps[*vi++]; 877 sum1 -= *v1++ *tmp0; 878 sum2 -= *v2++ *tmp0; 879 sum3 -= *v3++ *tmp0; 880 } 881 sum2 -= *v2++ * sum1; 882 sum3 -= *v3++ * sum1; 883 sum3 -= *v3++ * sum2; 884 tmp[row ++]=sum1; 885 tmp[row ++]=sum2; 886 tmp[row ++]=sum3; 887 break; 888 889 case 4: 890 sum1 = b[*r++]; 891 sum2 = b[*r++]; 892 sum3 = b[*r++]; 893 sum4 = b[*r++]; 894 v2 = aa + ai[row+1]; 895 v3 = aa + ai[row+2]; 896 v4 = aa + ai[row+3]; 897 898 for (j=0; j<nz-1; j+=2){ 899 i0 = vi[0]; 900 i1 = vi[1]; 901 vi +=2; 902 tmp0 = tmps[i0]; 903 tmp1 = tmps[i1]; 904 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 905 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 906 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 907 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 908 } 909 if (j == nz-1){ 910 tmp0 = tmps[*vi++]; 911 sum1 -= *v1++ *tmp0; 912 sum2 -= *v2++ *tmp0; 913 sum3 -= *v3++ *tmp0; 914 sum4 -= *v4++ *tmp0; 915 } 916 sum2 -= *v2++ * sum1; 917 sum3 -= *v3++ * sum1; 918 sum4 -= *v4++ * sum1; 919 sum3 -= *v3++ * sum2; 920 sum4 -= *v4++ * sum2; 921 sum4 -= *v4++ * sum3; 922 923 tmp[row ++]=sum1; 924 tmp[row ++]=sum2; 925 tmp[row ++]=sum3; 926 tmp[row ++]=sum4; 927 break; 928 case 5: 929 sum1 = b[*r++]; 930 sum2 = b[*r++]; 931 sum3 = b[*r++]; 932 sum4 = b[*r++]; 933 sum5 = b[*r++]; 934 v2 = aa + ai[row+1]; 935 v3 = aa + ai[row+2]; 936 v4 = aa + ai[row+3]; 937 v5 = aa + ai[row+4]; 938 939 for (j=0; j<nz-1; j+=2){ 940 i0 = vi[0]; 941 i1 = vi[1]; 942 vi +=2; 943 tmp0 = tmps[i0]; 944 tmp1 = tmps[i1]; 945 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 946 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 947 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 948 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 949 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 950 } 951 if (j == nz-1){ 952 tmp0 = tmps[*vi++]; 953 sum1 -= *v1++ *tmp0; 954 sum2 -= *v2++ *tmp0; 955 sum3 -= *v3++ *tmp0; 956 sum4 -= *v4++ *tmp0; 957 sum5 -= *v5++ *tmp0; 958 } 959 960 sum2 -= *v2++ * sum1; 961 sum3 -= *v3++ * sum1; 962 sum4 -= *v4++ * sum1; 963 sum5 -= *v5++ * sum1; 964 sum3 -= *v3++ * sum2; 965 sum4 -= *v4++ * sum2; 966 sum5 -= *v5++ * sum2; 967 sum4 -= *v4++ * sum3; 968 sum5 -= *v5++ * sum3; 969 sum5 -= *v5++ * sum4; 970 971 tmp[row ++]=sum1; 972 tmp[row ++]=sum2; 973 tmp[row ++]=sum3; 974 tmp[row ++]=sum4; 975 tmp[row ++]=sum5; 976 break; 977 default: 978 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 979 } 980 } 981 /* backward solve the upper triangular */ 982 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 983 nsz = ns[i]; 984 aii = ai[row+1] -1; 985 v1 = aa + aii; 986 vi = aj + aii; 987 nz = aii- ad[row]; 988 switch (nsz){ /* Each loop in 'case' is unrolled */ 989 case 1 : 990 sum1 = tmp[row]; 991 992 for(j=nz ; j>1; j-=2){ 993 vi -=2; 994 i0 = vi[2]; 995 i1 = vi[1]; 996 tmp0 = tmps[i0]; 997 tmp1 = tmps[i1]; 998 v1 -= 2; 999 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1000 } 1001 if (j==1){ 1002 tmp0 = tmps[*vi--]; 1003 sum1 -= *v1-- * tmp0; 1004 } 1005 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1006 break; 1007 case 2 : 1008 sum1 = tmp[row]; 1009 sum2 = tmp[row -1]; 1010 v2 = aa + ai[row]-1; 1011 for (j=nz ; j>1; j-=2){ 1012 vi -=2; 1013 i0 = vi[2]; 1014 i1 = vi[1]; 1015 tmp0 = tmps[i0]; 1016 tmp1 = tmps[i1]; 1017 v1 -= 2; 1018 v2 -= 2; 1019 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1020 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1021 } 1022 if (j==1){ 1023 tmp0 = tmps[*vi--]; 1024 sum1 -= *v1-- * tmp0; 1025 sum2 -= *v2-- * tmp0; 1026 } 1027 1028 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1029 sum2 -= *v2-- * tmp0; 1030 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1031 break; 1032 case 3 : 1033 sum1 = tmp[row]; 1034 sum2 = tmp[row -1]; 1035 sum3 = tmp[row -2]; 1036 v2 = aa + ai[row]-1; 1037 v3 = aa + ai[row -1]-1; 1038 for (j=nz ; j>1; j-=2){ 1039 vi -=2; 1040 i0 = vi[2]; 1041 i1 = vi[1]; 1042 tmp0 = tmps[i0]; 1043 tmp1 = tmps[i1]; 1044 v1 -= 2; 1045 v2 -= 2; 1046 v3 -= 2; 1047 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1048 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1049 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1050 } 1051 if (j==1){ 1052 tmp0 = tmps[*vi--]; 1053 sum1 -= *v1-- * tmp0; 1054 sum2 -= *v2-- * tmp0; 1055 sum3 -= *v3-- * tmp0; 1056 } 1057 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1058 sum2 -= *v2-- * tmp0; 1059 sum3 -= *v3-- * tmp0; 1060 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1061 sum3 -= *v3-- * tmp0; 1062 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1063 1064 break; 1065 case 4 : 1066 sum1 = tmp[row]; 1067 sum2 = tmp[row -1]; 1068 sum3 = tmp[row -2]; 1069 sum4 = tmp[row -3]; 1070 v2 = aa + ai[row]-1; 1071 v3 = aa + ai[row -1]-1; 1072 v4 = aa + ai[row -2]-1; 1073 1074 for (j=nz ; j>1; j-=2){ 1075 vi -=2; 1076 i0 = vi[2]; 1077 i1 = vi[1]; 1078 tmp0 = tmps[i0]; 1079 tmp1 = tmps[i1]; 1080 v1 -= 2; 1081 v2 -= 2; 1082 v3 -= 2; 1083 v4 -= 2; 1084 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1085 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1086 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1087 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1088 } 1089 if (j==1){ 1090 tmp0 = tmps[*vi--]; 1091 sum1 -= *v1-- * tmp0; 1092 sum2 -= *v2-- * tmp0; 1093 sum3 -= *v3-- * tmp0; 1094 sum4 -= *v4-- * tmp0; 1095 } 1096 1097 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1098 sum2 -= *v2-- * tmp0; 1099 sum3 -= *v3-- * tmp0; 1100 sum4 -= *v4-- * tmp0; 1101 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1102 sum3 -= *v3-- * tmp0; 1103 sum4 -= *v4-- * tmp0; 1104 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1105 sum4 -= *v4-- * tmp0; 1106 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1107 break; 1108 case 5 : 1109 sum1 = tmp[row]; 1110 sum2 = tmp[row -1]; 1111 sum3 = tmp[row -2]; 1112 sum4 = tmp[row -3]; 1113 sum5 = tmp[row -4]; 1114 v2 = aa + ai[row]-1; 1115 v3 = aa + ai[row -1]-1; 1116 v4 = aa + ai[row -2]-1; 1117 v5 = aa + ai[row -3]-1; 1118 for (j=nz ; j>1; j-=2){ 1119 vi -= 2; 1120 i0 = vi[2]; 1121 i1 = vi[1]; 1122 tmp0 = tmps[i0]; 1123 tmp1 = tmps[i1]; 1124 v1 -= 2; 1125 v2 -= 2; 1126 v3 -= 2; 1127 v4 -= 2; 1128 v5 -= 2; 1129 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1130 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1131 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1132 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1133 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1134 } 1135 if (j==1){ 1136 tmp0 = tmps[*vi--]; 1137 sum1 -= *v1-- * tmp0; 1138 sum2 -= *v2-- * tmp0; 1139 sum3 -= *v3-- * tmp0; 1140 sum4 -= *v4-- * tmp0; 1141 sum5 -= *v5-- * tmp0; 1142 } 1143 1144 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1145 sum2 -= *v2-- * tmp0; 1146 sum3 -= *v3-- * tmp0; 1147 sum4 -= *v4-- * tmp0; 1148 sum5 -= *v5-- * tmp0; 1149 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1150 sum3 -= *v3-- * tmp0; 1151 sum4 -= *v4-- * tmp0; 1152 sum5 -= *v5-- * tmp0; 1153 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1154 sum4 -= *v4-- * tmp0; 1155 sum5 -= *v5-- * tmp0; 1156 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1157 sum5 -= *v5-- * tmp0; 1158 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1159 break; 1160 default: 1161 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1162 } 1163 } 1164 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1165 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1166 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1167 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1168 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1169 PetscFunctionReturn(0); 1170 } 1171 1172 #undef __FUNCT__ 1173 #define __FUNCT__ "MatLUFactorNumeric_Inode" 1174 /* 1175 Not currently used because the basic version is often a good amount faster 1176 */ 1177 PetscErrorCode MatLUFactorNumeric_Inode(Mat B,Mat A,const MatFactorInfo *info) 1178 { 1179 Mat C = B; 1180 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1181 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1182 PetscErrorCode ierr; 1183 const PetscInt *ns,*r,*ic,*c,*ics,*bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,*bi = b->i,*ai = a->i,*aj = a->j,*bd = b->diag,*pj; 1184 PetscInt n = A->rmap->n; 1185 PetscInt nz,nz_tmp,row,prow; 1186 PetscInt i,j,idx,node_max,nodesz; 1187 PetscInt *tmp_vec1,*tmp_vec2,*nsmap; 1188 PetscScalar mul1,mul2,mul3,tmp; 1189 const MatScalar *pv,*v1,*v2,*v3,*aa = a->a,*rtmp1; 1190 MatScalar *rtmp11,*pc1,*pc2,*pc3,*ba = b->a,*rtmp22,*rtmp33;; 1191 PetscReal rs=0.0; 1192 LUShift_Ctx sctx; 1193 PetscInt newshift; 1194 1195 PetscFunctionBegin; 1196 sctx.shift_top = 0; 1197 sctx.nshift_max = 0; 1198 sctx.shift_lo = 0; 1199 sctx.shift_hi = 0; 1200 sctx.shift_fraction = 0; 1201 1202 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1203 if (info->shiftpd) { /* set sctx.shift_top=max{rs} */ 1204 sctx.shift_top = 0; 1205 for (i=0; i<n; i++) { 1206 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1207 rs = 0.0; 1208 ajtmp = aj + ai[i]; 1209 rtmp1 = aa + ai[i]; 1210 nz = ai[i+1] - ai[i]; 1211 for (j=0; j<nz; j++){ 1212 if (*ajtmp != i){ 1213 rs += PetscAbsScalar(*rtmp1++); 1214 } else { 1215 rs -= PetscRealPart(*rtmp1++); 1216 } 1217 ajtmp++; 1218 } 1219 if (rs>sctx.shift_top) sctx.shift_top = rs; 1220 } 1221 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1222 sctx.shift_top *= 1.1; 1223 sctx.nshift_max = 5; 1224 sctx.shift_lo = 0.; 1225 sctx.shift_hi = 1.; 1226 } 1227 sctx.shift_amount = 0; 1228 sctx.nshift = 0; 1229 1230 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1231 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1232 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1233 ierr = PetscMalloc3(n,PetscScalar,&rtmp11,n,PetscScalar,&rtmp22,n,PetscScalar,&rtmp33);CHKERRQ(ierr); 1234 ierr = PetscMemzero(rtmp11,n*sizeof(PetscScalar));CHKERRQ(ierr); 1235 ierr = PetscMemzero(rtmp22,n*sizeof(PetscScalar));CHKERRQ(ierr); 1236 ierr = PetscMemzero(rtmp33,n*sizeof(PetscScalar));CHKERRQ(ierr); 1237 ics = ic ; 1238 1239 node_max = a->inode.node_count; 1240 ns = a->inode.size; 1241 if (!ns){ 1242 SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information"); 1243 } 1244 1245 /* If max inode size > 3, split it into two inodes.*/ 1246 /* also map the inode sizes according to the ordering */ 1247 ierr = PetscMalloc2(n,PetscInt,&tmp_vec1,n,PetscInt,&nsmap);CHKERRQ(ierr); 1248 for (i=0,j=0; i<node_max; ++i,++j){ 1249 if (ns[i]>3) { 1250 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1251 ++j; 1252 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1253 } else { 1254 tmp_vec1[j] = ns[i]; 1255 } 1256 } 1257 /* Use the correct node_max */ 1258 node_max = j; 1259 1260 /* Now reorder the inode info based on mat re-ordering info */ 1261 /* First create a row -> inode_size_array_index map */ 1262 ierr = PetscMalloc(node_max*sizeof(PetscInt),&tmp_vec2);CHKERRQ(ierr); 1263 for (i=0,row=0; i<node_max; i++) { 1264 nodesz = tmp_vec1[i]; 1265 for (j=0; j<nodesz; j++,row++) { 1266 nsmap[row] = i; 1267 } 1268 } 1269 /* Using nsmap, create a reordered ns structure */ 1270 for (i=0,j=0; i< node_max; i++) { 1271 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1272 tmp_vec2[i] = nodesz; 1273 j += nodesz; 1274 } 1275 ierr = PetscFree2(tmp_vec1,nsmap);CHKERRQ(ierr); 1276 /* Now use the correct ns */ 1277 ns = tmp_vec2; 1278 1279 do { 1280 sctx.lushift = PETSC_FALSE; 1281 /* Now loop over each block-row, and do the factorization */ 1282 for (i=0,row=0; i<node_max; i++) { 1283 nodesz = ns[i]; 1284 nz = bi[row+1] - bi[row]; 1285 bjtmp = bj + bi[row]; 1286 1287 switch (nodesz){ 1288 case 1: 1289 for (j=0; j<nz; j++){ 1290 rtmp11[bjtmp[j]] = 0.0; 1291 } 1292 1293 /* load in initial (unfactored row) */ 1294 idx = r[row]; 1295 nz_tmp = ai[idx+1] - ai[idx]; 1296 ajtmp = aj + ai[idx]; 1297 v1 = aa + ai[idx]; 1298 1299 for (j=0; j<nz_tmp; j++) { 1300 idx = ics[ajtmp[j]]; 1301 rtmp11[idx] = v1[j]; 1302 } 1303 rtmp11[ics[r[row]]] += sctx.shift_amount; 1304 1305 prow = *bjtmp++ ; 1306 while (prow < row) { 1307 pc1 = rtmp11 + prow; 1308 if (*pc1 != 0.0){ 1309 pv = ba + bd[prow]; 1310 pj = nbj + bd[prow]; 1311 mul1 = *pc1 * *pv++; 1312 *pc1 = mul1; 1313 nz_tmp = bi[prow+1] - bd[prow] - 1; 1314 ierr = PetscLogFlops(2.0*nz_tmp);CHKERRQ(ierr); 1315 for (j=0; j<nz_tmp; j++) { 1316 rtmp11[pj[i]] -= mul1 * pv[j]; 1317 } 1318 } 1319 prow = *bjtmp++ ; 1320 } 1321 pj = bj + bi[row]; 1322 pc1 = ba + bi[row]; 1323 1324 sctx.pv = rtmp11[row]; 1325 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 1326 rs = 0.0; 1327 for (j=0; j<nz; j++) { 1328 idx = pj[j]; 1329 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 1330 if (idx != row) rs += PetscAbsScalar(pc1[j]); 1331 } 1332 sctx.rs = rs; 1333 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1334 if (newshift == 1) goto endofwhile; 1335 break; 1336 1337 case 2: 1338 for (j=0; j<nz; j++) { 1339 idx = bjtmp[j]; 1340 rtmp11[idx] = 0.0; 1341 rtmp22[idx] = 0.0; 1342 } 1343 1344 /* load in initial (unfactored row) */ 1345 idx = r[row]; 1346 nz_tmp = ai[idx+1] - ai[idx]; 1347 ajtmp = aj + ai[idx]; 1348 v1 = aa + ai[idx]; 1349 v2 = aa + ai[idx+1]; 1350 for (j=0; j<nz_tmp; j++) { 1351 idx = ics[ajtmp[j]]; 1352 rtmp11[idx] = v1[j]; 1353 rtmp22[idx] = v2[j]; 1354 } 1355 rtmp11[ics[r[row]]] += sctx.shift_amount; 1356 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 1357 1358 prow = *bjtmp++ ; 1359 while (prow < row) { 1360 pc1 = rtmp11 + prow; 1361 pc2 = rtmp22 + prow; 1362 if (*pc1 != 0.0 || *pc2 != 0.0){ 1363 pv = ba + bd[prow]; 1364 pj = nbj + bd[prow]; 1365 mul1 = *pc1 * *pv; 1366 mul2 = *pc2 * *pv; 1367 ++pv; 1368 *pc1 = mul1; 1369 *pc2 = mul2; 1370 1371 nz_tmp = bi[prow+1] - bd[prow] - 1; 1372 for (j=0; j<nz_tmp; j++) { 1373 tmp = pv[j]; 1374 idx = pj[j]; 1375 rtmp11[idx] -= mul1 * tmp; 1376 rtmp22[idx] -= mul2 * tmp; 1377 } 1378 ierr = PetscLogFlops(4.0*nz_tmp);CHKERRQ(ierr); 1379 } 1380 prow = *bjtmp++ ; 1381 } 1382 1383 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 1384 pc1 = rtmp11 + prow; 1385 pc2 = rtmp22 + prow; 1386 1387 sctx.pv = *pc1; 1388 pj = bj + bi[prow]; 1389 rs = 0.0; 1390 for (j=0; j<nz; j++){ 1391 idx = pj[j]; 1392 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 1393 } 1394 sctx.rs = rs; 1395 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1396 if (newshift == 1) goto endofwhile; 1397 1398 if (*pc2 != 0.0){ 1399 pj = nbj + bd[prow]; 1400 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 1401 *pc2 = mul2; 1402 nz_tmp = bi[prow+1] - bd[prow] - 1; 1403 for (j=0; j<nz_tmp; j++) { 1404 idx = pj[j] ; 1405 tmp = rtmp11[idx]; 1406 rtmp22[idx] -= mul2 * tmp; 1407 } 1408 ierr = PetscLogFlops(2.0*nz_tmp);CHKERRQ(ierr); 1409 } 1410 1411 pj = bj + bi[row]; 1412 pc1 = ba + bi[row]; 1413 pc2 = ba + bi[row+1]; 1414 1415 sctx.pv = rtmp22[row+1]; 1416 rs = 0.0; 1417 rtmp11[row] = 1.0/rtmp11[row]; 1418 rtmp22[row+1] = 1.0/rtmp22[row+1]; 1419 /* copy row entries from dense representation to sparse */ 1420 for (j=0; j<nz; j++) { 1421 idx = pj[j]; 1422 pc1[j] = rtmp11[idx]; 1423 pc2[j] = rtmp22[idx]; 1424 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 1425 } 1426 sctx.rs = rs; 1427 ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr); 1428 if (newshift == 1) goto endofwhile; 1429 break; 1430 1431 case 3: 1432 for (j=0; j<nz; j++) { 1433 idx = bjtmp[j]; 1434 rtmp11[idx] = 0.0; 1435 rtmp22[idx] = 0.0; 1436 rtmp33[idx] = 0.0; 1437 } 1438 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 1439 idx = r[row]; 1440 nz_tmp = ai[idx+1] - ai[idx]; 1441 ajtmp = aj + ai[idx]; 1442 v1 = aa + ai[idx]; 1443 v2 = aa + ai[idx+1]; 1444 v3 = aa + ai[idx+2]; 1445 for (j=0; j<nz_tmp; j++) { 1446 idx = ics[ajtmp[j]]; 1447 rtmp11[idx] = v1[j]; 1448 rtmp22[idx] = v2[j]; 1449 rtmp33[idx] = v3[j]; 1450 } 1451 rtmp11[ics[r[row]]] += sctx.shift_amount; 1452 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 1453 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 1454 1455 /* loop over all pivot row blocks above this row block */ 1456 prow = *bjtmp++ ; 1457 while (prow < row) { 1458 pc1 = rtmp11 + prow; 1459 pc2 = rtmp22 + prow; 1460 pc3 = rtmp33 + prow; 1461 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){ 1462 pv = ba + bd[prow]; 1463 pj = nbj + bd[prow]; 1464 mul1 = *pc1 * *pv; 1465 mul2 = *pc2 * *pv; 1466 mul3 = *pc3 * *pv; 1467 ++pv; 1468 *pc1 = mul1; 1469 *pc2 = mul2; 1470 *pc3 = mul3; 1471 1472 nz_tmp = bi[prow+1] - bd[prow] - 1; 1473 /* update this row based on pivot row */ 1474 for (j=0; j<nz_tmp; j++) { 1475 tmp = pv[j]; 1476 idx = pj[j]; 1477 rtmp11[idx] -= mul1 * tmp; 1478 rtmp22[idx] -= mul2 * tmp; 1479 rtmp33[idx] -= mul3 * tmp; 1480 } 1481 ierr = PetscLogFlops(6.0*nz_tmp);CHKERRQ(ierr); 1482 } 1483 prow = *bjtmp++ ; 1484 } 1485 1486 /* Now take care of diagonal 3x3 block in this set of rows */ 1487 /* note: prow = row here */ 1488 pc1 = rtmp11 + prow; 1489 pc2 = rtmp22 + prow; 1490 pc3 = rtmp33 + prow; 1491 1492 sctx.pv = *pc1; 1493 pj = bj + bi[prow]; 1494 rs = 0.0; 1495 for (j=0; j<nz; j++){ 1496 idx = pj[j]; 1497 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 1498 } 1499 sctx.rs = rs; 1500 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1501 if (newshift == 1) goto endofwhile; 1502 1503 if (*pc2 != 0.0 || *pc3 != 0.0){ 1504 mul2 = (*pc2)/(*pc1); 1505 mul3 = (*pc3)/(*pc1); 1506 *pc2 = mul2; 1507 *pc3 = mul3; 1508 nz_tmp = bi[prow+1] - bd[prow] - 1; 1509 pj = nbj + bd[prow]; 1510 for (j=0; j<nz_tmp; j++) { 1511 idx = pj[j] ; 1512 tmp = rtmp11[idx]; 1513 rtmp22[idx] -= mul2 * tmp; 1514 rtmp33[idx] -= mul3 * tmp; 1515 } 1516 ierr = PetscLogFlops(4.0*nz_tmp);CHKERRQ(ierr); 1517 } 1518 ++prow; 1519 1520 pc2 = rtmp22 + prow; 1521 pc3 = rtmp33 + prow; 1522 sctx.pv = *pc2; 1523 pj = bj + bi[prow]; 1524 rs = 0.0; 1525 for (j=0; j<nz; j++){ 1526 idx = pj[j]; 1527 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 1528 } 1529 sctx.rs = rs; 1530 ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr); 1531 if (newshift == 1) goto endofwhile; 1532 1533 if (*pc3 != 0.0){ 1534 mul3 = (*pc3)/(*pc2); 1535 *pc3 = mul3; 1536 pj = nbj + bd[prow]; 1537 nz_tmp = bi[prow+1] - bd[prow] - 1; 1538 for (j=0; j<nz_tmp; j++) { 1539 idx = pj[j] ; 1540 tmp = rtmp22[idx]; 1541 rtmp33[idx] -= mul3 * tmp; 1542 } 1543 ierr = PetscLogFlops(4.0*nz_tmp);CHKERRQ(ierr); 1544 } 1545 1546 pj = bj + bi[row]; 1547 pc1 = ba + bi[row]; 1548 pc2 = ba + bi[row+1]; 1549 pc3 = ba + bi[row+2]; 1550 1551 sctx.pv = rtmp33[row+2]; 1552 rs = 0.0; 1553 rtmp11[row] = 1.0/rtmp11[row]; 1554 rtmp22[row+1] = 1.0/rtmp22[row+1]; 1555 rtmp33[row+2] = 1.0/rtmp33[row+2]; 1556 /* copy row entries from dense representation to sparse */ 1557 for (j=0; j<nz; j++) { 1558 idx = pj[j]; 1559 pc1[j] = rtmp11[idx]; 1560 pc2[j] = rtmp22[idx]; 1561 pc3[j] = rtmp33[idx]; 1562 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 1563 } 1564 1565 sctx.rs = rs; 1566 ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr); 1567 if (newshift == 1) goto endofwhile; 1568 break; 1569 1570 default: 1571 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1572 } 1573 row += nodesz; /* Update the row */ 1574 } 1575 endofwhile:; 1576 } while (sctx.lushift); 1577 ierr = PetscFree3(rtmp11,rtmp22,rtmp33);CHKERRQ(ierr); 1578 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1579 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1580 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1581 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 1582 (B)->ops->solve = MatSolve_Inode; 1583 /* do not set solve add, since MatSolve_Inode + Add is faster */ 1584 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1585 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1586 C->assembled = PETSC_TRUE; 1587 C->preallocated = PETSC_TRUE; 1588 if (sctx.nshift) { 1589 if (info->shiftpd) { 1590 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 1591 } else if (info->shiftnz) { 1592 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 1593 } 1594 } 1595 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 1596 PetscFunctionReturn(0); 1597 } 1598 1599 /* 1600 Makes a longer coloring[] array and calls the usual code with that 1601 */ 1602 #undef __FUNCT__ 1603 #define __FUNCT__ "MatColoringPatch_Inode" 1604 PetscErrorCode MatColoringPatch_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 1605 { 1606 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 1607 PetscErrorCode ierr; 1608 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 1609 PetscInt *colorused,i; 1610 ISColoringValue *newcolor; 1611 1612 PetscFunctionBegin; 1613 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr); 1614 /* loop over inodes, marking a color for each column*/ 1615 row = 0; 1616 for (i=0; i<m; i++){ 1617 for (j=0; j<ns[i]; j++) { 1618 newcolor[row++] = coloring[i] + j*ncolors; 1619 } 1620 } 1621 1622 /* eliminate unneeded colors */ 1623 ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr); 1624 ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr); 1625 for (i=0; i<n; i++) { 1626 colorused[newcolor[i]] = 1; 1627 } 1628 1629 for (i=1; i<5*ncolors; i++) { 1630 colorused[i] += colorused[i-1]; 1631 } 1632 ncolors = colorused[5*ncolors-1]; 1633 for (i=0; i<n; i++) { 1634 newcolor[i] = colorused[newcolor[i]]-1; 1635 } 1636 ierr = PetscFree(colorused);CHKERRQ(ierr); 1637 ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); 1638 ierr = PetscFree(coloring);CHKERRQ(ierr); 1639 PetscFunctionReturn(0); 1640 } 1641 1642 #include "../src/mat/blockinvert.h" 1643 1644 #undef __FUNCT__ 1645 #define __FUNCT__ "MatRelax_Inode" 1646 PetscErrorCode MatRelax_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 1647 { 1648 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1649 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3; 1650 MatScalar *ibdiag,*bdiag; 1651 PetscScalar *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5; 1652 const MatScalar *v = a->a,*v1,*v2,*v3,*v4,*v5; 1653 PetscReal zeropivot = 1.0e-15, shift = 0.0; 1654 PetscErrorCode ierr; 1655 PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2; 1656 PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k; 1657 1658 PetscFunctionBegin; 1659 if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 1660 if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 1661 if (its > 1) { 1662 /* switch to non-inode version */ 1663 ierr = MatRelax_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr); 1664 PetscFunctionReturn(0); 1665 } 1666 1667 if (!a->inode.ibdiagvalid) { 1668 if (!a->inode.ibdiag) { 1669 /* calculate space needed for diagonal blocks */ 1670 for (i=0; i<m; i++) { 1671 cnt += sizes[i]*sizes[i]; 1672 } 1673 a->inode.bdiagsize = cnt; 1674 ierr = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr); 1675 } 1676 1677 /* copy over the diagonal blocks and invert them */ 1678 ibdiag = a->inode.ibdiag; 1679 bdiag = a->inode.bdiag; 1680 cnt = 0; 1681 for (i=0, row = 0; i<m; i++) { 1682 for (j=0; j<sizes[i]; j++) { 1683 for (k=0; k<sizes[i]; k++) { 1684 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 1685 } 1686 } 1687 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 1688 1689 switch(sizes[i]) { 1690 case 1: 1691 /* Create matrix data structure */ 1692 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 1693 ibdiag[cnt] = 1.0/ibdiag[cnt]; 1694 break; 1695 case 2: 1696 ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr); 1697 break; 1698 case 3: 1699 ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr); 1700 break; 1701 case 4: 1702 ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr); 1703 break; 1704 case 5: 1705 ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,shift);CHKERRQ(ierr); 1706 break; 1707 default: 1708 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1709 } 1710 cnt += sizes[i]*sizes[i]; 1711 row += sizes[i]; 1712 } 1713 a->inode.ibdiagvalid = PETSC_TRUE; 1714 } 1715 ibdiag = a->inode.ibdiag; 1716 bdiag = a->inode.bdiag; 1717 1718 1719 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1720 if (flag & SOR_ZERO_INITIAL_GUESS) { 1721 PetscScalar *b; 1722 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1723 if (xx != bb) { 1724 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1725 } else { 1726 b = x; 1727 } 1728 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 1729 1730 for (i=0, row=0; i<m; i++) { 1731 sz = diag[row] - ii[row]; 1732 v1 = a->a + ii[row]; 1733 idx = a->j + ii[row]; 1734 1735 /* see comments for MatMult_Inode() for how this is coded */ 1736 switch (sizes[i]){ 1737 case 1: 1738 1739 sum1 = b[row]; 1740 for(n = 0; n<sz-1; n+=2) { 1741 i1 = idx[0]; 1742 i2 = idx[1]; 1743 idx += 2; 1744 tmp0 = x[i1]; 1745 tmp1 = x[i2]; 1746 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1747 } 1748 1749 if (n == sz-1){ 1750 tmp0 = x[*idx]; 1751 sum1 -= *v1 * tmp0; 1752 } 1753 x[row++] = sum1*(*ibdiag++); 1754 break; 1755 case 2: 1756 v2 = a->a + ii[row+1]; 1757 sum1 = b[row]; 1758 sum2 = b[row+1]; 1759 for(n = 0; n<sz-1; n+=2) { 1760 i1 = idx[0]; 1761 i2 = idx[1]; 1762 idx += 2; 1763 tmp0 = x[i1]; 1764 tmp1 = x[i2]; 1765 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1766 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1767 } 1768 1769 if (n == sz-1){ 1770 tmp0 = x[*idx]; 1771 sum1 -= v1[0] * tmp0; 1772 sum2 -= v2[0] * tmp0; 1773 } 1774 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 1775 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 1776 ibdiag += 4; 1777 break; 1778 case 3: 1779 v2 = a->a + ii[row+1]; 1780 v3 = a->a + ii[row+2]; 1781 sum1 = b[row]; 1782 sum2 = b[row+1]; 1783 sum3 = b[row+2]; 1784 for(n = 0; n<sz-1; n+=2) { 1785 i1 = idx[0]; 1786 i2 = idx[1]; 1787 idx += 2; 1788 tmp0 = x[i1]; 1789 tmp1 = x[i2]; 1790 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1791 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1792 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1793 } 1794 1795 if (n == sz-1){ 1796 tmp0 = x[*idx]; 1797 sum1 -= v1[0] * tmp0; 1798 sum2 -= v2[0] * tmp0; 1799 sum3 -= v3[0] * tmp0; 1800 } 1801 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 1802 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 1803 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 1804 ibdiag += 9; 1805 break; 1806 case 4: 1807 v2 = a->a + ii[row+1]; 1808 v3 = a->a + ii[row+2]; 1809 v4 = a->a + ii[row+3]; 1810 sum1 = b[row]; 1811 sum2 = b[row+1]; 1812 sum3 = b[row+2]; 1813 sum4 = b[row+3]; 1814 for(n = 0; n<sz-1; n+=2) { 1815 i1 = idx[0]; 1816 i2 = idx[1]; 1817 idx += 2; 1818 tmp0 = x[i1]; 1819 tmp1 = x[i2]; 1820 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1821 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1822 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1823 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1824 } 1825 1826 if (n == sz-1){ 1827 tmp0 = x[*idx]; 1828 sum1 -= v1[0] * tmp0; 1829 sum2 -= v2[0] * tmp0; 1830 sum3 -= v3[0] * tmp0; 1831 sum4 -= v4[0] * tmp0; 1832 } 1833 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 1834 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 1835 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 1836 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 1837 ibdiag += 16; 1838 break; 1839 case 5: 1840 v2 = a->a + ii[row+1]; 1841 v3 = a->a + ii[row+2]; 1842 v4 = a->a + ii[row+3]; 1843 v5 = a->a + ii[row+4]; 1844 sum1 = b[row]; 1845 sum2 = b[row+1]; 1846 sum3 = b[row+2]; 1847 sum4 = b[row+3]; 1848 sum5 = b[row+4]; 1849 for(n = 0; n<sz-1; n+=2) { 1850 i1 = idx[0]; 1851 i2 = idx[1]; 1852 idx += 2; 1853 tmp0 = x[i1]; 1854 tmp1 = x[i2]; 1855 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1856 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1857 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1858 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1859 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 1860 } 1861 1862 if (n == sz-1){ 1863 tmp0 = x[*idx]; 1864 sum1 -= v1[0] * tmp0; 1865 sum2 -= v2[0] * tmp0; 1866 sum3 -= v3[0] * tmp0; 1867 sum4 -= v4[0] * tmp0; 1868 sum5 -= v5[0] * tmp0; 1869 } 1870 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 1871 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 1872 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 1873 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 1874 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 1875 ibdiag += 25; 1876 break; 1877 default: 1878 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1879 } 1880 } 1881 1882 xb = x; 1883 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 1884 } else xb = b; 1885 if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) && 1886 (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1887 cnt = 0; 1888 for (i=0, row=0; i<m; i++) { 1889 1890 switch (sizes[i]){ 1891 case 1: 1892 x[row++] *= bdiag[cnt++]; 1893 break; 1894 case 2: 1895 x1 = x[row]; x2 = x[row+1]; 1896 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 1897 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 1898 x[row++] = tmp1; 1899 x[row++] = tmp2; 1900 cnt += 4; 1901 break; 1902 case 3: 1903 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 1904 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 1905 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 1906 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 1907 x[row++] = tmp1; 1908 x[row++] = tmp2; 1909 x[row++] = tmp3; 1910 cnt += 9; 1911 break; 1912 case 4: 1913 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 1914 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 1915 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 1916 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 1917 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 1918 x[row++] = tmp1; 1919 x[row++] = tmp2; 1920 x[row++] = tmp3; 1921 x[row++] = tmp4; 1922 cnt += 16; 1923 break; 1924 case 5: 1925 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 1926 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 1927 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 1928 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 1929 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 1930 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 1931 x[row++] = tmp1; 1932 x[row++] = tmp2; 1933 x[row++] = tmp3; 1934 x[row++] = tmp4; 1935 x[row++] = tmp5; 1936 cnt += 25; 1937 break; 1938 default: 1939 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1940 } 1941 } 1942 ierr = PetscLogFlops(m);CHKERRQ(ierr); 1943 } 1944 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 1945 1946 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 1947 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 1948 ibdiag -= sizes[i]*sizes[i]; 1949 sz = ii[row+1] - diag[row] - 1; 1950 v1 = a->a + diag[row] + 1; 1951 idx = a->j + diag[row] + 1; 1952 1953 /* see comments for MatMult_Inode() for how this is coded */ 1954 switch (sizes[i]){ 1955 case 1: 1956 1957 sum1 = xb[row]; 1958 for(n = 0; n<sz-1; n+=2) { 1959 i1 = idx[0]; 1960 i2 = idx[1]; 1961 idx += 2; 1962 tmp0 = x[i1]; 1963 tmp1 = x[i2]; 1964 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1965 } 1966 1967 if (n == sz-1){ 1968 tmp0 = x[*idx]; 1969 sum1 -= *v1*tmp0; 1970 } 1971 x[row--] = sum1*(*ibdiag); 1972 break; 1973 1974 case 2: 1975 1976 sum1 = xb[row]; 1977 sum2 = xb[row-1]; 1978 /* note that sum1 is associated with the second of the two rows */ 1979 v2 = a->a + diag[row-1] + 2; 1980 for(n = 0; n<sz-1; n+=2) { 1981 i1 = idx[0]; 1982 i2 = idx[1]; 1983 idx += 2; 1984 tmp0 = x[i1]; 1985 tmp1 = x[i2]; 1986 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1987 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1988 } 1989 1990 if (n == sz-1){ 1991 tmp0 = x[*idx]; 1992 sum1 -= *v1*tmp0; 1993 sum2 -= *v2*tmp0; 1994 } 1995 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 1996 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 1997 break; 1998 case 3: 1999 2000 sum1 = xb[row]; 2001 sum2 = xb[row-1]; 2002 sum3 = xb[row-2]; 2003 v2 = a->a + diag[row-1] + 2; 2004 v3 = a->a + diag[row-2] + 3; 2005 for(n = 0; n<sz-1; n+=2) { 2006 i1 = idx[0]; 2007 i2 = idx[1]; 2008 idx += 2; 2009 tmp0 = x[i1]; 2010 tmp1 = x[i2]; 2011 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2012 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2013 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2014 } 2015 2016 if (n == sz-1){ 2017 tmp0 = x[*idx]; 2018 sum1 -= *v1*tmp0; 2019 sum2 -= *v2*tmp0; 2020 sum3 -= *v3*tmp0; 2021 } 2022 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 2023 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 2024 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 2025 break; 2026 case 4: 2027 2028 sum1 = xb[row]; 2029 sum2 = xb[row-1]; 2030 sum3 = xb[row-2]; 2031 sum4 = xb[row-3]; 2032 v2 = a->a + diag[row-1] + 2; 2033 v3 = a->a + diag[row-2] + 3; 2034 v4 = a->a + diag[row-3] + 4; 2035 for(n = 0; n<sz-1; n+=2) { 2036 i1 = idx[0]; 2037 i2 = idx[1]; 2038 idx += 2; 2039 tmp0 = x[i1]; 2040 tmp1 = x[i2]; 2041 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2042 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2043 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2044 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2045 } 2046 2047 if (n == sz-1){ 2048 tmp0 = x[*idx]; 2049 sum1 -= *v1*tmp0; 2050 sum2 -= *v2*tmp0; 2051 sum3 -= *v3*tmp0; 2052 sum4 -= *v4*tmp0; 2053 } 2054 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 2055 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 2056 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 2057 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 2058 break; 2059 case 5: 2060 2061 sum1 = xb[row]; 2062 sum2 = xb[row-1]; 2063 sum3 = xb[row-2]; 2064 sum4 = xb[row-3]; 2065 sum5 = xb[row-4]; 2066 v2 = a->a + diag[row-1] + 2; 2067 v3 = a->a + diag[row-2] + 3; 2068 v4 = a->a + diag[row-3] + 4; 2069 v5 = a->a + diag[row-4] + 5; 2070 for(n = 0; n<sz-1; n+=2) { 2071 i1 = idx[0]; 2072 i2 = idx[1]; 2073 idx += 2; 2074 tmp0 = x[i1]; 2075 tmp1 = x[i2]; 2076 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2077 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2078 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2079 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2080 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2081 } 2082 2083 if (n == sz-1){ 2084 tmp0 = x[*idx]; 2085 sum1 -= *v1*tmp0; 2086 sum2 -= *v2*tmp0; 2087 sum3 -= *v3*tmp0; 2088 sum4 -= *v4*tmp0; 2089 sum5 -= *v5*tmp0; 2090 } 2091 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 2092 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 2093 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 2094 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 2095 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 2096 break; 2097 default: 2098 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2099 } 2100 } 2101 2102 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2103 } 2104 its--; 2105 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2106 if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);} 2107 } 2108 if (flag & SOR_EISENSTAT) { 2109 const PetscScalar *b; 2110 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2111 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2112 MatScalar *t = a->inode.ssor_work; 2113 /* 2114 Apply (U + D)^-1 where D is now the block diagonal 2115 */ 2116 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 2117 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 2118 ibdiag -= sizes[i]*sizes[i]; 2119 sz = ii[row+1] - diag[row] - 1; 2120 v1 = a->a + diag[row] + 1; 2121 idx = a->j + diag[row] + 1; 2122 CHKMEMQ; 2123 /* see comments for MatMult_Inode() for how this is coded */ 2124 switch (sizes[i]){ 2125 case 1: 2126 2127 sum1 = b[row]; 2128 for(n = 0; n<sz-1; n+=2) { 2129 i1 = idx[0]; 2130 i2 = idx[1]; 2131 idx += 2; 2132 tmp0 = x[i1]; 2133 tmp1 = x[i2]; 2134 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2135 } 2136 2137 if (n == sz-1){ 2138 tmp0 = x[*idx]; 2139 sum1 -= *v1*tmp0; 2140 } 2141 x[row] = sum1*(*ibdiag);row--; 2142 break; 2143 2144 case 2: 2145 2146 sum1 = b[row]; 2147 sum2 = b[row-1]; 2148 /* note that sum1 is associated with the second of the two rows */ 2149 v2 = a->a + diag[row-1] + 2; 2150 for(n = 0; n<sz-1; n+=2) { 2151 i1 = idx[0]; 2152 i2 = idx[1]; 2153 idx += 2; 2154 tmp0 = x[i1]; 2155 tmp1 = x[i2]; 2156 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2157 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2158 } 2159 2160 if (n == sz-1){ 2161 tmp0 = x[*idx]; 2162 sum1 -= *v1*tmp0; 2163 sum2 -= *v2*tmp0; 2164 } 2165 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 2166 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 2167 row -= 2; 2168 break; 2169 case 3: 2170 2171 sum1 = b[row]; 2172 sum2 = b[row-1]; 2173 sum3 = b[row-2]; 2174 v2 = a->a + diag[row-1] + 2; 2175 v3 = a->a + diag[row-2] + 3; 2176 for(n = 0; n<sz-1; n+=2) { 2177 i1 = idx[0]; 2178 i2 = idx[1]; 2179 idx += 2; 2180 tmp0 = x[i1]; 2181 tmp1 = x[i2]; 2182 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2183 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2184 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2185 } 2186 2187 if (n == sz-1){ 2188 tmp0 = x[*idx]; 2189 sum1 -= *v1*tmp0; 2190 sum2 -= *v2*tmp0; 2191 sum3 -= *v3*tmp0; 2192 } 2193 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 2194 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 2195 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 2196 row -= 3; 2197 break; 2198 case 4: 2199 2200 sum1 = b[row]; 2201 sum2 = b[row-1]; 2202 sum3 = b[row-2]; 2203 sum4 = b[row-3]; 2204 v2 = a->a + diag[row-1] + 2; 2205 v3 = a->a + diag[row-2] + 3; 2206 v4 = a->a + diag[row-3] + 4; 2207 for(n = 0; n<sz-1; n+=2) { 2208 i1 = idx[0]; 2209 i2 = idx[1]; 2210 idx += 2; 2211 tmp0 = x[i1]; 2212 tmp1 = x[i2]; 2213 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2214 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2215 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2216 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2217 } 2218 2219 if (n == sz-1){ 2220 tmp0 = x[*idx]; 2221 sum1 -= *v1*tmp0; 2222 sum2 -= *v2*tmp0; 2223 sum3 -= *v3*tmp0; 2224 sum4 -= *v4*tmp0; 2225 } 2226 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 2227 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 2228 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 2229 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 2230 row -= 4; 2231 break; 2232 case 5: 2233 2234 sum1 = b[row]; 2235 sum2 = b[row-1]; 2236 sum3 = b[row-2]; 2237 sum4 = b[row-3]; 2238 sum5 = b[row-4]; 2239 v2 = a->a + diag[row-1] + 2; 2240 v3 = a->a + diag[row-2] + 3; 2241 v4 = a->a + diag[row-3] + 4; 2242 v5 = a->a + diag[row-4] + 5; 2243 for(n = 0; n<sz-1; n+=2) { 2244 i1 = idx[0]; 2245 i2 = idx[1]; 2246 idx += 2; 2247 tmp0 = x[i1]; 2248 tmp1 = x[i2]; 2249 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2250 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2251 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2252 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2253 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2254 } 2255 2256 if (n == sz-1){ 2257 tmp0 = x[*idx]; 2258 sum1 -= *v1*tmp0; 2259 sum2 -= *v2*tmp0; 2260 sum3 -= *v3*tmp0; 2261 sum4 -= *v4*tmp0; 2262 sum5 -= *v5*tmp0; 2263 } 2264 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 2265 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 2266 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 2267 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 2268 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 2269 row -= 5; 2270 break; 2271 default: 2272 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2273 } 2274 CHKMEMQ; 2275 } 2276 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2277 2278 /* 2279 t = b - D x where D is the block diagonal 2280 */ 2281 cnt = 0; 2282 for (i=0, row=0; i<m; i++) { 2283 CHKMEMQ; 2284 switch (sizes[i]){ 2285 case 1: 2286 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 2287 break; 2288 case 2: 2289 x1 = x[row]; x2 = x[row+1]; 2290 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 2291 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 2292 t[row] = b[row] - tmp1; 2293 t[row+1] = b[row+1] - tmp2; row += 2; 2294 cnt += 4; 2295 break; 2296 case 3: 2297 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 2298 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 2299 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 2300 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 2301 t[row] = b[row] - tmp1; 2302 t[row+1] = b[row+1] - tmp2; 2303 t[row+2] = b[row+2] - tmp3; row += 3; 2304 cnt += 9; 2305 break; 2306 case 4: 2307 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 2308 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 2309 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 2310 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 2311 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 2312 t[row] = b[row] - tmp1; 2313 t[row+1] = b[row+1] - tmp2; 2314 t[row+2] = b[row+2] - tmp3; 2315 t[row+3] = b[row+3] - tmp4; row += 4; 2316 cnt += 16; 2317 break; 2318 case 5: 2319 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 2320 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 2321 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 2322 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 2323 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 2324 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 2325 t[row] = b[row] - tmp1; 2326 t[row+1] = b[row+1] - tmp2; 2327 t[row+2] = b[row+2] - tmp3; 2328 t[row+3] = b[row+3] - tmp4; 2329 t[row+4] = b[row+4] - tmp5;row += 5; 2330 cnt += 25; 2331 break; 2332 default: 2333 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2334 } 2335 CHKMEMQ; 2336 } 2337 ierr = PetscLogFlops(m);CHKERRQ(ierr); 2338 2339 2340 2341 /* 2342 Apply (L + D)^-1 where D is the block diagonal 2343 */ 2344 for (i=0, row=0; i<m; i++) { 2345 sz = diag[row] - ii[row]; 2346 v1 = a->a + ii[row]; 2347 idx = a->j + ii[row]; 2348 CHKMEMQ; 2349 /* see comments for MatMult_Inode() for how this is coded */ 2350 switch (sizes[i]){ 2351 case 1: 2352 2353 sum1 = t[row]; 2354 for(n = 0; n<sz-1; n+=2) { 2355 i1 = idx[0]; 2356 i2 = idx[1]; 2357 idx += 2; 2358 tmp0 = t[i1]; 2359 tmp1 = t[i2]; 2360 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2361 } 2362 2363 if (n == sz-1){ 2364 tmp0 = t[*idx]; 2365 sum1 -= *v1 * tmp0; 2366 } 2367 x[row] += t[row] = sum1*(*ibdiag++); row++; 2368 break; 2369 case 2: 2370 v2 = a->a + ii[row+1]; 2371 sum1 = t[row]; 2372 sum2 = t[row+1]; 2373 for(n = 0; n<sz-1; n+=2) { 2374 i1 = idx[0]; 2375 i2 = idx[1]; 2376 idx += 2; 2377 tmp0 = t[i1]; 2378 tmp1 = t[i2]; 2379 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2380 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2381 } 2382 2383 if (n == sz-1){ 2384 tmp0 = t[*idx]; 2385 sum1 -= v1[0] * tmp0; 2386 sum2 -= v2[0] * tmp0; 2387 } 2388 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2389 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2390 ibdiag += 4; row += 2; 2391 break; 2392 case 3: 2393 v2 = a->a + ii[row+1]; 2394 v3 = a->a + ii[row+2]; 2395 sum1 = t[row]; 2396 sum2 = t[row+1]; 2397 sum3 = t[row+2]; 2398 for(n = 0; n<sz-1; n+=2) { 2399 i1 = idx[0]; 2400 i2 = idx[1]; 2401 idx += 2; 2402 tmp0 = t[i1]; 2403 tmp1 = t[i2]; 2404 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2405 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2406 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2407 } 2408 2409 if (n == sz-1){ 2410 tmp0 = t[*idx]; 2411 sum1 -= v1[0] * tmp0; 2412 sum2 -= v2[0] * tmp0; 2413 sum3 -= v3[0] * tmp0; 2414 } 2415 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2416 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2417 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2418 ibdiag += 9; row += 3; 2419 break; 2420 case 4: 2421 v2 = a->a + ii[row+1]; 2422 v3 = a->a + ii[row+2]; 2423 v4 = a->a + ii[row+3]; 2424 sum1 = t[row]; 2425 sum2 = t[row+1]; 2426 sum3 = t[row+2]; 2427 sum4 = t[row+3]; 2428 for(n = 0; n<sz-1; n+=2) { 2429 i1 = idx[0]; 2430 i2 = idx[1]; 2431 idx += 2; 2432 tmp0 = t[i1]; 2433 tmp1 = t[i2]; 2434 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2435 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2436 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2437 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2438 } 2439 2440 if (n == sz-1){ 2441 tmp0 = t[*idx]; 2442 sum1 -= v1[0] * tmp0; 2443 sum2 -= v2[0] * tmp0; 2444 sum3 -= v3[0] * tmp0; 2445 sum4 -= v4[0] * tmp0; 2446 } 2447 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2448 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2449 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2450 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2451 ibdiag += 16; row += 4; 2452 break; 2453 case 5: 2454 v2 = a->a + ii[row+1]; 2455 v3 = a->a + ii[row+2]; 2456 v4 = a->a + ii[row+3]; 2457 v5 = a->a + ii[row+4]; 2458 sum1 = t[row]; 2459 sum2 = t[row+1]; 2460 sum3 = t[row+2]; 2461 sum4 = t[row+3]; 2462 sum5 = t[row+4]; 2463 for(n = 0; n<sz-1; n+=2) { 2464 i1 = idx[0]; 2465 i2 = idx[1]; 2466 idx += 2; 2467 tmp0 = t[i1]; 2468 tmp1 = t[i2]; 2469 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2470 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2471 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2472 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2473 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2474 } 2475 2476 if (n == sz-1){ 2477 tmp0 = t[*idx]; 2478 sum1 -= v1[0] * tmp0; 2479 sum2 -= v2[0] * tmp0; 2480 sum3 -= v3[0] * tmp0; 2481 sum4 -= v4[0] * tmp0; 2482 sum5 -= v5[0] * tmp0; 2483 } 2484 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2485 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2486 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2487 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2488 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2489 ibdiag += 25; row += 5; 2490 break; 2491 default: 2492 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2493 } 2494 CHKMEMQ; 2495 } 2496 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2497 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2498 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2499 } 2500 PetscFunctionReturn(0); 2501 } 2502 2503 #undef __FUNCT__ 2504 #define __FUNCT__ "MatMultDiagonalBlock_Inode" 2505 PetscErrorCode MatMultDiagonalBlock_Inode(Mat A,Vec bb,Vec xx) 2506 { 2507 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2508 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 2509 const MatScalar *bdiag = a->inode.bdiag; 2510 const PetscScalar *b; 2511 PetscErrorCode ierr; 2512 PetscInt m = a->inode.node_count,cnt = 0,i,row; 2513 const PetscInt *sizes = a->inode.size; 2514 2515 PetscFunctionBegin; 2516 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2517 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2518 cnt = 0; 2519 for (i=0, row=0; i<m; i++) { 2520 switch (sizes[i]){ 2521 case 1: 2522 x[row] = b[row]*bdiag[cnt++];row++; 2523 break; 2524 case 2: 2525 x1 = b[row]; x2 = b[row+1]; 2526 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 2527 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 2528 x[row++] = tmp1; 2529 x[row++] = tmp2; 2530 cnt += 4; 2531 break; 2532 case 3: 2533 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 2534 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 2535 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 2536 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 2537 x[row++] = tmp1; 2538 x[row++] = tmp2; 2539 x[row++] = tmp3; 2540 cnt += 9; 2541 break; 2542 case 4: 2543 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 2544 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 2545 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 2546 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 2547 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 2548 x[row++] = tmp1; 2549 x[row++] = tmp2; 2550 x[row++] = tmp3; 2551 x[row++] = tmp4; 2552 cnt += 16; 2553 break; 2554 case 5: 2555 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 2556 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 2557 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 2558 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 2559 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 2560 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 2561 x[row++] = tmp1; 2562 x[row++] = tmp2; 2563 x[row++] = tmp3; 2564 x[row++] = tmp4; 2565 x[row++] = tmp5; 2566 cnt += 25; 2567 break; 2568 default: 2569 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2570 } 2571 } 2572 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 2573 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2574 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2575 PetscFunctionReturn(0); 2576 } 2577 2578 extern PetscErrorCode MatMultDiagonalBlock_Inode(Mat,Vec,Vec); 2579 /* 2580 samestructure indicates that the matrix has not changed its nonzero structure so we 2581 do not need to recompute the inodes 2582 */ 2583 #undef __FUNCT__ 2584 #define __FUNCT__ "Mat_CheckInode" 2585 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure) 2586 { 2587 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2588 PetscErrorCode ierr; 2589 PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size; 2590 PetscTruth flag; 2591 2592 PetscFunctionBegin; 2593 if (!a->inode.use) PetscFunctionReturn(0); 2594 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 2595 2596 2597 m = A->rmap->n; 2598 if (a->inode.size) {ns = a->inode.size;} 2599 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 2600 2601 i = 0; 2602 node_count = 0; 2603 idx = a->j; 2604 ii = a->i; 2605 while (i < m){ /* For each row */ 2606 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 2607 /* Limits the number of elements in a node to 'a->inode.limit' */ 2608 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 2609 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 2610 if (nzy != nzx) break; 2611 idy += nzx; /* Same nonzero pattern */ 2612 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 2613 if (!flag) break; 2614 } 2615 ns[node_count++] = blk_size; 2616 idx += blk_size*nzx; 2617 i = j; 2618 } 2619 /* If not enough inodes found,, do not use inode version of the routines */ 2620 if (!a->inode.size && m && node_count > .9*m) { 2621 ierr = PetscFree(ns);CHKERRQ(ierr); 2622 a->inode.node_count = 0; 2623 a->inode.size = PETSC_NULL; 2624 a->inode.use = PETSC_FALSE; 2625 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 2626 } else { 2627 A->ops->mult = MatMult_Inode; 2628 A->ops->relax = MatRelax_Inode; 2629 A->ops->multadd = MatMultAdd_Inode; 2630 A->ops->getrowij = MatGetRowIJ_Inode; 2631 A->ops->restorerowij = MatRestoreRowIJ_Inode; 2632 A->ops->getcolumnij = MatGetColumnIJ_Inode; 2633 A->ops->restorecolumnij = MatRestoreColumnIJ_Inode; 2634 A->ops->coloringpatch = MatColoringPatch_Inode; 2635 A->ops->multdiagonalblock = MatMultDiagonalBlock_Inode; 2636 a->inode.node_count = node_count; 2637 a->inode.size = ns; 2638 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 2639 } 2640 PetscFunctionReturn(0); 2641 } 2642 2643 /* 2644 This is really ugly. if inodes are used this replaces the 2645 permutations with ones that correspond to rows/cols of the matrix 2646 rather then inode blocks 2647 */ 2648 #undef __FUNCT__ 2649 #define __FUNCT__ "MatInodeAdjustForInodes" 2650 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 2651 { 2652 PetscErrorCode ierr,(*f)(Mat,IS*,IS*); 2653 2654 PetscFunctionBegin; 2655 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr); 2656 if (f) { 2657 ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr); 2658 } 2659 PetscFunctionReturn(0); 2660 } 2661 2662 EXTERN_C_BEGIN 2663 #undef __FUNCT__ 2664 #define __FUNCT__ "MatAdjustForInodes_Inode" 2665 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_Inode(Mat A,IS *rperm,IS *cperm) 2666 { 2667 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 2668 PetscErrorCode ierr; 2669 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 2670 const PetscInt *ridx,*cidx; 2671 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 2672 PetscInt nslim_col,*ns_col; 2673 IS ris = *rperm,cis = *cperm; 2674 2675 PetscFunctionBegin; 2676 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 2677 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 2678 2679 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 2680 ierr = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 2681 ierr = PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);CHKERRQ(ierr); 2682 permc = permr + m; 2683 2684 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 2685 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 2686 2687 /* Form the inode structure for the rows of permuted matric using inv perm*/ 2688 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 2689 2690 /* Construct the permutations for rows*/ 2691 for (i=0,row = 0; i<nslim_row; ++i){ 2692 indx = ridx[i]; 2693 start_val = tns[indx]; 2694 end_val = tns[indx + 1]; 2695 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 2696 } 2697 2698 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 2699 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 2700 2701 /* Construct permutations for columns */ 2702 for (i=0,col=0; i<nslim_col; ++i){ 2703 indx = cidx[i]; 2704 start_val = tns[indx]; 2705 end_val = tns[indx + 1]; 2706 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 2707 } 2708 2709 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr); 2710 ISSetPermutation(*rperm); 2711 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr); 2712 ISSetPermutation(*cperm); 2713 2714 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 2715 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 2716 2717 ierr = PetscFree(ns_col);CHKERRQ(ierr); 2718 ierr = PetscFree(permr);CHKERRQ(ierr); 2719 ierr = ISDestroy(cis);CHKERRQ(ierr); 2720 ierr = ISDestroy(ris);CHKERRQ(ierr); 2721 ierr = PetscFree(tns);CHKERRQ(ierr); 2722 PetscFunctionReturn(0); 2723 } 2724 EXTERN_C_END 2725 2726 #undef __FUNCT__ 2727 #define __FUNCT__ "MatInodeGetInodeSizes" 2728 /*@C 2729 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 2730 2731 Collective on Mat 2732 2733 Input Parameter: 2734 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 2735 2736 Output Parameter: 2737 + node_count - no of inodes present in the matrix. 2738 . sizes - an array of size node_count,with sizes of each inode. 2739 - limit - the max size used to generate the inodes. 2740 2741 Level: advanced 2742 2743 Notes: This routine returns some internal storage information 2744 of the matrix, it is intended to be used by advanced users. 2745 It should be called after the matrix is assembled. 2746 The contents of the sizes[] array should not be changed. 2747 PETSC_NULL may be passed for information not requested. 2748 2749 .keywords: matrix, seqaij, get, inode 2750 2751 .seealso: MatGetInfo() 2752 @*/ 2753 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 2754 { 2755 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 2756 2757 PetscFunctionBegin; 2758 if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 2759 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr); 2760 if (f) { 2761 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 2762 } 2763 PetscFunctionReturn(0); 2764 } 2765 2766 EXTERN_C_BEGIN 2767 #undef __FUNCT__ 2768 #define __FUNCT__ "MatInodeGetInodeSizes_Inode" 2769 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 2770 { 2771 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2772 2773 PetscFunctionBegin; 2774 if (node_count) *node_count = a->inode.node_count; 2775 if (sizes) *sizes = a->inode.size; 2776 if (limit) *limit = a->inode.limit; 2777 PetscFunctionReturn(0); 2778 } 2779 EXTERN_C_END 2780