1 #define PETSCMAT_DLL 2 3 /* 4 This file provides high performance routines for the Inode format (compressed sparse row) 5 by taking advantage of rows with identical nonzero structure (I-nodes). 6 */ 7 #include "../src/mat/impls/aij/seq/aij.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "Mat_CreateColInode" 11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns) 12 { 13 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 16 17 PetscFunctionBegin; 18 n = A->cmap->n; 19 m = A->rmap->n; 20 ns_row = a->inode.size; 21 22 min_mn = (m < n) ? m : n; 23 if (!ns) { 24 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++); 25 for(; count+1 < n; count++,i++); 26 if (count < n) { 27 i++; 28 } 29 *size = i; 30 PetscFunctionReturn(0); 31 } 32 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr); 33 34 /* Use the same row structure wherever feasible. */ 35 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 36 ns_col[i] = ns_row[i]; 37 } 38 39 /* if m < n; pad up the remainder with inode_limit */ 40 for(; count+1 < n; count++,i++) { 41 ns_col[i] = 1; 42 } 43 /* The last node is the odd ball. padd it up with the remaining rows; */ 44 if (count < n) { 45 ns_col[i] = n - count; 46 i++; 47 } else if (count > n) { 48 /* Adjust for the over estimation */ 49 ns_col[i-1] += n - count; 50 } 51 *size = i; 52 *ns = ns_col; 53 PetscFunctionReturn(0); 54 } 55 56 57 /* 58 This builds symmetric version of nonzero structure, 59 */ 60 #undef __FUNCT__ 61 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric" 62 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 63 { 64 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 65 PetscErrorCode ierr; 66 PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n; 67 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j; 68 69 PetscFunctionBegin; 70 nslim_row = a->inode.node_count; 71 m = A->rmap->n; 72 n = A->cmap->n; 73 if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 74 75 /* Use the row_inode as column_inode */ 76 nslim_col = nslim_row; 77 ns_col = ns_row; 78 79 /* allocate space for reformated inode structure */ 80 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 81 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 82 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 83 84 for (i1=0,col=0; i1<nslim_col; ++i1){ 85 nsz = ns_col[i1]; 86 for (i2=0; i2<nsz; ++i2,++col) 87 tvc[col] = i1; 88 } 89 /* allocate space for row pointers */ 90 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 91 *iia = ia; 92 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 93 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 94 95 /* determine the number of columns in each row */ 96 ia[0] = oshift; 97 for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) { 98 99 j = aj + ai[row] + ishift; 100 jmax = aj + ai[row+1] + ishift; 101 i2 = 0; 102 col = *j++ + ishift; 103 i2 = tvc[col]; 104 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 105 ia[i1+1]++; 106 ia[i2+1]++; 107 i2++; /* Start col of next node */ 108 while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j; 109 i2 = tvc[col]; 110 } 111 if(i2 == i1) ia[i2+1]++; /* now the diagonal element */ 112 } 113 114 /* shift ia[i] to point to next row */ 115 for (i1=1; i1<nslim_row+1; i1++) { 116 row = ia[i1-1]; 117 ia[i1] += row; 118 work[i1-1] = row - oshift; 119 } 120 121 /* allocate space for column pointers */ 122 nz = ia[nslim_row] + (!ishift); 123 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 124 *jja = ja; 125 126 /* loop over lower triangular part putting into ja */ 127 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 128 j = aj + ai[row] + ishift; 129 jmax = aj + ai[row+1] + ishift; 130 i2 = 0; /* Col inode index */ 131 col = *j++ + ishift; 132 i2 = tvc[col]; 133 while (i2<i1 && j<jmax) { 134 ja[work[i2]++] = i1 + oshift; 135 ja[work[i1]++] = i2 + oshift; 136 ++i2; 137 while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */ 138 i2 = tvc[col]; 139 } 140 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 141 142 } 143 ierr = PetscFree(work);CHKERRQ(ierr); 144 ierr = PetscFree(tns);CHKERRQ(ierr); 145 ierr = PetscFree(tvc);CHKERRQ(ierr); 146 PetscFunctionReturn(0); 147 } 148 149 /* 150 This builds nonsymmetric version of nonzero structure, 151 */ 152 #undef __FUNCT__ 153 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric" 154 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 155 { 156 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 157 PetscErrorCode ierr; 158 PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col; 159 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 160 161 PetscFunctionBegin; 162 nslim_row = a->inode.node_count; 163 n = A->cmap->n; 164 165 /* Create The column_inode for this matrix */ 166 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 167 168 /* allocate space for reformated column_inode structure */ 169 ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 170 ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 171 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 172 173 for (i1=0,col=0; i1<nslim_col; ++i1){ 174 nsz = ns_col[i1]; 175 for (i2=0; i2<nsz; ++i2,++col) 176 tvc[col] = i1; 177 } 178 /* allocate space for row pointers */ 179 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 180 *iia = ia; 181 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 182 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 183 184 /* determine the number of columns in each row */ 185 ia[0] = oshift; 186 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 187 j = aj + ai[row] + ishift; 188 col = *j++ + ishift; 189 i2 = tvc[col]; 190 nz = ai[row+1] - ai[row]; 191 while (nz-- > 0) { /* off-diagonal elemets */ 192 ia[i1+1]++; 193 i2++; /* Start col of next node */ 194 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 195 if (nz > 0) i2 = tvc[col]; 196 } 197 } 198 199 /* shift ia[i] to point to next row */ 200 for (i1=1; i1<nslim_row+1; i1++) { 201 row = ia[i1-1]; 202 ia[i1] += row; 203 work[i1-1] = row - oshift; 204 } 205 206 /* allocate space for column pointers */ 207 nz = ia[nslim_row] + (!ishift); 208 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 209 *jja = ja; 210 211 /* loop over matrix putting into ja */ 212 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 213 j = aj + ai[row] + ishift; 214 i2 = 0; /* Col inode index */ 215 col = *j++ + ishift; 216 i2 = tvc[col]; 217 nz = ai[row+1] - ai[row]; 218 while (nz-- > 0) { 219 ja[work[i1]++] = i2 + oshift; 220 ++i2; 221 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 222 if (nz > 0) i2 = tvc[col]; 223 } 224 } 225 ierr = PetscFree(ns_col);CHKERRQ(ierr); 226 ierr = PetscFree(work);CHKERRQ(ierr); 227 ierr = PetscFree(tns);CHKERRQ(ierr); 228 ierr = PetscFree(tvc);CHKERRQ(ierr); 229 PetscFunctionReturn(0); 230 } 231 232 #undef __FUNCT__ 233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode" 234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 235 { 236 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 237 PetscErrorCode ierr; 238 239 PetscFunctionBegin; 240 *n = a->inode.node_count; 241 if (!ia) PetscFunctionReturn(0); 242 if (!blockcompressed) { 243 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 244 } else if (symmetric) { 245 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 246 } else { 247 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 248 } 249 PetscFunctionReturn(0); 250 } 251 252 #undef __FUNCT__ 253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode" 254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 255 { 256 PetscErrorCode ierr; 257 258 PetscFunctionBegin; 259 if (!ia) PetscFunctionReturn(0); 260 261 if (!blockcompressed) { 262 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 263 } else { 264 ierr = PetscFree(*ia);CHKERRQ(ierr); 265 ierr = PetscFree(*ja);CHKERRQ(ierr); 266 } 267 268 PetscFunctionReturn(0); 269 } 270 271 /* ----------------------------------------------------------- */ 272 273 #undef __FUNCT__ 274 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric" 275 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 276 { 277 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 278 PetscErrorCode ierr; 279 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 280 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 281 282 PetscFunctionBegin; 283 nslim_row = a->inode.node_count; 284 n = A->cmap->n; 285 286 /* Create The column_inode for this matrix */ 287 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 288 289 /* allocate space for reformated column_inode structure */ 290 ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 291 ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 292 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 293 294 for (i1=0,col=0; i1<nslim_col; ++i1){ 295 nsz = ns_col[i1]; 296 for (i2=0; i2<nsz; ++i2,++col) 297 tvc[col] = i1; 298 } 299 /* allocate space for column pointers */ 300 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 301 *iia = ia; 302 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 303 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 304 305 /* determine the number of columns in each row */ 306 ia[0] = oshift; 307 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 308 j = aj + ai[row] + ishift; 309 col = *j++ + ishift; 310 i2 = tvc[col]; 311 nz = ai[row+1] - ai[row]; 312 while (nz-- > 0) { /* off-diagonal elemets */ 313 /* ia[i1+1]++; */ 314 ia[i2+1]++; 315 i2++; 316 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 317 if (nz > 0) i2 = tvc[col]; 318 } 319 } 320 321 /* shift ia[i] to point to next col */ 322 for (i1=1; i1<nslim_col+1; i1++) { 323 col = ia[i1-1]; 324 ia[i1] += col; 325 work[i1-1] = col - oshift; 326 } 327 328 /* allocate space for column pointers */ 329 nz = ia[nslim_col] + (!ishift); 330 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 331 *jja = ja; 332 333 /* loop over matrix putting into ja */ 334 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 335 j = aj + ai[row] + ishift; 336 i2 = 0; /* Col inode index */ 337 col = *j++ + ishift; 338 i2 = tvc[col]; 339 nz = ai[row+1] - ai[row]; 340 while (nz-- > 0) { 341 /* ja[work[i1]++] = i2 + oshift; */ 342 ja[work[i2]++] = i1 + oshift; 343 i2++; 344 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 345 if (nz > 0) i2 = tvc[col]; 346 } 347 } 348 ierr = PetscFree(ns_col);CHKERRQ(ierr); 349 ierr = PetscFree(work);CHKERRQ(ierr); 350 ierr = PetscFree(tns);CHKERRQ(ierr); 351 ierr = PetscFree(tvc);CHKERRQ(ierr); 352 PetscFunctionReturn(0); 353 } 354 355 #undef __FUNCT__ 356 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode" 357 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 358 { 359 PetscErrorCode ierr; 360 361 PetscFunctionBegin; 362 ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr); 363 if (!ia) PetscFunctionReturn(0); 364 365 if (!blockcompressed) { 366 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 367 } else if (symmetric) { 368 /* Since the indices are symmetric it does'nt matter */ 369 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 370 } else { 371 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 372 } 373 PetscFunctionReturn(0); 374 } 375 376 #undef __FUNCT__ 377 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode" 378 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 379 { 380 PetscErrorCode ierr; 381 382 PetscFunctionBegin; 383 if (!ia) PetscFunctionReturn(0); 384 if (!blockcompressed) { 385 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 386 } else { 387 ierr = PetscFree(*ia);CHKERRQ(ierr); 388 ierr = PetscFree(*ja);CHKERRQ(ierr); 389 } 390 PetscFunctionReturn(0); 391 } 392 393 /* ----------------------------------------------------------- */ 394 395 #undef __FUNCT__ 396 #define __FUNCT__ "MatMult_SeqAIJ_Inode" 397 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 398 { 399 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 400 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 401 PetscScalar *y; 402 const PetscScalar *x; 403 const MatScalar *v1,*v2,*v3,*v4,*v5; 404 PetscErrorCode ierr; 405 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0; 406 407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 409 #endif 410 411 PetscFunctionBegin; 412 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 413 node_max = a->inode.node_count; 414 ns = a->inode.size; /* Node Size array */ 415 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 416 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 417 idx = a->j; 418 v1 = a->a; 419 ii = a->i; 420 421 for (i = 0,row = 0; i< node_max; ++i){ 422 nsz = ns[i]; 423 n = ii[1] - ii[0]; 424 nonzerorow += (n>0)*nsz; 425 ii += nsz; 426 PetscPrefetchBlock(idx+nsz*n,n,0,0); /* Prefetch the indices for the block row after the current one */ 427 PetscPrefetchBlock(v1+nsz*n,nsz*n,0,0); /* Prefetch the values for the block row after the current one */ 428 sz = n; /* No of non zeros in this row */ 429 /* Switch on the size of Node */ 430 switch (nsz){ /* Each loop in 'case' is unrolled */ 431 case 1 : 432 sum1 = 0.; 433 434 for(n = 0; n< sz-1; n+=2) { 435 i1 = idx[0]; /* The instructions are ordered to */ 436 i2 = idx[1]; /* make the compiler's job easy */ 437 idx += 2; 438 tmp0 = x[i1]; 439 tmp1 = x[i2]; 440 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 441 } 442 443 if (n == sz-1){ /* Take care of the last nonzero */ 444 tmp0 = x[*idx++]; 445 sum1 += *v1++ * tmp0; 446 } 447 y[row++]=sum1; 448 break; 449 case 2: 450 sum1 = 0.; 451 sum2 = 0.; 452 v2 = v1 + n; 453 454 for (n = 0; n< sz-1; n+=2) { 455 i1 = idx[0]; 456 i2 = idx[1]; 457 idx += 2; 458 tmp0 = x[i1]; 459 tmp1 = x[i2]; 460 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 461 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 462 } 463 if (n == sz-1){ 464 tmp0 = x[*idx++]; 465 sum1 += *v1++ * tmp0; 466 sum2 += *v2++ * tmp0; 467 } 468 y[row++]=sum1; 469 y[row++]=sum2; 470 v1 =v2; /* Since the next block to be processed starts there*/ 471 idx +=sz; 472 break; 473 case 3: 474 sum1 = 0.; 475 sum2 = 0.; 476 sum3 = 0.; 477 v2 = v1 + n; 478 v3 = v2 + n; 479 480 for (n = 0; n< sz-1; n+=2) { 481 i1 = idx[0]; 482 i2 = idx[1]; 483 idx += 2; 484 tmp0 = x[i1]; 485 tmp1 = x[i2]; 486 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 487 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 488 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 489 } 490 if (n == sz-1){ 491 tmp0 = x[*idx++]; 492 sum1 += *v1++ * tmp0; 493 sum2 += *v2++ * tmp0; 494 sum3 += *v3++ * tmp0; 495 } 496 y[row++]=sum1; 497 y[row++]=sum2; 498 y[row++]=sum3; 499 v1 =v3; /* Since the next block to be processed starts there*/ 500 idx +=2*sz; 501 break; 502 case 4: 503 sum1 = 0.; 504 sum2 = 0.; 505 sum3 = 0.; 506 sum4 = 0.; 507 v2 = v1 + n; 508 v3 = v2 + n; 509 v4 = v3 + n; 510 511 for (n = 0; n< sz-1; n+=2) { 512 i1 = idx[0]; 513 i2 = idx[1]; 514 idx += 2; 515 tmp0 = x[i1]; 516 tmp1 = x[i2]; 517 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 518 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 519 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 520 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 521 } 522 if (n == sz-1){ 523 tmp0 = x[*idx++]; 524 sum1 += *v1++ * tmp0; 525 sum2 += *v2++ * tmp0; 526 sum3 += *v3++ * tmp0; 527 sum4 += *v4++ * tmp0; 528 } 529 y[row++]=sum1; 530 y[row++]=sum2; 531 y[row++]=sum3; 532 y[row++]=sum4; 533 v1 =v4; /* Since the next block to be processed starts there*/ 534 idx +=3*sz; 535 break; 536 case 5: 537 sum1 = 0.; 538 sum2 = 0.; 539 sum3 = 0.; 540 sum4 = 0.; 541 sum5 = 0.; 542 v2 = v1 + n; 543 v3 = v2 + n; 544 v4 = v3 + n; 545 v5 = v4 + n; 546 547 for (n = 0; n<sz-1; n+=2) { 548 i1 = idx[0]; 549 i2 = idx[1]; 550 idx += 2; 551 tmp0 = x[i1]; 552 tmp1 = x[i2]; 553 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 554 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 555 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 556 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 557 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 558 } 559 if (n == sz-1){ 560 tmp0 = x[*idx++]; 561 sum1 += *v1++ * tmp0; 562 sum2 += *v2++ * tmp0; 563 sum3 += *v3++ * tmp0; 564 sum4 += *v4++ * tmp0; 565 sum5 += *v5++ * tmp0; 566 } 567 y[row++]=sum1; 568 y[row++]=sum2; 569 y[row++]=sum3; 570 y[row++]=sum4; 571 y[row++]=sum5; 572 v1 =v5; /* Since the next block to be processed starts there */ 573 idx +=4*sz; 574 break; 575 default : 576 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 577 } 578 } 579 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 580 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 581 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 582 PetscFunctionReturn(0); 583 } 584 /* ----------------------------------------------------------- */ 585 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 586 #undef __FUNCT__ 587 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode" 588 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 589 { 590 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 591 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 592 MatScalar *v1,*v2,*v3,*v4,*v5; 593 PetscScalar *x,*y,*z,*zt; 594 PetscErrorCode ierr; 595 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; 596 597 PetscFunctionBegin; 598 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 599 node_max = a->inode.node_count; 600 ns = a->inode.size; /* Node Size array */ 601 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 602 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 603 if (zz != yy) { 604 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 605 } else { 606 z = y; 607 } 608 zt = z; 609 610 idx = a->j; 611 v1 = a->a; 612 ii = a->i; 613 614 for (i = 0,row = 0; i< node_max; ++i){ 615 nsz = ns[i]; 616 n = ii[1] - ii[0]; 617 ii += nsz; 618 sz = n; /* No of non zeros in this row */ 619 /* Switch on the size of Node */ 620 switch (nsz){ /* Each loop in 'case' is unrolled */ 621 case 1 : 622 sum1 = *zt++; 623 624 for(n = 0; n< sz-1; n+=2) { 625 i1 = idx[0]; /* The instructions are ordered to */ 626 i2 = idx[1]; /* make the compiler's job easy */ 627 idx += 2; 628 tmp0 = x[i1]; 629 tmp1 = x[i2]; 630 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 631 } 632 633 if(n == sz-1){ /* Take care of the last nonzero */ 634 tmp0 = x[*idx++]; 635 sum1 += *v1++ * tmp0; 636 } 637 y[row++]=sum1; 638 break; 639 case 2: 640 sum1 = *zt++; 641 sum2 = *zt++; 642 v2 = v1 + n; 643 644 for(n = 0; n< sz-1; n+=2) { 645 i1 = idx[0]; 646 i2 = idx[1]; 647 idx += 2; 648 tmp0 = x[i1]; 649 tmp1 = x[i2]; 650 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 651 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 652 } 653 if(n == sz-1){ 654 tmp0 = x[*idx++]; 655 sum1 += *v1++ * tmp0; 656 sum2 += *v2++ * tmp0; 657 } 658 y[row++]=sum1; 659 y[row++]=sum2; 660 v1 =v2; /* Since the next block to be processed starts there*/ 661 idx +=sz; 662 break; 663 case 3: 664 sum1 = *zt++; 665 sum2 = *zt++; 666 sum3 = *zt++; 667 v2 = v1 + n; 668 v3 = v2 + n; 669 670 for (n = 0; n< sz-1; n+=2) { 671 i1 = idx[0]; 672 i2 = idx[1]; 673 idx += 2; 674 tmp0 = x[i1]; 675 tmp1 = x[i2]; 676 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 677 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 678 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 679 } 680 if (n == sz-1){ 681 tmp0 = x[*idx++]; 682 sum1 += *v1++ * tmp0; 683 sum2 += *v2++ * tmp0; 684 sum3 += *v3++ * tmp0; 685 } 686 y[row++]=sum1; 687 y[row++]=sum2; 688 y[row++]=sum3; 689 v1 =v3; /* Since the next block to be processed starts there*/ 690 idx +=2*sz; 691 break; 692 case 4: 693 sum1 = *zt++; 694 sum2 = *zt++; 695 sum3 = *zt++; 696 sum4 = *zt++; 697 v2 = v1 + n; 698 v3 = v2 + n; 699 v4 = v3 + n; 700 701 for (n = 0; n< sz-1; n+=2) { 702 i1 = idx[0]; 703 i2 = idx[1]; 704 idx += 2; 705 tmp0 = x[i1]; 706 tmp1 = x[i2]; 707 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 708 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 709 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 710 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 711 } 712 if (n == sz-1){ 713 tmp0 = x[*idx++]; 714 sum1 += *v1++ * tmp0; 715 sum2 += *v2++ * tmp0; 716 sum3 += *v3++ * tmp0; 717 sum4 += *v4++ * tmp0; 718 } 719 y[row++]=sum1; 720 y[row++]=sum2; 721 y[row++]=sum3; 722 y[row++]=sum4; 723 v1 =v4; /* Since the next block to be processed starts there*/ 724 idx +=3*sz; 725 break; 726 case 5: 727 sum1 = *zt++; 728 sum2 = *zt++; 729 sum3 = *zt++; 730 sum4 = *zt++; 731 sum5 = *zt++; 732 v2 = v1 + n; 733 v3 = v2 + n; 734 v4 = v3 + n; 735 v5 = v4 + n; 736 737 for (n = 0; n<sz-1; n+=2) { 738 i1 = idx[0]; 739 i2 = idx[1]; 740 idx += 2; 741 tmp0 = x[i1]; 742 tmp1 = x[i2]; 743 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 744 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 745 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 746 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 747 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 748 } 749 if(n == sz-1){ 750 tmp0 = x[*idx++]; 751 sum1 += *v1++ * tmp0; 752 sum2 += *v2++ * tmp0; 753 sum3 += *v3++ * tmp0; 754 sum4 += *v4++ * tmp0; 755 sum5 += *v5++ * tmp0; 756 } 757 y[row++]=sum1; 758 y[row++]=sum2; 759 y[row++]=sum3; 760 y[row++]=sum4; 761 y[row++]=sum5; 762 v1 =v5; /* Since the next block to be processed starts there */ 763 idx +=4*sz; 764 break; 765 default : 766 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 767 } 768 } 769 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 770 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 771 if (zz != yy) { 772 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 773 } 774 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 775 PetscFunctionReturn(0); 776 } 777 778 /* ----------------------------------------------------------- */ 779 #undef __FUNCT__ 780 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace" 781 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx) 782 { 783 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 784 IS iscol = a->col,isrow = a->row; 785 PetscErrorCode ierr; 786 const PetscInt *r,*c,*rout,*cout; 787 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 788 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 789 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 790 PetscScalar sum1,sum2,sum3,sum4,sum5; 791 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 792 const PetscScalar *b; 793 794 PetscFunctionBegin; 795 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 796 node_max = a->inode.node_count; 797 ns = a->inode.size; /* Node Size array */ 798 799 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 800 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 801 tmp = a->solve_work; 802 803 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 804 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 805 806 /* forward solve the lower triangular */ 807 tmps = tmp ; 808 aa = a_a ; 809 aj = a_j ; 810 ad = a->diag; 811 812 for (i = 0,row = 0; i< node_max; ++i){ 813 nsz = ns[i]; 814 aii = ai[row]; 815 v1 = aa + aii; 816 vi = aj + aii; 817 nz = ad[row]- aii; 818 if (i < node_max-1) { 819 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 820 * but our indexing to determine it's size could. */ 821 PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,0); /* indices */ 822 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 823 PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,0); 824 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 825 } 826 827 switch (nsz){ /* Each loop in 'case' is unrolled */ 828 case 1 : 829 sum1 = b[*r++]; 830 for(j=0; j<nz-1; j+=2){ 831 i0 = vi[0]; 832 i1 = vi[1]; 833 vi +=2; 834 tmp0 = tmps[i0]; 835 tmp1 = tmps[i1]; 836 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 837 } 838 if(j == nz-1){ 839 tmp0 = tmps[*vi++]; 840 sum1 -= *v1++ *tmp0; 841 } 842 tmp[row ++]=sum1; 843 break; 844 case 2: 845 sum1 = b[*r++]; 846 sum2 = b[*r++]; 847 v2 = aa + ai[row+1]; 848 849 for(j=0; j<nz-1; j+=2){ 850 i0 = vi[0]; 851 i1 = vi[1]; 852 vi +=2; 853 tmp0 = tmps[i0]; 854 tmp1 = tmps[i1]; 855 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 856 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 857 } 858 if(j == nz-1){ 859 tmp0 = tmps[*vi++]; 860 sum1 -= *v1++ *tmp0; 861 sum2 -= *v2++ *tmp0; 862 } 863 sum2 -= *v2++ * sum1; 864 tmp[row ++]=sum1; 865 tmp[row ++]=sum2; 866 break; 867 case 3: 868 sum1 = b[*r++]; 869 sum2 = b[*r++]; 870 sum3 = b[*r++]; 871 v2 = aa + ai[row+1]; 872 v3 = aa + ai[row+2]; 873 874 for (j=0; j<nz-1; j+=2){ 875 i0 = vi[0]; 876 i1 = vi[1]; 877 vi +=2; 878 tmp0 = tmps[i0]; 879 tmp1 = tmps[i1]; 880 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 881 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 882 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 883 } 884 if (j == nz-1){ 885 tmp0 = tmps[*vi++]; 886 sum1 -= *v1++ *tmp0; 887 sum2 -= *v2++ *tmp0; 888 sum3 -= *v3++ *tmp0; 889 } 890 sum2 -= *v2++ * sum1; 891 sum3 -= *v3++ * sum1; 892 sum3 -= *v3++ * sum2; 893 tmp[row ++]=sum1; 894 tmp[row ++]=sum2; 895 tmp[row ++]=sum3; 896 break; 897 898 case 4: 899 sum1 = b[*r++]; 900 sum2 = b[*r++]; 901 sum3 = b[*r++]; 902 sum4 = b[*r++]; 903 v2 = aa + ai[row+1]; 904 v3 = aa + ai[row+2]; 905 v4 = aa + ai[row+3]; 906 907 for (j=0; j<nz-1; j+=2){ 908 i0 = vi[0]; 909 i1 = vi[1]; 910 vi +=2; 911 tmp0 = tmps[i0]; 912 tmp1 = tmps[i1]; 913 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 914 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 915 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 916 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 917 } 918 if (j == nz-1){ 919 tmp0 = tmps[*vi++]; 920 sum1 -= *v1++ *tmp0; 921 sum2 -= *v2++ *tmp0; 922 sum3 -= *v3++ *tmp0; 923 sum4 -= *v4++ *tmp0; 924 } 925 sum2 -= *v2++ * sum1; 926 sum3 -= *v3++ * sum1; 927 sum4 -= *v4++ * sum1; 928 sum3 -= *v3++ * sum2; 929 sum4 -= *v4++ * sum2; 930 sum4 -= *v4++ * sum3; 931 932 tmp[row ++]=sum1; 933 tmp[row ++]=sum2; 934 tmp[row ++]=sum3; 935 tmp[row ++]=sum4; 936 break; 937 case 5: 938 sum1 = b[*r++]; 939 sum2 = b[*r++]; 940 sum3 = b[*r++]; 941 sum4 = b[*r++]; 942 sum5 = b[*r++]; 943 v2 = aa + ai[row+1]; 944 v3 = aa + ai[row+2]; 945 v4 = aa + ai[row+3]; 946 v5 = aa + ai[row+4]; 947 948 for (j=0; j<nz-1; j+=2){ 949 i0 = vi[0]; 950 i1 = vi[1]; 951 vi +=2; 952 tmp0 = tmps[i0]; 953 tmp1 = tmps[i1]; 954 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 955 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 956 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 957 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 958 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 959 } 960 if (j == nz-1){ 961 tmp0 = tmps[*vi++]; 962 sum1 -= *v1++ *tmp0; 963 sum2 -= *v2++ *tmp0; 964 sum3 -= *v3++ *tmp0; 965 sum4 -= *v4++ *tmp0; 966 sum5 -= *v5++ *tmp0; 967 } 968 969 sum2 -= *v2++ * sum1; 970 sum3 -= *v3++ * sum1; 971 sum4 -= *v4++ * sum1; 972 sum5 -= *v5++ * sum1; 973 sum3 -= *v3++ * sum2; 974 sum4 -= *v4++ * sum2; 975 sum5 -= *v5++ * sum2; 976 sum4 -= *v4++ * sum3; 977 sum5 -= *v5++ * sum3; 978 sum5 -= *v5++ * sum4; 979 980 tmp[row ++]=sum1; 981 tmp[row ++]=sum2; 982 tmp[row ++]=sum3; 983 tmp[row ++]=sum4; 984 tmp[row ++]=sum5; 985 break; 986 default: 987 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 988 } 989 } 990 /* backward solve the upper triangular */ 991 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 992 nsz = ns[i]; 993 aii = ai[row+1] -1; 994 v1 = aa + aii; 995 vi = aj + aii; 996 nz = aii- ad[row]; 997 switch (nsz){ /* Each loop in 'case' is unrolled */ 998 case 1 : 999 sum1 = tmp[row]; 1000 1001 for(j=nz ; j>1; j-=2){ 1002 vi -=2; 1003 i0 = vi[2]; 1004 i1 = vi[1]; 1005 tmp0 = tmps[i0]; 1006 tmp1 = tmps[i1]; 1007 v1 -= 2; 1008 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1009 } 1010 if (j==1){ 1011 tmp0 = tmps[*vi--]; 1012 sum1 -= *v1-- * tmp0; 1013 } 1014 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1015 break; 1016 case 2 : 1017 sum1 = tmp[row]; 1018 sum2 = tmp[row -1]; 1019 v2 = aa + ai[row]-1; 1020 for (j=nz ; j>1; j-=2){ 1021 vi -=2; 1022 i0 = vi[2]; 1023 i1 = vi[1]; 1024 tmp0 = tmps[i0]; 1025 tmp1 = tmps[i1]; 1026 v1 -= 2; 1027 v2 -= 2; 1028 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1029 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1030 } 1031 if (j==1){ 1032 tmp0 = tmps[*vi--]; 1033 sum1 -= *v1-- * tmp0; 1034 sum2 -= *v2-- * tmp0; 1035 } 1036 1037 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1038 sum2 -= *v2-- * tmp0; 1039 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1040 break; 1041 case 3 : 1042 sum1 = tmp[row]; 1043 sum2 = tmp[row -1]; 1044 sum3 = tmp[row -2]; 1045 v2 = aa + ai[row]-1; 1046 v3 = aa + ai[row -1]-1; 1047 for (j=nz ; j>1; j-=2){ 1048 vi -=2; 1049 i0 = vi[2]; 1050 i1 = vi[1]; 1051 tmp0 = tmps[i0]; 1052 tmp1 = tmps[i1]; 1053 v1 -= 2; 1054 v2 -= 2; 1055 v3 -= 2; 1056 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1057 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1058 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1059 } 1060 if (j==1){ 1061 tmp0 = tmps[*vi--]; 1062 sum1 -= *v1-- * tmp0; 1063 sum2 -= *v2-- * tmp0; 1064 sum3 -= *v3-- * tmp0; 1065 } 1066 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1067 sum2 -= *v2-- * tmp0; 1068 sum3 -= *v3-- * tmp0; 1069 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1070 sum3 -= *v3-- * tmp0; 1071 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1072 1073 break; 1074 case 4 : 1075 sum1 = tmp[row]; 1076 sum2 = tmp[row -1]; 1077 sum3 = tmp[row -2]; 1078 sum4 = tmp[row -3]; 1079 v2 = aa + ai[row]-1; 1080 v3 = aa + ai[row -1]-1; 1081 v4 = aa + ai[row -2]-1; 1082 1083 for (j=nz ; j>1; j-=2){ 1084 vi -=2; 1085 i0 = vi[2]; 1086 i1 = vi[1]; 1087 tmp0 = tmps[i0]; 1088 tmp1 = tmps[i1]; 1089 v1 -= 2; 1090 v2 -= 2; 1091 v3 -= 2; 1092 v4 -= 2; 1093 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1094 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1095 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1096 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1097 } 1098 if (j==1){ 1099 tmp0 = tmps[*vi--]; 1100 sum1 -= *v1-- * tmp0; 1101 sum2 -= *v2-- * tmp0; 1102 sum3 -= *v3-- * tmp0; 1103 sum4 -= *v4-- * tmp0; 1104 } 1105 1106 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1107 sum2 -= *v2-- * tmp0; 1108 sum3 -= *v3-- * tmp0; 1109 sum4 -= *v4-- * tmp0; 1110 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1111 sum3 -= *v3-- * tmp0; 1112 sum4 -= *v4-- * tmp0; 1113 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1114 sum4 -= *v4-- * tmp0; 1115 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1116 break; 1117 case 5 : 1118 sum1 = tmp[row]; 1119 sum2 = tmp[row -1]; 1120 sum3 = tmp[row -2]; 1121 sum4 = tmp[row -3]; 1122 sum5 = tmp[row -4]; 1123 v2 = aa + ai[row]-1; 1124 v3 = aa + ai[row -1]-1; 1125 v4 = aa + ai[row -2]-1; 1126 v5 = aa + ai[row -3]-1; 1127 for (j=nz ; j>1; j-=2){ 1128 vi -= 2; 1129 i0 = vi[2]; 1130 i1 = vi[1]; 1131 tmp0 = tmps[i0]; 1132 tmp1 = tmps[i1]; 1133 v1 -= 2; 1134 v2 -= 2; 1135 v3 -= 2; 1136 v4 -= 2; 1137 v5 -= 2; 1138 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1139 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1140 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1141 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1142 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1143 } 1144 if (j==1){ 1145 tmp0 = tmps[*vi--]; 1146 sum1 -= *v1-- * tmp0; 1147 sum2 -= *v2-- * tmp0; 1148 sum3 -= *v3-- * tmp0; 1149 sum4 -= *v4-- * tmp0; 1150 sum5 -= *v5-- * tmp0; 1151 } 1152 1153 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1154 sum2 -= *v2-- * tmp0; 1155 sum3 -= *v3-- * tmp0; 1156 sum4 -= *v4-- * tmp0; 1157 sum5 -= *v5-- * tmp0; 1158 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1159 sum3 -= *v3-- * tmp0; 1160 sum4 -= *v4-- * tmp0; 1161 sum5 -= *v5-- * tmp0; 1162 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1163 sum4 -= *v4-- * tmp0; 1164 sum5 -= *v5-- * tmp0; 1165 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1166 sum5 -= *v5-- * tmp0; 1167 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1168 break; 1169 default: 1170 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1171 } 1172 } 1173 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1174 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1175 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1176 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1177 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1178 PetscFunctionReturn(0); 1179 } 1180 1181 1182 #undef __FUNCT__ 1183 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace" 1184 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info) 1185 { 1186 Mat C = B; 1187 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1188 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1189 PetscErrorCode ierr; 1190 const PetscInt *r,*ic,*c,*ics; 1191 PetscInt n = A->rmap->n,*bi = b->i; 1192 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1193 PetscInt i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz; 1194 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1195 PetscScalar mul1,mul2,mul3,tmp; 1196 MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; 1197 const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; 1198 PetscReal rs=0.0; 1199 FactorShiftCtx sctx; 1200 PetscInt newshift; 1201 1202 PetscFunctionBegin; 1203 sctx.shift_top = 0; 1204 sctx.nshift_max = 0; 1205 sctx.shift_lo = 0; 1206 sctx.shift_hi = 0; 1207 sctx.shift_fraction = 0; 1208 1209 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1210 if (info->shifttype==MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1211 sctx.shift_top = 0; 1212 for (i=0; i<n; i++) { 1213 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1214 rs = 0.0; 1215 ajtmp = aj + ai[i]; 1216 rtmp1 = aa + ai[i]; 1217 nz = ai[i+1] - ai[i]; 1218 for (j=0; j<nz; j++){ 1219 if (*ajtmp != i){ 1220 rs += PetscAbsScalar(*rtmp1++); 1221 } else { 1222 rs -= PetscRealPart(*rtmp1++); 1223 } 1224 ajtmp++; 1225 } 1226 if (rs>sctx.shift_top) sctx.shift_top = rs; 1227 } 1228 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1229 sctx.shift_top *= 1.1; 1230 sctx.nshift_max = 5; 1231 sctx.shift_lo = 0.; 1232 sctx.shift_hi = 1.; 1233 } 1234 sctx.shift_amount = 0; 1235 sctx.nshift = 0; 1236 1237 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1238 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1239 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1240 ierr = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr); 1241 ierr = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1242 ics = ic ; 1243 rtmp22 = rtmp11 + n; 1244 rtmp33 = rtmp22 + n; 1245 1246 node_max = a->inode.node_count; 1247 ns = a->inode.size; 1248 if (!ns){ 1249 SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information"); 1250 } 1251 1252 /* If max inode size > 3, split it into two inodes.*/ 1253 /* also map the inode sizes according to the ordering */ 1254 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1255 for (i=0,j=0; i<node_max; ++i,++j){ 1256 if (ns[i]>3) { 1257 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1258 ++j; 1259 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1260 } else { 1261 tmp_vec1[j] = ns[i]; 1262 } 1263 } 1264 /* Use the correct node_max */ 1265 node_max = j; 1266 1267 /* Now reorder the inode info based on mat re-ordering info */ 1268 /* First create a row -> inode_size_array_index map */ 1269 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1270 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1271 for (i=0,row=0; i<node_max; i++) { 1272 nodesz = tmp_vec1[i]; 1273 for (j=0; j<nodesz; j++,row++) { 1274 nsmap[row] = i; 1275 } 1276 } 1277 /* Using nsmap, create a reordered ns structure */ 1278 for (i=0,j=0; i< node_max; i++) { 1279 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1280 tmp_vec2[i] = nodesz; 1281 j += nodesz; 1282 } 1283 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1284 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1285 /* Now use the correct ns */ 1286 ns = tmp_vec2; 1287 1288 do { 1289 sctx.useshift = PETSC_FALSE; 1290 /* Now loop over each block-row, and do the factorization */ 1291 for (i=0,row=0; i<node_max; i++) { 1292 nodesz = ns[i]; 1293 nz = bi[row+1] - bi[row]; 1294 bjtmp = bj + bi[row]; 1295 1296 switch (nodesz){ 1297 case 1: 1298 for (j=0; j<nz; j++){ 1299 idx = bjtmp[j]; 1300 rtmp11[idx] = 0.0; 1301 } 1302 1303 /* load in initial (unfactored row) */ 1304 idx = r[row]; 1305 nz_tmp = ai[idx+1] - ai[idx]; 1306 ajtmp = aj + ai[idx]; 1307 v1 = aa + ai[idx]; 1308 1309 for (j=0; j<nz_tmp; j++) { 1310 idx = ics[ajtmp[j]]; 1311 rtmp11[idx] = v1[j]; 1312 } 1313 rtmp11[ics[r[row]]] += sctx.shift_amount; 1314 1315 prow = *bjtmp++ ; 1316 while (prow < row) { 1317 pc1 = rtmp11 + prow; 1318 if (*pc1 != 0.0){ 1319 pv = ba + bd[prow]; 1320 pj = nbj + bd[prow]; 1321 mul1 = *pc1 * *pv++; 1322 *pc1 = mul1; 1323 nz_tmp = bi[prow+1] - bd[prow] - 1; 1324 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 1325 for (j=0; j<nz_tmp; j++) { 1326 tmp = pv[j]; 1327 idx = pj[j]; 1328 rtmp11[idx] -= mul1 * tmp; 1329 } 1330 } 1331 prow = *bjtmp++ ; 1332 } 1333 pj = bj + bi[row]; 1334 pc1 = ba + bi[row]; 1335 1336 sctx.pv = rtmp11[row]; 1337 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 1338 rs = 0.0; 1339 for (j=0; j<nz; j++) { 1340 idx = pj[j]; 1341 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 1342 if (idx != row) rs += PetscAbsScalar(pc1[j]); 1343 } 1344 sctx.rs = rs; 1345 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1346 if (newshift == 1) goto endofwhile; 1347 break; 1348 1349 case 2: 1350 for (j=0; j<nz; j++) { 1351 idx = bjtmp[j]; 1352 rtmp11[idx] = 0.0; 1353 rtmp22[idx] = 0.0; 1354 } 1355 1356 /* load in initial (unfactored row) */ 1357 idx = r[row]; 1358 nz_tmp = ai[idx+1] - ai[idx]; 1359 ajtmp = aj + ai[idx]; 1360 v1 = aa + ai[idx]; 1361 v2 = aa + ai[idx+1]; 1362 for (j=0; j<nz_tmp; j++) { 1363 idx = ics[ajtmp[j]]; 1364 rtmp11[idx] = v1[j]; 1365 rtmp22[idx] = v2[j]; 1366 } 1367 rtmp11[ics[r[row]]] += sctx.shift_amount; 1368 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 1369 1370 prow = *bjtmp++ ; 1371 while (prow < row) { 1372 pc1 = rtmp11 + prow; 1373 pc2 = rtmp22 + prow; 1374 if (*pc1 != 0.0 || *pc2 != 0.0){ 1375 pv = ba + bd[prow]; 1376 pj = nbj + bd[prow]; 1377 mul1 = *pc1 * *pv; 1378 mul2 = *pc2 * *pv; 1379 ++pv; 1380 *pc1 = mul1; 1381 *pc2 = mul2; 1382 1383 nz_tmp = bi[prow+1] - bd[prow] - 1; 1384 for (j=0; j<nz_tmp; j++) { 1385 tmp = pv[j]; 1386 idx = pj[j]; 1387 rtmp11[idx] -= mul1 * tmp; 1388 rtmp22[idx] -= mul2 * tmp; 1389 } 1390 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 1391 } 1392 prow = *bjtmp++ ; 1393 } 1394 1395 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 1396 pc1 = rtmp11 + prow; 1397 pc2 = rtmp22 + prow; 1398 1399 sctx.pv = *pc1; 1400 pj = bj + bi[prow]; 1401 rs = 0.0; 1402 for (j=0; j<nz; j++){ 1403 idx = pj[j]; 1404 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 1405 } 1406 sctx.rs = rs; 1407 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1408 if (newshift == 1) goto endofwhile; 1409 1410 if (*pc2 != 0.0){ 1411 pj = nbj + bd[prow]; 1412 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 1413 *pc2 = mul2; 1414 nz_tmp = bi[prow+1] - bd[prow] - 1; 1415 for (j=0; j<nz_tmp; j++) { 1416 idx = pj[j] ; 1417 tmp = rtmp11[idx]; 1418 rtmp22[idx] -= mul2 * tmp; 1419 } 1420 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 1421 } 1422 1423 pj = bj + bi[row]; 1424 pc1 = ba + bi[row]; 1425 pc2 = ba + bi[row+1]; 1426 1427 sctx.pv = rtmp22[row+1]; 1428 rs = 0.0; 1429 rtmp11[row] = 1.0/rtmp11[row]; 1430 rtmp22[row+1] = 1.0/rtmp22[row+1]; 1431 /* copy row entries from dense representation to sparse */ 1432 for (j=0; j<nz; j++) { 1433 idx = pj[j]; 1434 pc1[j] = rtmp11[idx]; 1435 pc2[j] = rtmp22[idx]; 1436 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 1437 } 1438 sctx.rs = rs; 1439 ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr); 1440 if (newshift == 1) goto endofwhile; 1441 break; 1442 1443 case 3: 1444 for (j=0; j<nz; j++) { 1445 idx = bjtmp[j]; 1446 rtmp11[idx] = 0.0; 1447 rtmp22[idx] = 0.0; 1448 rtmp33[idx] = 0.0; 1449 } 1450 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 1451 idx = r[row]; 1452 nz_tmp = ai[idx+1] - ai[idx]; 1453 ajtmp = aj + ai[idx]; 1454 v1 = aa + ai[idx]; 1455 v2 = aa + ai[idx+1]; 1456 v3 = aa + ai[idx+2]; 1457 for (j=0; j<nz_tmp; j++) { 1458 idx = ics[ajtmp[j]]; 1459 rtmp11[idx] = v1[j]; 1460 rtmp22[idx] = v2[j]; 1461 rtmp33[idx] = v3[j]; 1462 } 1463 rtmp11[ics[r[row]]] += sctx.shift_amount; 1464 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 1465 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 1466 1467 /* loop over all pivot row blocks above this row block */ 1468 prow = *bjtmp++ ; 1469 while (prow < row) { 1470 pc1 = rtmp11 + prow; 1471 pc2 = rtmp22 + prow; 1472 pc3 = rtmp33 + prow; 1473 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){ 1474 pv = ba + bd[prow]; 1475 pj = nbj + bd[prow]; 1476 mul1 = *pc1 * *pv; 1477 mul2 = *pc2 * *pv; 1478 mul3 = *pc3 * *pv; 1479 ++pv; 1480 *pc1 = mul1; 1481 *pc2 = mul2; 1482 *pc3 = mul3; 1483 1484 nz_tmp = bi[prow+1] - bd[prow] - 1; 1485 /* update this row based on pivot row */ 1486 for (j=0; j<nz_tmp; j++) { 1487 tmp = pv[j]; 1488 idx = pj[j]; 1489 rtmp11[idx] -= mul1 * tmp; 1490 rtmp22[idx] -= mul2 * tmp; 1491 rtmp33[idx] -= mul3 * tmp; 1492 } 1493 ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr); 1494 } 1495 prow = *bjtmp++ ; 1496 } 1497 1498 /* Now take care of diagonal 3x3 block in this set of rows */ 1499 /* note: prow = row here */ 1500 pc1 = rtmp11 + prow; 1501 pc2 = rtmp22 + prow; 1502 pc3 = rtmp33 + prow; 1503 1504 sctx.pv = *pc1; 1505 pj = bj + bi[prow]; 1506 rs = 0.0; 1507 for (j=0; j<nz; j++){ 1508 idx = pj[j]; 1509 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 1510 } 1511 sctx.rs = rs; 1512 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1513 if (newshift == 1) goto endofwhile; 1514 1515 if (*pc2 != 0.0 || *pc3 != 0.0){ 1516 mul2 = (*pc2)/(*pc1); 1517 mul3 = (*pc3)/(*pc1); 1518 *pc2 = mul2; 1519 *pc3 = mul3; 1520 nz_tmp = bi[prow+1] - bd[prow] - 1; 1521 pj = nbj + bd[prow]; 1522 for (j=0; j<nz_tmp; j++) { 1523 idx = pj[j] ; 1524 tmp = rtmp11[idx]; 1525 rtmp22[idx] -= mul2 * tmp; 1526 rtmp33[idx] -= mul3 * tmp; 1527 } 1528 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 1529 } 1530 ++prow; 1531 1532 pc2 = rtmp22 + prow; 1533 pc3 = rtmp33 + prow; 1534 sctx.pv = *pc2; 1535 pj = bj + bi[prow]; 1536 rs = 0.0; 1537 for (j=0; j<nz; j++){ 1538 idx = pj[j]; 1539 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 1540 } 1541 sctx.rs = rs; 1542 ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr); 1543 if (newshift == 1) goto endofwhile; 1544 1545 if (*pc3 != 0.0){ 1546 mul3 = (*pc3)/(*pc2); 1547 *pc3 = mul3; 1548 pj = nbj + bd[prow]; 1549 nz_tmp = bi[prow+1] - bd[prow] - 1; 1550 for (j=0; j<nz_tmp; j++) { 1551 idx = pj[j] ; 1552 tmp = rtmp22[idx]; 1553 rtmp33[idx] -= mul3 * tmp; 1554 } 1555 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 1556 } 1557 1558 pj = bj + bi[row]; 1559 pc1 = ba + bi[row]; 1560 pc2 = ba + bi[row+1]; 1561 pc3 = ba + bi[row+2]; 1562 1563 sctx.pv = rtmp33[row+2]; 1564 rs = 0.0; 1565 rtmp11[row] = 1.0/rtmp11[row]; 1566 rtmp22[row+1] = 1.0/rtmp22[row+1]; 1567 rtmp33[row+2] = 1.0/rtmp33[row+2]; 1568 /* copy row entries from dense representation to sparse */ 1569 for (j=0; j<nz; j++) { 1570 idx = pj[j]; 1571 pc1[j] = rtmp11[idx]; 1572 pc2[j] = rtmp22[idx]; 1573 pc3[j] = rtmp33[idx]; 1574 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 1575 } 1576 1577 sctx.rs = rs; 1578 ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr); 1579 if (newshift == 1) goto endofwhile; 1580 break; 1581 1582 default: 1583 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1584 } 1585 row += nodesz; /* Update the row */ 1586 } 1587 endofwhile:; 1588 } while (sctx.useshift); 1589 ierr = PetscFree(rtmp11);CHKERRQ(ierr); 1590 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1591 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1592 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1593 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 1594 (B)->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 1595 /* do not set solve add, since MatSolve_Inode + Add is faster */ 1596 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 1597 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 1598 C->assembled = PETSC_TRUE; 1599 C->preallocated = PETSC_TRUE; 1600 if (sctx.nshift) { 1601 if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) { 1602 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 1603 } else if (info->shifttype == MAT_SHIFT_NONZERO) { 1604 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 1605 } 1606 } 1607 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 1608 PetscFunctionReturn(0); 1609 } 1610 1611 1612 /* ----------------------------------------------------------- */ 1613 #undef __FUNCT__ 1614 #define __FUNCT__ "MatSolve_SeqAIJ_Inode" 1615 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 1616 { 1617 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1618 IS iscol = a->col,isrow = a->row; 1619 PetscErrorCode ierr; 1620 const PetscInt *r,*c,*rout,*cout; 1621 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 1622 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 1623 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 1624 PetscScalar sum1,sum2,sum3,sum4,sum5; 1625 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 1626 const PetscScalar *b; 1627 1628 PetscFunctionBegin; 1629 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 1630 node_max = a->inode.node_count; 1631 ns = a->inode.size; /* Node Size array */ 1632 1633 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1634 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1635 tmp = a->solve_work; 1636 1637 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1638 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1639 1640 /* forward solve the lower triangular */ 1641 tmps = tmp ; 1642 aa = a_a ; 1643 aj = a_j ; 1644 ad = a->diag; 1645 1646 for (i = 0,row = 0; i< node_max; ++i){ 1647 nsz = ns[i]; 1648 aii = ai[row]; 1649 v1 = aa + aii; 1650 vi = aj + aii; 1651 nz = ai[row+1]- ai[row]; 1652 1653 if (i < node_max-1) { 1654 /* Prefetch the indices for the next block */ 1655 PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,0); /* indices */ 1656 /* Prefetch the data for the next block */ 1657 PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,0); 1658 } 1659 1660 switch (nsz){ /* Each loop in 'case' is unrolled */ 1661 case 1 : 1662 sum1 = b[r[row]]; 1663 for(j=0; j<nz-1; j+=2){ 1664 i0 = vi[j]; 1665 i1 = vi[j+1]; 1666 tmp0 = tmps[i0]; 1667 tmp1 = tmps[i1]; 1668 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1; 1669 } 1670 if(j == nz-1){ 1671 tmp0 = tmps[vi[j]]; 1672 sum1 -= v1[j]*tmp0; 1673 } 1674 tmp[row++]=sum1; 1675 break; 1676 case 2: 1677 sum1 = b[r[row]]; 1678 sum2 = b[r[row+1]]; 1679 v2 = aa + ai[row+1]; 1680 1681 for(j=0; j<nz-1; j+=2){ 1682 i0 = vi[j]; 1683 i1 = vi[j+1]; 1684 tmp0 = tmps[i0]; 1685 tmp1 = tmps[i1]; 1686 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1687 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 1688 } 1689 if(j == nz-1){ 1690 tmp0 = tmps[vi[j]]; 1691 sum1 -= v1[j] *tmp0; 1692 sum2 -= v2[j] *tmp0; 1693 } 1694 sum2 -= v2[nz] * sum1; 1695 tmp[row ++]=sum1; 1696 tmp[row ++]=sum2; 1697 break; 1698 case 3: 1699 sum1 = b[r[row]]; 1700 sum2 = b[r[row+1]]; 1701 sum3 = b[r[row+2]]; 1702 v2 = aa + ai[row+1]; 1703 v3 = aa + ai[row+2]; 1704 1705 for (j=0; j<nz-1; j+=2){ 1706 i0 = vi[j]; 1707 i1 = vi[j+1]; 1708 tmp0 = tmps[i0]; 1709 tmp1 = tmps[i1]; 1710 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1711 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 1712 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 1713 } 1714 if (j == nz-1){ 1715 tmp0 = tmps[vi[j]]; 1716 sum1 -= v1[j] *tmp0; 1717 sum2 -= v2[j] *tmp0; 1718 sum3 -= v3[j] *tmp0; 1719 } 1720 sum2 -= v2[nz] * sum1; 1721 sum3 -= v3[nz] * sum1; 1722 sum3 -= v3[nz+1] * sum2; 1723 tmp[row ++]=sum1; 1724 tmp[row ++]=sum2; 1725 tmp[row ++]=sum3; 1726 break; 1727 1728 case 4: 1729 sum1 = b[r[row]]; 1730 sum2 = b[r[row+1]]; 1731 sum3 = b[r[row+2]]; 1732 sum4 = b[r[row+3]]; 1733 v2 = aa + ai[row+1]; 1734 v3 = aa + ai[row+2]; 1735 v4 = aa + ai[row+3]; 1736 1737 for (j=0; j<nz-1; j+=2){ 1738 i0 = vi[j]; 1739 i1 = vi[j+1]; 1740 tmp0 = tmps[i0]; 1741 tmp1 = tmps[i1]; 1742 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1743 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 1744 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 1745 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 1746 } 1747 if (j == nz-1){ 1748 tmp0 = tmps[vi[j]]; 1749 sum1 -= v1[j] *tmp0; 1750 sum2 -= v2[j] *tmp0; 1751 sum3 -= v3[j] *tmp0; 1752 sum4 -= v4[j] *tmp0; 1753 } 1754 sum2 -= v2[nz] * sum1; 1755 sum3 -= v3[nz] * sum1; 1756 sum4 -= v4[nz] * sum1; 1757 sum3 -= v3[nz+1] * sum2; 1758 sum4 -= v4[nz+1] * sum2; 1759 sum4 -= v4[nz+2] * sum3; 1760 1761 tmp[row ++]=sum1; 1762 tmp[row ++]=sum2; 1763 tmp[row ++]=sum3; 1764 tmp[row ++]=sum4; 1765 break; 1766 case 5: 1767 sum1 = b[r[row]]; 1768 sum2 = b[r[row+1]]; 1769 sum3 = b[r[row+2]]; 1770 sum4 = b[r[row+3]]; 1771 sum5 = b[r[row+4]]; 1772 v2 = aa + ai[row+1]; 1773 v3 = aa + ai[row+2]; 1774 v4 = aa + ai[row+3]; 1775 v5 = aa + ai[row+4]; 1776 1777 for (j=0; j<nz-1; j+=2){ 1778 i0 = vi[j]; 1779 i1 = vi[j+1]; 1780 tmp0 = tmps[i0]; 1781 tmp1 = tmps[i1]; 1782 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1783 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 1784 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 1785 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 1786 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1; 1787 } 1788 if (j == nz-1){ 1789 tmp0 = tmps[vi[j]]; 1790 sum1 -= v1[j] *tmp0; 1791 sum2 -= v2[j] *tmp0; 1792 sum3 -= v3[j] *tmp0; 1793 sum4 -= v4[j] *tmp0; 1794 sum5 -= v5[j] *tmp0; 1795 } 1796 1797 sum2 -= v2[nz] * sum1; 1798 sum3 -= v3[nz] * sum1; 1799 sum4 -= v4[nz] * sum1; 1800 sum5 -= v5[nz] * sum1; 1801 sum3 -= v3[nz+1] * sum2; 1802 sum4 -= v4[nz+1] * sum2; 1803 sum5 -= v5[nz+1] * sum2; 1804 sum4 -= v4[nz+2] * sum3; 1805 sum5 -= v5[nz+2] * sum3; 1806 sum5 -= v5[nz+3] * sum4; 1807 1808 tmp[row ++]=sum1; 1809 tmp[row ++]=sum2; 1810 tmp[row ++]=sum3; 1811 tmp[row ++]=sum4; 1812 tmp[row ++]=sum5; 1813 break; 1814 default: 1815 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1816 } 1817 } 1818 /* backward solve the upper triangular */ 1819 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 1820 nsz = ns[i]; 1821 aii = ad[row+1] + 1; 1822 v1 = aa + aii; 1823 vi = aj + aii; 1824 nz = ad[row]- ad[row+1] - 1; 1825 1826 if (i > 0) { 1827 /* Prefetch the indices for the next block */ 1828 PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,0); /* indices */ 1829 /* Prefetch the data for the next block */ 1830 PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,0); 1831 } 1832 1833 switch (nsz){ /* Each loop in 'case' is unrolled */ 1834 case 1 : 1835 sum1 = tmp[row]; 1836 1837 for(j=0 ; j<nz-1; j+=2){ 1838 i0 = vi[j]; 1839 i1 = vi[j+1]; 1840 tmp0 = tmps[i0]; 1841 tmp1 = tmps[i1]; 1842 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1843 } 1844 if (j == nz-1){ 1845 tmp0 = tmps[vi[j]]; 1846 sum1 -= v1[j]*tmp0; 1847 } 1848 x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 1849 break; 1850 case 2 : 1851 sum1 = tmp[row]; 1852 sum2 = tmp[row-1]; 1853 v2 = aa + ad[row] + 1; 1854 for (j=0 ; j<nz-1; j+=2){ 1855 i0 = vi[j]; 1856 i1 = vi[j+1]; 1857 tmp0 = tmps[i0]; 1858 tmp1 = tmps[i1]; 1859 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1860 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 1861 } 1862 if (j == nz-1){ 1863 tmp0 = tmps[vi[j]]; 1864 sum1 -= v1[j]* tmp0; 1865 sum2 -= v2[j+1]* tmp0; 1866 } 1867 1868 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 1869 sum2 -= v2[0] * tmp0; 1870 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 1871 break; 1872 case 3 : 1873 sum1 = tmp[row]; 1874 sum2 = tmp[row -1]; 1875 sum3 = tmp[row -2]; 1876 v2 = aa + ad[row] + 1; 1877 v3 = aa + ad[row -1] + 1; 1878 for (j=0 ; j<nz-1; j+=2){ 1879 i0 = vi[j]; 1880 i1 = vi[j+1]; 1881 tmp0 = tmps[i0]; 1882 tmp1 = tmps[i1]; 1883 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1884 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 1885 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 1886 } 1887 if (j== nz-1){ 1888 tmp0 = tmps[vi[j]]; 1889 sum1 -= v1[j] * tmp0; 1890 sum2 -= v2[j+1] * tmp0; 1891 sum3 -= v3[j+2] * tmp0; 1892 } 1893 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 1894 sum2 -= v2[0]* tmp0; 1895 sum3 -= v3[1] * tmp0; 1896 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 1897 sum3 -= v3[0]* tmp0; 1898 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 1899 1900 break; 1901 case 4 : 1902 sum1 = tmp[row]; 1903 sum2 = tmp[row -1]; 1904 sum3 = tmp[row -2]; 1905 sum4 = tmp[row -3]; 1906 v2 = aa + ad[row]+1; 1907 v3 = aa + ad[row -1]+1; 1908 v4 = aa + ad[row -2]+1; 1909 1910 for (j=0 ; j<nz-1; j+=2){ 1911 i0 = vi[j]; 1912 i1 = vi[j+1]; 1913 tmp0 = tmps[i0]; 1914 tmp1 = tmps[i1]; 1915 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1916 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 1917 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 1918 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 1919 } 1920 if (j== nz-1){ 1921 tmp0 = tmps[vi[j]]; 1922 sum1 -= v1[j] * tmp0; 1923 sum2 -= v2[j+1] * tmp0; 1924 sum3 -= v3[j+2] * tmp0; 1925 sum4 -= v4[j+3] * tmp0; 1926 } 1927 1928 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 1929 sum2 -= v2[0] * tmp0; 1930 sum3 -= v3[1] * tmp0; 1931 sum4 -= v4[2] * tmp0; 1932 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 1933 sum3 -= v3[0] * tmp0; 1934 sum4 -= v4[1] * tmp0; 1935 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 1936 sum4 -= v4[0] * tmp0; 1937 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 1938 break; 1939 case 5 : 1940 sum1 = tmp[row]; 1941 sum2 = tmp[row -1]; 1942 sum3 = tmp[row -2]; 1943 sum4 = tmp[row -3]; 1944 sum5 = tmp[row -4]; 1945 v2 = aa + ad[row]+1; 1946 v3 = aa + ad[row -1]+1; 1947 v4 = aa + ad[row -2]+1; 1948 v5 = aa + ad[row -3]+1; 1949 for (j=0 ; j<nz-1; j+=2){ 1950 i0 = vi[j]; 1951 i1 = vi[j+1]; 1952 tmp0 = tmps[i0]; 1953 tmp1 = tmps[i1]; 1954 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1955 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 1956 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 1957 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 1958 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1; 1959 } 1960 if (j==nz-1){ 1961 tmp0 = tmps[vi[j]]; 1962 sum1 -= v1[j] * tmp0; 1963 sum2 -= v2[j+1] * tmp0; 1964 sum3 -= v3[j+2] * tmp0; 1965 sum4 -= v4[j+3] * tmp0; 1966 sum5 -= v5[j+4] * tmp0; 1967 } 1968 1969 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 1970 sum2 -= v2[0] * tmp0; 1971 sum3 -= v3[1] * tmp0; 1972 sum4 -= v4[2] * tmp0; 1973 sum5 -= v5[3] * tmp0; 1974 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 1975 sum3 -= v3[0] * tmp0; 1976 sum4 -= v4[1] * tmp0; 1977 sum5 -= v5[2] * tmp0; 1978 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 1979 sum4 -= v4[0] * tmp0; 1980 sum5 -= v5[1] * tmp0; 1981 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 1982 sum5 -= v5[0] * tmp0; 1983 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--; 1984 break; 1985 default: 1986 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1987 } 1988 } 1989 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1990 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1991 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1992 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1993 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1994 PetscFunctionReturn(0); 1995 } 1996 1997 1998 /* 1999 Makes a longer coloring[] array and calls the usual code with that 2000 */ 2001 #undef __FUNCT__ 2002 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode" 2003 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 2004 { 2005 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 2006 PetscErrorCode ierr; 2007 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 2008 PetscInt *colorused,i; 2009 ISColoringValue *newcolor; 2010 2011 PetscFunctionBegin; 2012 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr); 2013 /* loop over inodes, marking a color for each column*/ 2014 row = 0; 2015 for (i=0; i<m; i++){ 2016 for (j=0; j<ns[i]; j++) { 2017 newcolor[row++] = coloring[i] + j*ncolors; 2018 } 2019 } 2020 2021 /* eliminate unneeded colors */ 2022 ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr); 2023 ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr); 2024 for (i=0; i<n; i++) { 2025 colorused[newcolor[i]] = 1; 2026 } 2027 2028 for (i=1; i<5*ncolors; i++) { 2029 colorused[i] += colorused[i-1]; 2030 } 2031 ncolors = colorused[5*ncolors-1]; 2032 for (i=0; i<n; i++) { 2033 newcolor[i] = colorused[newcolor[i]]-1; 2034 } 2035 ierr = PetscFree(colorused);CHKERRQ(ierr); 2036 ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); 2037 ierr = PetscFree(coloring);CHKERRQ(ierr); 2038 PetscFunctionReturn(0); 2039 } 2040 2041 #include "../src/mat/blockinvert.h" 2042 2043 #undef __FUNCT__ 2044 #define __FUNCT__ "MatSOR_SeqAIJ_Inode" 2045 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 2046 { 2047 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2048 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3; 2049 MatScalar *ibdiag,*bdiag,work[25]; 2050 PetscScalar *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5; 2051 const MatScalar *v = a->a,*v1,*v2,*v3,*v4,*v5; 2052 PetscReal zeropivot = 1.0e-15, shift = 0.0; 2053 PetscErrorCode ierr; 2054 PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2; 2055 PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5]; 2056 2057 PetscFunctionBegin; 2058 if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 2059 if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 2060 if (its > 1) { 2061 /* switch to non-inode version */ 2062 ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr); 2063 PetscFunctionReturn(0); 2064 } 2065 2066 if (!a->inode.ibdiagvalid) { 2067 if (!a->inode.ibdiag) { 2068 /* calculate space needed for diagonal blocks */ 2069 for (i=0; i<m; i++) { 2070 cnt += sizes[i]*sizes[i]; 2071 } 2072 a->inode.bdiagsize = cnt; 2073 ierr = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr); 2074 } 2075 2076 /* copy over the diagonal blocks and invert them */ 2077 ibdiag = a->inode.ibdiag; 2078 bdiag = a->inode.bdiag; 2079 cnt = 0; 2080 for (i=0, row = 0; i<m; i++) { 2081 for (j=0; j<sizes[i]; j++) { 2082 for (k=0; k<sizes[i]; k++) { 2083 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 2084 } 2085 } 2086 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 2087 2088 switch(sizes[i]) { 2089 case 1: 2090 /* Create matrix data structure */ 2091 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 2092 ibdiag[cnt] = 1.0/ibdiag[cnt]; 2093 break; 2094 case 2: 2095 ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr); 2096 break; 2097 case 3: 2098 ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr); 2099 break; 2100 case 4: 2101 ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr); 2102 break; 2103 case 5: 2104 ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr); 2105 break; 2106 default: 2107 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2108 } 2109 cnt += sizes[i]*sizes[i]; 2110 row += sizes[i]; 2111 } 2112 a->inode.ibdiagvalid = PETSC_TRUE; 2113 } 2114 ibdiag = a->inode.ibdiag; 2115 bdiag = a->inode.bdiag; 2116 2117 2118 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2119 if (flag & SOR_ZERO_INITIAL_GUESS) { 2120 PetscScalar *b; 2121 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2122 if (xx != bb) { 2123 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2124 } else { 2125 b = x; 2126 } 2127 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 2128 2129 for (i=0, row=0; i<m; i++) { 2130 sz = diag[row] - ii[row]; 2131 v1 = a->a + ii[row]; 2132 idx = a->j + ii[row]; 2133 2134 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2135 switch (sizes[i]){ 2136 case 1: 2137 2138 sum1 = b[row]; 2139 for(n = 0; n<sz-1; n+=2) { 2140 i1 = idx[0]; 2141 i2 = idx[1]; 2142 idx += 2; 2143 tmp0 = x[i1]; 2144 tmp1 = x[i2]; 2145 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2146 } 2147 2148 if (n == sz-1){ 2149 tmp0 = x[*idx]; 2150 sum1 -= *v1 * tmp0; 2151 } 2152 x[row++] = sum1*(*ibdiag++); 2153 break; 2154 case 2: 2155 v2 = a->a + ii[row+1]; 2156 sum1 = b[row]; 2157 sum2 = b[row+1]; 2158 for(n = 0; n<sz-1; n+=2) { 2159 i1 = idx[0]; 2160 i2 = idx[1]; 2161 idx += 2; 2162 tmp0 = x[i1]; 2163 tmp1 = x[i2]; 2164 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2165 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2166 } 2167 2168 if (n == sz-1){ 2169 tmp0 = x[*idx]; 2170 sum1 -= v1[0] * tmp0; 2171 sum2 -= v2[0] * tmp0; 2172 } 2173 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2174 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2175 ibdiag += 4; 2176 break; 2177 case 3: 2178 v2 = a->a + ii[row+1]; 2179 v3 = a->a + ii[row+2]; 2180 sum1 = b[row]; 2181 sum2 = b[row+1]; 2182 sum3 = b[row+2]; 2183 for(n = 0; n<sz-1; n+=2) { 2184 i1 = idx[0]; 2185 i2 = idx[1]; 2186 idx += 2; 2187 tmp0 = x[i1]; 2188 tmp1 = x[i2]; 2189 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2190 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2191 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2192 } 2193 2194 if (n == sz-1){ 2195 tmp0 = x[*idx]; 2196 sum1 -= v1[0] * tmp0; 2197 sum2 -= v2[0] * tmp0; 2198 sum3 -= v3[0] * tmp0; 2199 } 2200 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2201 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2202 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2203 ibdiag += 9; 2204 break; 2205 case 4: 2206 v2 = a->a + ii[row+1]; 2207 v3 = a->a + ii[row+2]; 2208 v4 = a->a + ii[row+3]; 2209 sum1 = b[row]; 2210 sum2 = b[row+1]; 2211 sum3 = b[row+2]; 2212 sum4 = b[row+3]; 2213 for(n = 0; n<sz-1; n+=2) { 2214 i1 = idx[0]; 2215 i2 = idx[1]; 2216 idx += 2; 2217 tmp0 = x[i1]; 2218 tmp1 = x[i2]; 2219 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2220 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2221 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2222 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2223 } 2224 2225 if (n == sz-1){ 2226 tmp0 = x[*idx]; 2227 sum1 -= v1[0] * tmp0; 2228 sum2 -= v2[0] * tmp0; 2229 sum3 -= v3[0] * tmp0; 2230 sum4 -= v4[0] * tmp0; 2231 } 2232 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2233 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2234 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2235 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2236 ibdiag += 16; 2237 break; 2238 case 5: 2239 v2 = a->a + ii[row+1]; 2240 v3 = a->a + ii[row+2]; 2241 v4 = a->a + ii[row+3]; 2242 v5 = a->a + ii[row+4]; 2243 sum1 = b[row]; 2244 sum2 = b[row+1]; 2245 sum3 = b[row+2]; 2246 sum4 = b[row+3]; 2247 sum5 = b[row+4]; 2248 for(n = 0; n<sz-1; n+=2) { 2249 i1 = idx[0]; 2250 i2 = idx[1]; 2251 idx += 2; 2252 tmp0 = x[i1]; 2253 tmp1 = x[i2]; 2254 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2255 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2256 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2257 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2258 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2259 } 2260 2261 if (n == sz-1){ 2262 tmp0 = x[*idx]; 2263 sum1 -= v1[0] * tmp0; 2264 sum2 -= v2[0] * tmp0; 2265 sum3 -= v3[0] * tmp0; 2266 sum4 -= v4[0] * tmp0; 2267 sum5 -= v5[0] * tmp0; 2268 } 2269 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2270 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2271 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2272 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2273 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2274 ibdiag += 25; 2275 break; 2276 default: 2277 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2278 } 2279 } 2280 2281 xb = x; 2282 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2283 } else xb = b; 2284 if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) && 2285 (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) { 2286 cnt = 0; 2287 for (i=0, row=0; i<m; i++) { 2288 2289 switch (sizes[i]){ 2290 case 1: 2291 x[row++] *= bdiag[cnt++]; 2292 break; 2293 case 2: 2294 x1 = x[row]; x2 = x[row+1]; 2295 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 2296 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 2297 x[row++] = tmp1; 2298 x[row++] = tmp2; 2299 cnt += 4; 2300 break; 2301 case 3: 2302 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 2303 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 2304 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 2305 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 2306 x[row++] = tmp1; 2307 x[row++] = tmp2; 2308 x[row++] = tmp3; 2309 cnt += 9; 2310 break; 2311 case 4: 2312 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 2313 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 2314 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 2315 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 2316 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 2317 x[row++] = tmp1; 2318 x[row++] = tmp2; 2319 x[row++] = tmp3; 2320 x[row++] = tmp4; 2321 cnt += 16; 2322 break; 2323 case 5: 2324 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 2325 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 2326 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 2327 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 2328 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 2329 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 2330 x[row++] = tmp1; 2331 x[row++] = tmp2; 2332 x[row++] = tmp3; 2333 x[row++] = tmp4; 2334 x[row++] = tmp5; 2335 cnt += 25; 2336 break; 2337 default: 2338 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2339 } 2340 } 2341 ierr = PetscLogFlops(m);CHKERRQ(ierr); 2342 } 2343 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 2344 2345 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 2346 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 2347 ibdiag -= sizes[i]*sizes[i]; 2348 sz = ii[row+1] - diag[row] - 1; 2349 v1 = a->a + diag[row] + 1; 2350 idx = a->j + diag[row] + 1; 2351 2352 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2353 switch (sizes[i]){ 2354 case 1: 2355 2356 sum1 = xb[row]; 2357 for(n = 0; n<sz-1; n+=2) { 2358 i1 = idx[0]; 2359 i2 = idx[1]; 2360 idx += 2; 2361 tmp0 = x[i1]; 2362 tmp1 = x[i2]; 2363 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2364 } 2365 2366 if (n == sz-1){ 2367 tmp0 = x[*idx]; 2368 sum1 -= *v1*tmp0; 2369 } 2370 x[row--] = sum1*(*ibdiag); 2371 break; 2372 2373 case 2: 2374 2375 sum1 = xb[row]; 2376 sum2 = xb[row-1]; 2377 /* note that sum1 is associated with the second of the two rows */ 2378 v2 = a->a + diag[row-1] + 2; 2379 for(n = 0; n<sz-1; n+=2) { 2380 i1 = idx[0]; 2381 i2 = idx[1]; 2382 idx += 2; 2383 tmp0 = x[i1]; 2384 tmp1 = x[i2]; 2385 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2386 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2387 } 2388 2389 if (n == sz-1){ 2390 tmp0 = x[*idx]; 2391 sum1 -= *v1*tmp0; 2392 sum2 -= *v2*tmp0; 2393 } 2394 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 2395 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 2396 break; 2397 case 3: 2398 2399 sum1 = xb[row]; 2400 sum2 = xb[row-1]; 2401 sum3 = xb[row-2]; 2402 v2 = a->a + diag[row-1] + 2; 2403 v3 = a->a + diag[row-2] + 3; 2404 for(n = 0; n<sz-1; n+=2) { 2405 i1 = idx[0]; 2406 i2 = idx[1]; 2407 idx += 2; 2408 tmp0 = x[i1]; 2409 tmp1 = x[i2]; 2410 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2411 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2412 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2413 } 2414 2415 if (n == sz-1){ 2416 tmp0 = x[*idx]; 2417 sum1 -= *v1*tmp0; 2418 sum2 -= *v2*tmp0; 2419 sum3 -= *v3*tmp0; 2420 } 2421 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 2422 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 2423 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 2424 break; 2425 case 4: 2426 2427 sum1 = xb[row]; 2428 sum2 = xb[row-1]; 2429 sum3 = xb[row-2]; 2430 sum4 = xb[row-3]; 2431 v2 = a->a + diag[row-1] + 2; 2432 v3 = a->a + diag[row-2] + 3; 2433 v4 = a->a + diag[row-3] + 4; 2434 for(n = 0; n<sz-1; n+=2) { 2435 i1 = idx[0]; 2436 i2 = idx[1]; 2437 idx += 2; 2438 tmp0 = x[i1]; 2439 tmp1 = x[i2]; 2440 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2441 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2442 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2443 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2444 } 2445 2446 if (n == sz-1){ 2447 tmp0 = x[*idx]; 2448 sum1 -= *v1*tmp0; 2449 sum2 -= *v2*tmp0; 2450 sum3 -= *v3*tmp0; 2451 sum4 -= *v4*tmp0; 2452 } 2453 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 2454 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 2455 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 2456 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 2457 break; 2458 case 5: 2459 2460 sum1 = xb[row]; 2461 sum2 = xb[row-1]; 2462 sum3 = xb[row-2]; 2463 sum4 = xb[row-3]; 2464 sum5 = xb[row-4]; 2465 v2 = a->a + diag[row-1] + 2; 2466 v3 = a->a + diag[row-2] + 3; 2467 v4 = a->a + diag[row-3] + 4; 2468 v5 = a->a + diag[row-4] + 5; 2469 for(n = 0; n<sz-1; n+=2) { 2470 i1 = idx[0]; 2471 i2 = idx[1]; 2472 idx += 2; 2473 tmp0 = x[i1]; 2474 tmp1 = x[i2]; 2475 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2476 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2477 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2478 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2479 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2480 } 2481 2482 if (n == sz-1){ 2483 tmp0 = x[*idx]; 2484 sum1 -= *v1*tmp0; 2485 sum2 -= *v2*tmp0; 2486 sum3 -= *v3*tmp0; 2487 sum4 -= *v4*tmp0; 2488 sum5 -= *v5*tmp0; 2489 } 2490 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 2491 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 2492 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 2493 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 2494 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 2495 break; 2496 default: 2497 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2498 } 2499 } 2500 2501 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2502 } 2503 its--; 2504 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2505 if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);} 2506 } 2507 if (flag & SOR_EISENSTAT) { 2508 const PetscScalar *b; 2509 MatScalar *t = a->inode.ssor_work; 2510 2511 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2512 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2513 /* 2514 Apply (U + D)^-1 where D is now the block diagonal 2515 */ 2516 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 2517 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 2518 ibdiag -= sizes[i]*sizes[i]; 2519 sz = ii[row+1] - diag[row] - 1; 2520 v1 = a->a + diag[row] + 1; 2521 idx = a->j + diag[row] + 1; 2522 CHKMEMQ; 2523 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2524 switch (sizes[i]){ 2525 case 1: 2526 2527 sum1 = b[row]; 2528 for(n = 0; n<sz-1; n+=2) { 2529 i1 = idx[0]; 2530 i2 = idx[1]; 2531 idx += 2; 2532 tmp0 = x[i1]; 2533 tmp1 = x[i2]; 2534 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2535 } 2536 2537 if (n == sz-1){ 2538 tmp0 = x[*idx]; 2539 sum1 -= *v1*tmp0; 2540 } 2541 x[row] = sum1*(*ibdiag);row--; 2542 break; 2543 2544 case 2: 2545 2546 sum1 = b[row]; 2547 sum2 = b[row-1]; 2548 /* note that sum1 is associated with the second of the two rows */ 2549 v2 = a->a + diag[row-1] + 2; 2550 for(n = 0; n<sz-1; n+=2) { 2551 i1 = idx[0]; 2552 i2 = idx[1]; 2553 idx += 2; 2554 tmp0 = x[i1]; 2555 tmp1 = x[i2]; 2556 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2557 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2558 } 2559 2560 if (n == sz-1){ 2561 tmp0 = x[*idx]; 2562 sum1 -= *v1*tmp0; 2563 sum2 -= *v2*tmp0; 2564 } 2565 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 2566 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 2567 row -= 2; 2568 break; 2569 case 3: 2570 2571 sum1 = b[row]; 2572 sum2 = b[row-1]; 2573 sum3 = b[row-2]; 2574 v2 = a->a + diag[row-1] + 2; 2575 v3 = a->a + diag[row-2] + 3; 2576 for(n = 0; n<sz-1; n+=2) { 2577 i1 = idx[0]; 2578 i2 = idx[1]; 2579 idx += 2; 2580 tmp0 = x[i1]; 2581 tmp1 = x[i2]; 2582 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2583 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2584 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2585 } 2586 2587 if (n == sz-1){ 2588 tmp0 = x[*idx]; 2589 sum1 -= *v1*tmp0; 2590 sum2 -= *v2*tmp0; 2591 sum3 -= *v3*tmp0; 2592 } 2593 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 2594 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 2595 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 2596 row -= 3; 2597 break; 2598 case 4: 2599 2600 sum1 = b[row]; 2601 sum2 = b[row-1]; 2602 sum3 = b[row-2]; 2603 sum4 = b[row-3]; 2604 v2 = a->a + diag[row-1] + 2; 2605 v3 = a->a + diag[row-2] + 3; 2606 v4 = a->a + diag[row-3] + 4; 2607 for(n = 0; n<sz-1; n+=2) { 2608 i1 = idx[0]; 2609 i2 = idx[1]; 2610 idx += 2; 2611 tmp0 = x[i1]; 2612 tmp1 = x[i2]; 2613 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2614 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2615 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2616 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2617 } 2618 2619 if (n == sz-1){ 2620 tmp0 = x[*idx]; 2621 sum1 -= *v1*tmp0; 2622 sum2 -= *v2*tmp0; 2623 sum3 -= *v3*tmp0; 2624 sum4 -= *v4*tmp0; 2625 } 2626 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 2627 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 2628 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 2629 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 2630 row -= 4; 2631 break; 2632 case 5: 2633 2634 sum1 = b[row]; 2635 sum2 = b[row-1]; 2636 sum3 = b[row-2]; 2637 sum4 = b[row-3]; 2638 sum5 = b[row-4]; 2639 v2 = a->a + diag[row-1] + 2; 2640 v3 = a->a + diag[row-2] + 3; 2641 v4 = a->a + diag[row-3] + 4; 2642 v5 = a->a + diag[row-4] + 5; 2643 for(n = 0; n<sz-1; n+=2) { 2644 i1 = idx[0]; 2645 i2 = idx[1]; 2646 idx += 2; 2647 tmp0 = x[i1]; 2648 tmp1 = x[i2]; 2649 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2650 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2651 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2652 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2653 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2654 } 2655 2656 if (n == sz-1){ 2657 tmp0 = x[*idx]; 2658 sum1 -= *v1*tmp0; 2659 sum2 -= *v2*tmp0; 2660 sum3 -= *v3*tmp0; 2661 sum4 -= *v4*tmp0; 2662 sum5 -= *v5*tmp0; 2663 } 2664 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 2665 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 2666 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 2667 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 2668 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 2669 row -= 5; 2670 break; 2671 default: 2672 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2673 } 2674 CHKMEMQ; 2675 } 2676 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2677 2678 /* 2679 t = b - D x where D is the block diagonal 2680 */ 2681 cnt = 0; 2682 for (i=0, row=0; i<m; i++) { 2683 CHKMEMQ; 2684 switch (sizes[i]){ 2685 case 1: 2686 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 2687 break; 2688 case 2: 2689 x1 = x[row]; x2 = x[row+1]; 2690 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 2691 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 2692 t[row] = b[row] - tmp1; 2693 t[row+1] = b[row+1] - tmp2; row += 2; 2694 cnt += 4; 2695 break; 2696 case 3: 2697 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 2698 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 2699 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 2700 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 2701 t[row] = b[row] - tmp1; 2702 t[row+1] = b[row+1] - tmp2; 2703 t[row+2] = b[row+2] - tmp3; row += 3; 2704 cnt += 9; 2705 break; 2706 case 4: 2707 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 2708 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 2709 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 2710 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 2711 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 2712 t[row] = b[row] - tmp1; 2713 t[row+1] = b[row+1] - tmp2; 2714 t[row+2] = b[row+2] - tmp3; 2715 t[row+3] = b[row+3] - tmp4; row += 4; 2716 cnt += 16; 2717 break; 2718 case 5: 2719 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 2720 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 2721 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 2722 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 2723 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 2724 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 2725 t[row] = b[row] - tmp1; 2726 t[row+1] = b[row+1] - tmp2; 2727 t[row+2] = b[row+2] - tmp3; 2728 t[row+3] = b[row+3] - tmp4; 2729 t[row+4] = b[row+4] - tmp5;row += 5; 2730 cnt += 25; 2731 break; 2732 default: 2733 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2734 } 2735 CHKMEMQ; 2736 } 2737 ierr = PetscLogFlops(m);CHKERRQ(ierr); 2738 2739 2740 2741 /* 2742 Apply (L + D)^-1 where D is the block diagonal 2743 */ 2744 for (i=0, row=0; i<m; i++) { 2745 sz = diag[row] - ii[row]; 2746 v1 = a->a + ii[row]; 2747 idx = a->j + ii[row]; 2748 CHKMEMQ; 2749 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2750 switch (sizes[i]){ 2751 case 1: 2752 2753 sum1 = t[row]; 2754 for(n = 0; n<sz-1; n+=2) { 2755 i1 = idx[0]; 2756 i2 = idx[1]; 2757 idx += 2; 2758 tmp0 = t[i1]; 2759 tmp1 = t[i2]; 2760 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2761 } 2762 2763 if (n == sz-1){ 2764 tmp0 = t[*idx]; 2765 sum1 -= *v1 * tmp0; 2766 } 2767 x[row] += t[row] = sum1*(*ibdiag++); row++; 2768 break; 2769 case 2: 2770 v2 = a->a + ii[row+1]; 2771 sum1 = t[row]; 2772 sum2 = t[row+1]; 2773 for(n = 0; n<sz-1; n+=2) { 2774 i1 = idx[0]; 2775 i2 = idx[1]; 2776 idx += 2; 2777 tmp0 = t[i1]; 2778 tmp1 = t[i2]; 2779 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2780 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2781 } 2782 2783 if (n == sz-1){ 2784 tmp0 = t[*idx]; 2785 sum1 -= v1[0] * tmp0; 2786 sum2 -= v2[0] * tmp0; 2787 } 2788 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2789 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2790 ibdiag += 4; row += 2; 2791 break; 2792 case 3: 2793 v2 = a->a + ii[row+1]; 2794 v3 = a->a + ii[row+2]; 2795 sum1 = t[row]; 2796 sum2 = t[row+1]; 2797 sum3 = t[row+2]; 2798 for(n = 0; n<sz-1; n+=2) { 2799 i1 = idx[0]; 2800 i2 = idx[1]; 2801 idx += 2; 2802 tmp0 = t[i1]; 2803 tmp1 = t[i2]; 2804 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2805 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2806 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2807 } 2808 2809 if (n == sz-1){ 2810 tmp0 = t[*idx]; 2811 sum1 -= v1[0] * tmp0; 2812 sum2 -= v2[0] * tmp0; 2813 sum3 -= v3[0] * tmp0; 2814 } 2815 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2816 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2817 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2818 ibdiag += 9; row += 3; 2819 break; 2820 case 4: 2821 v2 = a->a + ii[row+1]; 2822 v3 = a->a + ii[row+2]; 2823 v4 = a->a + ii[row+3]; 2824 sum1 = t[row]; 2825 sum2 = t[row+1]; 2826 sum3 = t[row+2]; 2827 sum4 = t[row+3]; 2828 for(n = 0; n<sz-1; n+=2) { 2829 i1 = idx[0]; 2830 i2 = idx[1]; 2831 idx += 2; 2832 tmp0 = t[i1]; 2833 tmp1 = t[i2]; 2834 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2835 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2836 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2837 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2838 } 2839 2840 if (n == sz-1){ 2841 tmp0 = t[*idx]; 2842 sum1 -= v1[0] * tmp0; 2843 sum2 -= v2[0] * tmp0; 2844 sum3 -= v3[0] * tmp0; 2845 sum4 -= v4[0] * tmp0; 2846 } 2847 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2848 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2849 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2850 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2851 ibdiag += 16; row += 4; 2852 break; 2853 case 5: 2854 v2 = a->a + ii[row+1]; 2855 v3 = a->a + ii[row+2]; 2856 v4 = a->a + ii[row+3]; 2857 v5 = a->a + ii[row+4]; 2858 sum1 = t[row]; 2859 sum2 = t[row+1]; 2860 sum3 = t[row+2]; 2861 sum4 = t[row+3]; 2862 sum5 = t[row+4]; 2863 for(n = 0; n<sz-1; n+=2) { 2864 i1 = idx[0]; 2865 i2 = idx[1]; 2866 idx += 2; 2867 tmp0 = t[i1]; 2868 tmp1 = t[i2]; 2869 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2870 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2871 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2872 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2873 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2874 } 2875 2876 if (n == sz-1){ 2877 tmp0 = t[*idx]; 2878 sum1 -= v1[0] * tmp0; 2879 sum2 -= v2[0] * tmp0; 2880 sum3 -= v3[0] * tmp0; 2881 sum4 -= v4[0] * tmp0; 2882 sum5 -= v5[0] * tmp0; 2883 } 2884 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2885 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2886 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2887 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2888 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2889 ibdiag += 25; row += 5; 2890 break; 2891 default: 2892 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2893 } 2894 CHKMEMQ; 2895 } 2896 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2897 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2898 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2899 } 2900 PetscFunctionReturn(0); 2901 } 2902 2903 #undef __FUNCT__ 2904 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode" 2905 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 2906 { 2907 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2908 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 2909 const MatScalar *bdiag = a->inode.bdiag; 2910 const PetscScalar *b; 2911 PetscErrorCode ierr; 2912 PetscInt m = a->inode.node_count,cnt = 0,i,row; 2913 const PetscInt *sizes = a->inode.size; 2914 2915 PetscFunctionBegin; 2916 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2917 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2918 cnt = 0; 2919 for (i=0, row=0; i<m; i++) { 2920 switch (sizes[i]){ 2921 case 1: 2922 x[row] = b[row]*bdiag[cnt++];row++; 2923 break; 2924 case 2: 2925 x1 = b[row]; x2 = b[row+1]; 2926 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 2927 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 2928 x[row++] = tmp1; 2929 x[row++] = tmp2; 2930 cnt += 4; 2931 break; 2932 case 3: 2933 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 2934 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 2935 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 2936 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 2937 x[row++] = tmp1; 2938 x[row++] = tmp2; 2939 x[row++] = tmp3; 2940 cnt += 9; 2941 break; 2942 case 4: 2943 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 2944 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 2945 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 2946 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 2947 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 2948 x[row++] = tmp1; 2949 x[row++] = tmp2; 2950 x[row++] = tmp3; 2951 x[row++] = tmp4; 2952 cnt += 16; 2953 break; 2954 case 5: 2955 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 2956 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 2957 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 2958 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 2959 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 2960 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 2961 x[row++] = tmp1; 2962 x[row++] = tmp2; 2963 x[row++] = tmp3; 2964 x[row++] = tmp4; 2965 x[row++] = tmp5; 2966 cnt += 25; 2967 break; 2968 default: 2969 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2970 } 2971 } 2972 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 2973 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2974 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2975 PetscFunctionReturn(0); 2976 } 2977 2978 /* 2979 samestructure indicates that the matrix has not changed its nonzero structure so we 2980 do not need to recompute the inodes 2981 */ 2982 #undef __FUNCT__ 2983 #define __FUNCT__ "Mat_CheckInode" 2984 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure) 2985 { 2986 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2987 PetscErrorCode ierr; 2988 PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size; 2989 PetscTruth flag; 2990 2991 PetscFunctionBegin; 2992 if (!a->inode.use) PetscFunctionReturn(0); 2993 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 2994 2995 2996 m = A->rmap->n; 2997 if (a->inode.size) {ns = a->inode.size;} 2998 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 2999 3000 i = 0; 3001 node_count = 0; 3002 idx = a->j; 3003 ii = a->i; 3004 while (i < m){ /* For each row */ 3005 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 3006 /* Limits the number of elements in a node to 'a->inode.limit' */ 3007 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 3008 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 3009 if (nzy != nzx) break; 3010 idy += nzx; /* Same nonzero pattern */ 3011 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 3012 if (!flag) break; 3013 } 3014 ns[node_count++] = blk_size; 3015 idx += blk_size*nzx; 3016 i = j; 3017 } 3018 /* If not enough inodes found,, do not use inode version of the routines */ 3019 if (!a->inode.size && m && node_count > .9*m) { 3020 ierr = PetscFree(ns);CHKERRQ(ierr); 3021 a->inode.node_count = 0; 3022 a->inode.size = PETSC_NULL; 3023 a->inode.use = PETSC_FALSE; 3024 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 3025 } else { 3026 A->ops->mult = MatMult_SeqAIJ_Inode; 3027 A->ops->sor = MatSOR_SeqAIJ_Inode; 3028 A->ops->multadd = MatMultAdd_SeqAIJ_Inode; 3029 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 3030 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 3031 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 3032 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 3033 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 3034 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 3035 a->inode.node_count = node_count; 3036 a->inode.size = ns; 3037 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 3038 } 3039 PetscFunctionReturn(0); 3040 } 3041 3042 #define MatGetRow_FactoredLU(cols,nzl,nzu,nz,ai,aj,adiag,row) { \ 3043 PetscInt __k, *__vi; \ 3044 __vi = aj + ai[row]; \ 3045 for(__k=0;__k<nzl;__k++) cols[__k] = __vi[__k]; \ 3046 __vi = aj + adiag[row]; \ 3047 cols[nzl] = __vi[0];\ 3048 __vi = aj + adiag[row+1]+1;\ 3049 for(__k=0;__k<nzu;__k++) cols[nzl+1+__k] = __vi[__k];} 3050 3051 3052 /* 3053 Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix. 3054 Modified from Mat_CheckInode(). 3055 3056 Input Parameters: 3057 + Mat A - ILU or LU matrix factor 3058 - samestructure - TURE indicates that the matrix has not changed its nonzero structure so we 3059 do not need to recompute the inodes 3060 */ 3061 #undef __FUNCT__ 3062 #define __FUNCT__ "Mat_CheckInode_FactorLU" 3063 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscTruth samestructure) 3064 { 3065 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3066 PetscErrorCode ierr; 3067 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; 3068 PetscInt *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag; 3069 PetscTruth flag; 3070 3071 PetscFunctionBegin; 3072 if (!a->inode.use) PetscFunctionReturn(0); 3073 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 3074 3075 m = A->rmap->n; 3076 if (a->inode.size) {ns = a->inode.size;} 3077 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 3078 3079 i = 0; 3080 node_count = 0; 3081 while (i < m){ /* For each row */ 3082 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ 3083 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ 3084 nzx = nzl1 + nzu1 + 1; 3085 ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr); 3086 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); 3087 3088 /* Limits the number of elements in a node to 'a->inode.limit' */ 3089 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 3090 nzl2 = ai[j+1] - ai[j]; 3091 nzu2 = adiag[j] - adiag[j+1] - 1; 3092 nzy = nzl2 + nzu2 + 1; 3093 if( nzy != nzx) break; 3094 ierr = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr); 3095 MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j); 3096 ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 3097 if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;} 3098 ierr = PetscFree(cols2);CHKERRQ(ierr); 3099 } 3100 ns[node_count++] = blk_size; 3101 ierr = PetscFree(cols1);CHKERRQ(ierr); 3102 i = j; 3103 } 3104 /* If not enough inodes found,, do not use inode version of the routines */ 3105 if (!a->inode.size && m && node_count > .9*m) { 3106 ierr = PetscFree(ns);CHKERRQ(ierr); 3107 a->inode.node_count = 0; 3108 a->inode.size = PETSC_NULL; 3109 a->inode.use = PETSC_FALSE; 3110 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 3111 } else { 3112 A->ops->mult = 0; 3113 A->ops->sor = 0; 3114 A->ops->multadd = 0; 3115 A->ops->getrowij = 0; 3116 A->ops->restorerowij = 0; 3117 A->ops->getcolumnij = 0; 3118 A->ops->restorecolumnij = 0; 3119 A->ops->coloringpatch = 0; 3120 A->ops->multdiagonalblock = 0; 3121 a->inode.node_count = node_count; 3122 a->inode.size = ns; 3123 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 3124 } 3125 PetscFunctionReturn(0); 3126 } 3127 3128 /* 3129 This is really ugly. if inodes are used this replaces the 3130 permutations with ones that correspond to rows/cols of the matrix 3131 rather then inode blocks 3132 */ 3133 #undef __FUNCT__ 3134 #define __FUNCT__ "MatInodeAdjustForInodes" 3135 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 3136 { 3137 PetscErrorCode ierr,(*f)(Mat,IS*,IS*); 3138 3139 PetscFunctionBegin; 3140 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr); 3141 if (f) { 3142 ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr); 3143 } 3144 PetscFunctionReturn(0); 3145 } 3146 3147 EXTERN_C_BEGIN 3148 #undef __FUNCT__ 3149 #define __FUNCT__ "MatAdjustForInodes_SeqAIJ_Inode" 3150 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 3151 { 3152 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 3153 PetscErrorCode ierr; 3154 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 3155 const PetscInt *ridx,*cidx; 3156 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 3157 PetscInt nslim_col,*ns_col; 3158 IS ris = *rperm,cis = *cperm; 3159 3160 PetscFunctionBegin; 3161 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 3162 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 3163 3164 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 3165 ierr = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 3166 ierr = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr); 3167 3168 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 3169 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 3170 3171 /* Form the inode structure for the rows of permuted matric using inv perm*/ 3172 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 3173 3174 /* Construct the permutations for rows*/ 3175 for (i=0,row = 0; i<nslim_row; ++i){ 3176 indx = ridx[i]; 3177 start_val = tns[indx]; 3178 end_val = tns[indx + 1]; 3179 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 3180 } 3181 3182 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 3183 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 3184 3185 /* Construct permutations for columns */ 3186 for (i=0,col=0; i<nslim_col; ++i){ 3187 indx = cidx[i]; 3188 start_val = tns[indx]; 3189 end_val = tns[indx + 1]; 3190 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 3191 } 3192 3193 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr); 3194 ierr = ISSetPermutation(*rperm);CHKERRQ(ierr); 3195 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr); 3196 ierr = ISSetPermutation(*cperm);CHKERRQ(ierr); 3197 3198 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 3199 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 3200 3201 ierr = PetscFree(ns_col);CHKERRQ(ierr); 3202 ierr = PetscFree2(permr,permc);CHKERRQ(ierr); 3203 ierr = ISDestroy(cis);CHKERRQ(ierr); 3204 ierr = ISDestroy(ris);CHKERRQ(ierr); 3205 ierr = PetscFree(tns);CHKERRQ(ierr); 3206 PetscFunctionReturn(0); 3207 } 3208 EXTERN_C_END 3209 3210 #undef __FUNCT__ 3211 #define __FUNCT__ "MatInodeGetInodeSizes" 3212 /*@C 3213 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 3214 3215 Collective on Mat 3216 3217 Input Parameter: 3218 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 3219 3220 Output Parameter: 3221 + node_count - no of inodes present in the matrix. 3222 . sizes - an array of size node_count,with sizes of each inode. 3223 - limit - the max size used to generate the inodes. 3224 3225 Level: advanced 3226 3227 Notes: This routine returns some internal storage information 3228 of the matrix, it is intended to be used by advanced users. 3229 It should be called after the matrix is assembled. 3230 The contents of the sizes[] array should not be changed. 3231 PETSC_NULL may be passed for information not requested. 3232 3233 .keywords: matrix, seqaij, get, inode 3234 3235 .seealso: MatGetInfo() 3236 @*/ 3237 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 3238 { 3239 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 3240 3241 PetscFunctionBegin; 3242 if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 3243 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr); 3244 if (f) { 3245 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 3246 } 3247 PetscFunctionReturn(0); 3248 } 3249 3250 EXTERN_C_BEGIN 3251 #undef __FUNCT__ 3252 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode" 3253 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 3254 { 3255 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3256 3257 PetscFunctionBegin; 3258 if (node_count) *node_count = a->inode.node_count; 3259 if (sizes) *sizes = a->inode.size; 3260 if (limit) *limit = a->inode.limit; 3261 PetscFunctionReturn(0); 3262 } 3263 EXTERN_C_END 3264