1 #define PETSCMAT_DLL 2 3 /* 4 This file provides high performance routines for the Inode format (compressed sparse row) 5 by taking advantage of rows with identical nonzero structure (I-nodes). 6 */ 7 #include "../src/mat/impls/aij/seq/aij.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "Mat_CreateColInode" 11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns) 12 { 13 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 16 17 PetscFunctionBegin; 18 n = A->cmap->n; 19 m = A->rmap->n; 20 ns_row = a->inode.size; 21 22 min_mn = (m < n) ? m : n; 23 if (!ns) { 24 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++); 25 for(; count+1 < n; count++,i++); 26 if (count < n) { 27 i++; 28 } 29 *size = i; 30 PetscFunctionReturn(0); 31 } 32 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr); 33 34 /* Use the same row structure wherever feasible. */ 35 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 36 ns_col[i] = ns_row[i]; 37 } 38 39 /* if m < n; pad up the remainder with inode_limit */ 40 for(; count+1 < n; count++,i++) { 41 ns_col[i] = 1; 42 } 43 /* The last node is the odd ball. padd it up with the remaining rows; */ 44 if (count < n) { 45 ns_col[i] = n - count; 46 i++; 47 } else if (count > n) { 48 /* Adjust for the over estimation */ 49 ns_col[i-1] += n - count; 50 } 51 *size = i; 52 *ns = ns_col; 53 PetscFunctionReturn(0); 54 } 55 56 57 /* 58 This builds symmetric version of nonzero structure, 59 */ 60 #undef __FUNCT__ 61 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric" 62 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 63 { 64 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 65 PetscErrorCode ierr; 66 PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n; 67 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j; 68 69 PetscFunctionBegin; 70 nslim_row = a->inode.node_count; 71 m = A->rmap->n; 72 n = A->cmap->n; 73 if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 74 75 /* Use the row_inode as column_inode */ 76 nslim_col = nslim_row; 77 ns_col = ns_row; 78 79 /* allocate space for reformated inode structure */ 80 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 81 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 82 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 83 84 for (i1=0,col=0; i1<nslim_col; ++i1){ 85 nsz = ns_col[i1]; 86 for (i2=0; i2<nsz; ++i2,++col) 87 tvc[col] = i1; 88 } 89 /* allocate space for row pointers */ 90 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 91 *iia = ia; 92 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 93 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 94 95 /* determine the number of columns in each row */ 96 ia[0] = oshift; 97 for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) { 98 99 j = aj + ai[row] + ishift; 100 jmax = aj + ai[row+1] + ishift; 101 i2 = 0; 102 col = *j++ + ishift; 103 i2 = tvc[col]; 104 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 105 ia[i1+1]++; 106 ia[i2+1]++; 107 i2++; /* Start col of next node */ 108 while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j; 109 i2 = tvc[col]; 110 } 111 if(i2 == i1) ia[i2+1]++; /* now the diagonal element */ 112 } 113 114 /* shift ia[i] to point to next row */ 115 for (i1=1; i1<nslim_row+1; i1++) { 116 row = ia[i1-1]; 117 ia[i1] += row; 118 work[i1-1] = row - oshift; 119 } 120 121 /* allocate space for column pointers */ 122 nz = ia[nslim_row] + (!ishift); 123 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 124 *jja = ja; 125 126 /* loop over lower triangular part putting into ja */ 127 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 128 j = aj + ai[row] + ishift; 129 jmax = aj + ai[row+1] + ishift; 130 i2 = 0; /* Col inode index */ 131 col = *j++ + ishift; 132 i2 = tvc[col]; 133 while (i2<i1 && j<jmax) { 134 ja[work[i2]++] = i1 + oshift; 135 ja[work[i1]++] = i2 + oshift; 136 ++i2; 137 while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */ 138 i2 = tvc[col]; 139 } 140 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 141 142 } 143 ierr = PetscFree(work);CHKERRQ(ierr); 144 ierr = PetscFree(tns);CHKERRQ(ierr); 145 ierr = PetscFree(tvc);CHKERRQ(ierr); 146 PetscFunctionReturn(0); 147 } 148 149 /* 150 This builds nonsymmetric version of nonzero structure, 151 */ 152 #undef __FUNCT__ 153 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric" 154 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 155 { 156 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 157 PetscErrorCode ierr; 158 PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col; 159 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 160 161 PetscFunctionBegin; 162 nslim_row = a->inode.node_count; 163 n = A->cmap->n; 164 165 /* Create The column_inode for this matrix */ 166 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 167 168 /* allocate space for reformated column_inode structure */ 169 ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 170 ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 171 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 172 173 for (i1=0,col=0; i1<nslim_col; ++i1){ 174 nsz = ns_col[i1]; 175 for (i2=0; i2<nsz; ++i2,++col) 176 tvc[col] = i1; 177 } 178 /* allocate space for row pointers */ 179 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 180 *iia = ia; 181 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 182 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 183 184 /* determine the number of columns in each row */ 185 ia[0] = oshift; 186 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 187 j = aj + ai[row] + ishift; 188 col = *j++ + ishift; 189 i2 = tvc[col]; 190 nz = ai[row+1] - ai[row]; 191 while (nz-- > 0) { /* off-diagonal elemets */ 192 ia[i1+1]++; 193 i2++; /* Start col of next node */ 194 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 195 if (nz > 0) i2 = tvc[col]; 196 } 197 } 198 199 /* shift ia[i] to point to next row */ 200 for (i1=1; i1<nslim_row+1; i1++) { 201 row = ia[i1-1]; 202 ia[i1] += row; 203 work[i1-1] = row - oshift; 204 } 205 206 /* allocate space for column pointers */ 207 nz = ia[nslim_row] + (!ishift); 208 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 209 *jja = ja; 210 211 /* loop over matrix putting into ja */ 212 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 213 j = aj + ai[row] + ishift; 214 i2 = 0; /* Col inode index */ 215 col = *j++ + ishift; 216 i2 = tvc[col]; 217 nz = ai[row+1] - ai[row]; 218 while (nz-- > 0) { 219 ja[work[i1]++] = i2 + oshift; 220 ++i2; 221 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 222 if (nz > 0) i2 = tvc[col]; 223 } 224 } 225 ierr = PetscFree(ns_col);CHKERRQ(ierr); 226 ierr = PetscFree(work);CHKERRQ(ierr); 227 ierr = PetscFree(tns);CHKERRQ(ierr); 228 ierr = PetscFree(tvc);CHKERRQ(ierr); 229 PetscFunctionReturn(0); 230 } 231 232 #undef __FUNCT__ 233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode" 234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 235 { 236 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 237 PetscErrorCode ierr; 238 239 PetscFunctionBegin; 240 *n = a->inode.node_count; 241 if (!ia) PetscFunctionReturn(0); 242 if (!blockcompressed) { 243 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 244 } else if (symmetric) { 245 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 246 } else { 247 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 248 } 249 PetscFunctionReturn(0); 250 } 251 252 #undef __FUNCT__ 253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode" 254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 255 { 256 PetscErrorCode ierr; 257 258 PetscFunctionBegin; 259 if (!ia) PetscFunctionReturn(0); 260 261 if (!blockcompressed) { 262 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 263 } else { 264 ierr = PetscFree(*ia);CHKERRQ(ierr); 265 ierr = PetscFree(*ja);CHKERRQ(ierr); 266 } 267 268 PetscFunctionReturn(0); 269 } 270 271 /* ----------------------------------------------------------- */ 272 273 #undef __FUNCT__ 274 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric" 275 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 276 { 277 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 278 PetscErrorCode ierr; 279 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 280 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 281 282 PetscFunctionBegin; 283 nslim_row = a->inode.node_count; 284 n = A->cmap->n; 285 286 /* Create The column_inode for this matrix */ 287 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 288 289 /* allocate space for reformated column_inode structure */ 290 ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 291 ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 292 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 293 294 for (i1=0,col=0; i1<nslim_col; ++i1){ 295 nsz = ns_col[i1]; 296 for (i2=0; i2<nsz; ++i2,++col) 297 tvc[col] = i1; 298 } 299 /* allocate space for column pointers */ 300 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 301 *iia = ia; 302 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 303 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 304 305 /* determine the number of columns in each row */ 306 ia[0] = oshift; 307 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 308 j = aj + ai[row] + ishift; 309 col = *j++ + ishift; 310 i2 = tvc[col]; 311 nz = ai[row+1] - ai[row]; 312 while (nz-- > 0) { /* off-diagonal elemets */ 313 /* ia[i1+1]++; */ 314 ia[i2+1]++; 315 i2++; 316 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 317 if (nz > 0) i2 = tvc[col]; 318 } 319 } 320 321 /* shift ia[i] to point to next col */ 322 for (i1=1; i1<nslim_col+1; i1++) { 323 col = ia[i1-1]; 324 ia[i1] += col; 325 work[i1-1] = col - oshift; 326 } 327 328 /* allocate space for column pointers */ 329 nz = ia[nslim_col] + (!ishift); 330 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 331 *jja = ja; 332 333 /* loop over matrix putting into ja */ 334 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 335 j = aj + ai[row] + ishift; 336 i2 = 0; /* Col inode index */ 337 col = *j++ + ishift; 338 i2 = tvc[col]; 339 nz = ai[row+1] - ai[row]; 340 while (nz-- > 0) { 341 /* ja[work[i1]++] = i2 + oshift; */ 342 ja[work[i2]++] = i1 + oshift; 343 i2++; 344 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 345 if (nz > 0) i2 = tvc[col]; 346 } 347 } 348 ierr = PetscFree(ns_col);CHKERRQ(ierr); 349 ierr = PetscFree(work);CHKERRQ(ierr); 350 ierr = PetscFree(tns);CHKERRQ(ierr); 351 ierr = PetscFree(tvc);CHKERRQ(ierr); 352 PetscFunctionReturn(0); 353 } 354 355 #undef __FUNCT__ 356 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode" 357 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 358 { 359 PetscErrorCode ierr; 360 361 PetscFunctionBegin; 362 ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr); 363 if (!ia) PetscFunctionReturn(0); 364 365 if (!blockcompressed) { 366 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 367 } else if (symmetric) { 368 /* Since the indices are symmetric it does'nt matter */ 369 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 370 } else { 371 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 372 } 373 PetscFunctionReturn(0); 374 } 375 376 #undef __FUNCT__ 377 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode" 378 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 379 { 380 PetscErrorCode ierr; 381 382 PetscFunctionBegin; 383 if (!ia) PetscFunctionReturn(0); 384 if (!blockcompressed) { 385 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 386 } else { 387 ierr = PetscFree(*ia);CHKERRQ(ierr); 388 ierr = PetscFree(*ja);CHKERRQ(ierr); 389 } 390 PetscFunctionReturn(0); 391 } 392 393 /* ----------------------------------------------------------- */ 394 395 #undef __FUNCT__ 396 #define __FUNCT__ "MatMult_SeqAIJ_Inode" 397 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 398 { 399 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 400 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 401 PetscScalar *y; 402 const PetscScalar *x; 403 const MatScalar *v1,*v2,*v3,*v4,*v5; 404 PetscErrorCode ierr; 405 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0; 406 407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 409 #endif 410 411 PetscFunctionBegin; 412 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 413 node_max = a->inode.node_count; 414 ns = a->inode.size; /* Node Size array */ 415 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 416 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 417 idx = a->j; 418 v1 = a->a; 419 ii = a->i; 420 421 for (i = 0,row = 0; i< node_max; ++i){ 422 nsz = ns[i]; 423 n = ii[1] - ii[0]; 424 nonzerorow += (n>0)*nsz; 425 ii += nsz; 426 PetscPrefetchBlock(idx+nsz*n,n,0,0); /* Prefetch the indices for the block row after the current one */ 427 PetscPrefetchBlock(v1+nsz*n,nsz*n,0,0); /* Prefetch the values for the block row after the current one */ 428 sz = n; /* No of non zeros in this row */ 429 /* Switch on the size of Node */ 430 switch (nsz){ /* Each loop in 'case' is unrolled */ 431 case 1 : 432 sum1 = 0.; 433 434 for(n = 0; n< sz-1; n+=2) { 435 i1 = idx[0]; /* The instructions are ordered to */ 436 i2 = idx[1]; /* make the compiler's job easy */ 437 idx += 2; 438 tmp0 = x[i1]; 439 tmp1 = x[i2]; 440 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 441 } 442 443 if (n == sz-1){ /* Take care of the last nonzero */ 444 tmp0 = x[*idx++]; 445 sum1 += *v1++ * tmp0; 446 } 447 y[row++]=sum1; 448 break; 449 case 2: 450 sum1 = 0.; 451 sum2 = 0.; 452 v2 = v1 + n; 453 454 for (n = 0; n< sz-1; n+=2) { 455 i1 = idx[0]; 456 i2 = idx[1]; 457 idx += 2; 458 tmp0 = x[i1]; 459 tmp1 = x[i2]; 460 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 461 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 462 } 463 if (n == sz-1){ 464 tmp0 = x[*idx++]; 465 sum1 += *v1++ * tmp0; 466 sum2 += *v2++ * tmp0; 467 } 468 y[row++]=sum1; 469 y[row++]=sum2; 470 v1 =v2; /* Since the next block to be processed starts there*/ 471 idx +=sz; 472 break; 473 case 3: 474 sum1 = 0.; 475 sum2 = 0.; 476 sum3 = 0.; 477 v2 = v1 + n; 478 v3 = v2 + n; 479 480 for (n = 0; n< sz-1; n+=2) { 481 i1 = idx[0]; 482 i2 = idx[1]; 483 idx += 2; 484 tmp0 = x[i1]; 485 tmp1 = x[i2]; 486 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 487 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 488 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 489 } 490 if (n == sz-1){ 491 tmp0 = x[*idx++]; 492 sum1 += *v1++ * tmp0; 493 sum2 += *v2++ * tmp0; 494 sum3 += *v3++ * tmp0; 495 } 496 y[row++]=sum1; 497 y[row++]=sum2; 498 y[row++]=sum3; 499 v1 =v3; /* Since the next block to be processed starts there*/ 500 idx +=2*sz; 501 break; 502 case 4: 503 sum1 = 0.; 504 sum2 = 0.; 505 sum3 = 0.; 506 sum4 = 0.; 507 v2 = v1 + n; 508 v3 = v2 + n; 509 v4 = v3 + n; 510 511 for (n = 0; n< sz-1; n+=2) { 512 i1 = idx[0]; 513 i2 = idx[1]; 514 idx += 2; 515 tmp0 = x[i1]; 516 tmp1 = x[i2]; 517 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 518 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 519 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 520 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 521 } 522 if (n == sz-1){ 523 tmp0 = x[*idx++]; 524 sum1 += *v1++ * tmp0; 525 sum2 += *v2++ * tmp0; 526 sum3 += *v3++ * tmp0; 527 sum4 += *v4++ * tmp0; 528 } 529 y[row++]=sum1; 530 y[row++]=sum2; 531 y[row++]=sum3; 532 y[row++]=sum4; 533 v1 =v4; /* Since the next block to be processed starts there*/ 534 idx +=3*sz; 535 break; 536 case 5: 537 sum1 = 0.; 538 sum2 = 0.; 539 sum3 = 0.; 540 sum4 = 0.; 541 sum5 = 0.; 542 v2 = v1 + n; 543 v3 = v2 + n; 544 v4 = v3 + n; 545 v5 = v4 + n; 546 547 for (n = 0; n<sz-1; n+=2) { 548 i1 = idx[0]; 549 i2 = idx[1]; 550 idx += 2; 551 tmp0 = x[i1]; 552 tmp1 = x[i2]; 553 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 554 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 555 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 556 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 557 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 558 } 559 if (n == sz-1){ 560 tmp0 = x[*idx++]; 561 sum1 += *v1++ * tmp0; 562 sum2 += *v2++ * tmp0; 563 sum3 += *v3++ * tmp0; 564 sum4 += *v4++ * tmp0; 565 sum5 += *v5++ * tmp0; 566 } 567 y[row++]=sum1; 568 y[row++]=sum2; 569 y[row++]=sum3; 570 y[row++]=sum4; 571 y[row++]=sum5; 572 v1 =v5; /* Since the next block to be processed starts there */ 573 idx +=4*sz; 574 break; 575 default : 576 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 577 } 578 } 579 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 580 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 581 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 582 PetscFunctionReturn(0); 583 } 584 /* ----------------------------------------------------------- */ 585 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 586 #undef __FUNCT__ 587 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode" 588 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 589 { 590 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 591 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 592 MatScalar *v1,*v2,*v3,*v4,*v5; 593 PetscScalar *x,*y,*z,*zt; 594 PetscErrorCode ierr; 595 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; 596 597 PetscFunctionBegin; 598 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 599 node_max = a->inode.node_count; 600 ns = a->inode.size; /* Node Size array */ 601 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 602 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 603 if (zz != yy) { 604 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 605 } else { 606 z = y; 607 } 608 zt = z; 609 610 idx = a->j; 611 v1 = a->a; 612 ii = a->i; 613 614 for (i = 0,row = 0; i< node_max; ++i){ 615 nsz = ns[i]; 616 n = ii[1] - ii[0]; 617 ii += nsz; 618 sz = n; /* No of non zeros in this row */ 619 /* Switch on the size of Node */ 620 switch (nsz){ /* Each loop in 'case' is unrolled */ 621 case 1 : 622 sum1 = *zt++; 623 624 for(n = 0; n< sz-1; n+=2) { 625 i1 = idx[0]; /* The instructions are ordered to */ 626 i2 = idx[1]; /* make the compiler's job easy */ 627 idx += 2; 628 tmp0 = x[i1]; 629 tmp1 = x[i2]; 630 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 631 } 632 633 if(n == sz-1){ /* Take care of the last nonzero */ 634 tmp0 = x[*idx++]; 635 sum1 += *v1++ * tmp0; 636 } 637 y[row++]=sum1; 638 break; 639 case 2: 640 sum1 = *zt++; 641 sum2 = *zt++; 642 v2 = v1 + n; 643 644 for(n = 0; n< sz-1; n+=2) { 645 i1 = idx[0]; 646 i2 = idx[1]; 647 idx += 2; 648 tmp0 = x[i1]; 649 tmp1 = x[i2]; 650 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 651 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 652 } 653 if(n == sz-1){ 654 tmp0 = x[*idx++]; 655 sum1 += *v1++ * tmp0; 656 sum2 += *v2++ * tmp0; 657 } 658 y[row++]=sum1; 659 y[row++]=sum2; 660 v1 =v2; /* Since the next block to be processed starts there*/ 661 idx +=sz; 662 break; 663 case 3: 664 sum1 = *zt++; 665 sum2 = *zt++; 666 sum3 = *zt++; 667 v2 = v1 + n; 668 v3 = v2 + n; 669 670 for (n = 0; n< sz-1; n+=2) { 671 i1 = idx[0]; 672 i2 = idx[1]; 673 idx += 2; 674 tmp0 = x[i1]; 675 tmp1 = x[i2]; 676 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 677 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 678 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 679 } 680 if (n == sz-1){ 681 tmp0 = x[*idx++]; 682 sum1 += *v1++ * tmp0; 683 sum2 += *v2++ * tmp0; 684 sum3 += *v3++ * tmp0; 685 } 686 y[row++]=sum1; 687 y[row++]=sum2; 688 y[row++]=sum3; 689 v1 =v3; /* Since the next block to be processed starts there*/ 690 idx +=2*sz; 691 break; 692 case 4: 693 sum1 = *zt++; 694 sum2 = *zt++; 695 sum3 = *zt++; 696 sum4 = *zt++; 697 v2 = v1 + n; 698 v3 = v2 + n; 699 v4 = v3 + n; 700 701 for (n = 0; n< sz-1; n+=2) { 702 i1 = idx[0]; 703 i2 = idx[1]; 704 idx += 2; 705 tmp0 = x[i1]; 706 tmp1 = x[i2]; 707 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 708 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 709 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 710 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 711 } 712 if (n == sz-1){ 713 tmp0 = x[*idx++]; 714 sum1 += *v1++ * tmp0; 715 sum2 += *v2++ * tmp0; 716 sum3 += *v3++ * tmp0; 717 sum4 += *v4++ * tmp0; 718 } 719 y[row++]=sum1; 720 y[row++]=sum2; 721 y[row++]=sum3; 722 y[row++]=sum4; 723 v1 =v4; /* Since the next block to be processed starts there*/ 724 idx +=3*sz; 725 break; 726 case 5: 727 sum1 = *zt++; 728 sum2 = *zt++; 729 sum3 = *zt++; 730 sum4 = *zt++; 731 sum5 = *zt++; 732 v2 = v1 + n; 733 v3 = v2 + n; 734 v4 = v3 + n; 735 v5 = v4 + n; 736 737 for (n = 0; n<sz-1; n+=2) { 738 i1 = idx[0]; 739 i2 = idx[1]; 740 idx += 2; 741 tmp0 = x[i1]; 742 tmp1 = x[i2]; 743 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 744 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 745 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 746 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 747 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 748 } 749 if(n == sz-1){ 750 tmp0 = x[*idx++]; 751 sum1 += *v1++ * tmp0; 752 sum2 += *v2++ * tmp0; 753 sum3 += *v3++ * tmp0; 754 sum4 += *v4++ * tmp0; 755 sum5 += *v5++ * tmp0; 756 } 757 y[row++]=sum1; 758 y[row++]=sum2; 759 y[row++]=sum3; 760 y[row++]=sum4; 761 y[row++]=sum5; 762 v1 =v5; /* Since the next block to be processed starts there */ 763 idx +=4*sz; 764 break; 765 default : 766 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 767 } 768 } 769 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 770 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 771 if (zz != yy) { 772 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 773 } 774 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 775 PetscFunctionReturn(0); 776 } 777 778 /* ----------------------------------------------------------- */ 779 #undef __FUNCT__ 780 #define __FUNCT__ "MatSolve_SeqAIJ_Inode" 781 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 782 { 783 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 784 IS iscol = a->col,isrow = a->row; 785 PetscErrorCode ierr; 786 const PetscInt *r,*c,*rout,*cout; 787 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 788 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 789 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 790 PetscScalar sum1,sum2,sum3,sum4,sum5; 791 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 792 const PetscScalar *b; 793 794 PetscFunctionBegin; 795 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 796 node_max = a->inode.node_count; 797 ns = a->inode.size; /* Node Size array */ 798 799 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 800 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 801 tmp = a->solve_work; 802 803 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 804 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 805 806 /* forward solve the lower triangular */ 807 tmps = tmp ; 808 aa = a_a ; 809 aj = a_j ; 810 ad = a->diag; 811 812 for (i = 0,row = 0; i< node_max; ++i){ 813 nsz = ns[i]; 814 aii = ai[row]; 815 v1 = aa + aii; 816 vi = aj + aii; 817 nz = ad[row]- aii; 818 819 switch (nsz){ /* Each loop in 'case' is unrolled */ 820 case 1 : 821 sum1 = b[*r++]; 822 for(j=0; j<nz-1; j+=2){ 823 i0 = vi[0]; 824 i1 = vi[1]; 825 vi +=2; 826 tmp0 = tmps[i0]; 827 tmp1 = tmps[i1]; 828 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 829 } 830 if(j == nz-1){ 831 tmp0 = tmps[*vi++]; 832 sum1 -= *v1++ *tmp0; 833 } 834 tmp[row ++]=sum1; 835 break; 836 case 2: 837 sum1 = b[*r++]; 838 sum2 = b[*r++]; 839 v2 = aa + ai[row+1]; 840 841 for(j=0; j<nz-1; j+=2){ 842 i0 = vi[0]; 843 i1 = vi[1]; 844 vi +=2; 845 tmp0 = tmps[i0]; 846 tmp1 = tmps[i1]; 847 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 848 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 849 } 850 if(j == nz-1){ 851 tmp0 = tmps[*vi++]; 852 sum1 -= *v1++ *tmp0; 853 sum2 -= *v2++ *tmp0; 854 } 855 sum2 -= *v2++ * sum1; 856 tmp[row ++]=sum1; 857 tmp[row ++]=sum2; 858 break; 859 case 3: 860 sum1 = b[*r++]; 861 sum2 = b[*r++]; 862 sum3 = b[*r++]; 863 v2 = aa + ai[row+1]; 864 v3 = aa + ai[row+2]; 865 866 for (j=0; j<nz-1; j+=2){ 867 i0 = vi[0]; 868 i1 = vi[1]; 869 vi +=2; 870 tmp0 = tmps[i0]; 871 tmp1 = tmps[i1]; 872 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 873 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 874 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 875 } 876 if (j == nz-1){ 877 tmp0 = tmps[*vi++]; 878 sum1 -= *v1++ *tmp0; 879 sum2 -= *v2++ *tmp0; 880 sum3 -= *v3++ *tmp0; 881 } 882 sum2 -= *v2++ * sum1; 883 sum3 -= *v3++ * sum1; 884 sum3 -= *v3++ * sum2; 885 tmp[row ++]=sum1; 886 tmp[row ++]=sum2; 887 tmp[row ++]=sum3; 888 break; 889 890 case 4: 891 sum1 = b[*r++]; 892 sum2 = b[*r++]; 893 sum3 = b[*r++]; 894 sum4 = b[*r++]; 895 v2 = aa + ai[row+1]; 896 v3 = aa + ai[row+2]; 897 v4 = aa + ai[row+3]; 898 899 for (j=0; j<nz-1; j+=2){ 900 i0 = vi[0]; 901 i1 = vi[1]; 902 vi +=2; 903 tmp0 = tmps[i0]; 904 tmp1 = tmps[i1]; 905 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 906 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 907 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 908 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 909 } 910 if (j == nz-1){ 911 tmp0 = tmps[*vi++]; 912 sum1 -= *v1++ *tmp0; 913 sum2 -= *v2++ *tmp0; 914 sum3 -= *v3++ *tmp0; 915 sum4 -= *v4++ *tmp0; 916 } 917 sum2 -= *v2++ * sum1; 918 sum3 -= *v3++ * sum1; 919 sum4 -= *v4++ * sum1; 920 sum3 -= *v3++ * sum2; 921 sum4 -= *v4++ * sum2; 922 sum4 -= *v4++ * sum3; 923 924 tmp[row ++]=sum1; 925 tmp[row ++]=sum2; 926 tmp[row ++]=sum3; 927 tmp[row ++]=sum4; 928 break; 929 case 5: 930 sum1 = b[*r++]; 931 sum2 = b[*r++]; 932 sum3 = b[*r++]; 933 sum4 = b[*r++]; 934 sum5 = b[*r++]; 935 v2 = aa + ai[row+1]; 936 v3 = aa + ai[row+2]; 937 v4 = aa + ai[row+3]; 938 v5 = aa + ai[row+4]; 939 940 for (j=0; j<nz-1; j+=2){ 941 i0 = vi[0]; 942 i1 = vi[1]; 943 vi +=2; 944 tmp0 = tmps[i0]; 945 tmp1 = tmps[i1]; 946 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 947 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 948 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 949 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 950 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 951 } 952 if (j == nz-1){ 953 tmp0 = tmps[*vi++]; 954 sum1 -= *v1++ *tmp0; 955 sum2 -= *v2++ *tmp0; 956 sum3 -= *v3++ *tmp0; 957 sum4 -= *v4++ *tmp0; 958 sum5 -= *v5++ *tmp0; 959 } 960 961 sum2 -= *v2++ * sum1; 962 sum3 -= *v3++ * sum1; 963 sum4 -= *v4++ * sum1; 964 sum5 -= *v5++ * sum1; 965 sum3 -= *v3++ * sum2; 966 sum4 -= *v4++ * sum2; 967 sum5 -= *v5++ * sum2; 968 sum4 -= *v4++ * sum3; 969 sum5 -= *v5++ * sum3; 970 sum5 -= *v5++ * sum4; 971 972 tmp[row ++]=sum1; 973 tmp[row ++]=sum2; 974 tmp[row ++]=sum3; 975 tmp[row ++]=sum4; 976 tmp[row ++]=sum5; 977 break; 978 default: 979 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 980 } 981 } 982 /* backward solve the upper triangular */ 983 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 984 nsz = ns[i]; 985 aii = ai[row+1] -1; 986 v1 = aa + aii; 987 vi = aj + aii; 988 nz = aii- ad[row]; 989 switch (nsz){ /* Each loop in 'case' is unrolled */ 990 case 1 : 991 sum1 = tmp[row]; 992 993 for(j=nz ; j>1; j-=2){ 994 vi -=2; 995 i0 = vi[2]; 996 i1 = vi[1]; 997 tmp0 = tmps[i0]; 998 tmp1 = tmps[i1]; 999 v1 -= 2; 1000 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1001 } 1002 if (j==1){ 1003 tmp0 = tmps[*vi--]; 1004 sum1 -= *v1-- * tmp0; 1005 } 1006 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1007 break; 1008 case 2 : 1009 sum1 = tmp[row]; 1010 sum2 = tmp[row -1]; 1011 v2 = aa + ai[row]-1; 1012 for (j=nz ; j>1; j-=2){ 1013 vi -=2; 1014 i0 = vi[2]; 1015 i1 = vi[1]; 1016 tmp0 = tmps[i0]; 1017 tmp1 = tmps[i1]; 1018 v1 -= 2; 1019 v2 -= 2; 1020 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1021 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1022 } 1023 if (j==1){ 1024 tmp0 = tmps[*vi--]; 1025 sum1 -= *v1-- * tmp0; 1026 sum2 -= *v2-- * tmp0; 1027 } 1028 1029 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1030 sum2 -= *v2-- * tmp0; 1031 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1032 break; 1033 case 3 : 1034 sum1 = tmp[row]; 1035 sum2 = tmp[row -1]; 1036 sum3 = tmp[row -2]; 1037 v2 = aa + ai[row]-1; 1038 v3 = aa + ai[row -1]-1; 1039 for (j=nz ; j>1; j-=2){ 1040 vi -=2; 1041 i0 = vi[2]; 1042 i1 = vi[1]; 1043 tmp0 = tmps[i0]; 1044 tmp1 = tmps[i1]; 1045 v1 -= 2; 1046 v2 -= 2; 1047 v3 -= 2; 1048 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1049 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1050 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1051 } 1052 if (j==1){ 1053 tmp0 = tmps[*vi--]; 1054 sum1 -= *v1-- * tmp0; 1055 sum2 -= *v2-- * tmp0; 1056 sum3 -= *v3-- * tmp0; 1057 } 1058 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1059 sum2 -= *v2-- * tmp0; 1060 sum3 -= *v3-- * tmp0; 1061 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1062 sum3 -= *v3-- * tmp0; 1063 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1064 1065 break; 1066 case 4 : 1067 sum1 = tmp[row]; 1068 sum2 = tmp[row -1]; 1069 sum3 = tmp[row -2]; 1070 sum4 = tmp[row -3]; 1071 v2 = aa + ai[row]-1; 1072 v3 = aa + ai[row -1]-1; 1073 v4 = aa + ai[row -2]-1; 1074 1075 for (j=nz ; j>1; j-=2){ 1076 vi -=2; 1077 i0 = vi[2]; 1078 i1 = vi[1]; 1079 tmp0 = tmps[i0]; 1080 tmp1 = tmps[i1]; 1081 v1 -= 2; 1082 v2 -= 2; 1083 v3 -= 2; 1084 v4 -= 2; 1085 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1086 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1087 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1088 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1089 } 1090 if (j==1){ 1091 tmp0 = tmps[*vi--]; 1092 sum1 -= *v1-- * tmp0; 1093 sum2 -= *v2-- * tmp0; 1094 sum3 -= *v3-- * tmp0; 1095 sum4 -= *v4-- * tmp0; 1096 } 1097 1098 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1099 sum2 -= *v2-- * tmp0; 1100 sum3 -= *v3-- * tmp0; 1101 sum4 -= *v4-- * tmp0; 1102 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1103 sum3 -= *v3-- * tmp0; 1104 sum4 -= *v4-- * tmp0; 1105 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1106 sum4 -= *v4-- * tmp0; 1107 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1108 break; 1109 case 5 : 1110 sum1 = tmp[row]; 1111 sum2 = tmp[row -1]; 1112 sum3 = tmp[row -2]; 1113 sum4 = tmp[row -3]; 1114 sum5 = tmp[row -4]; 1115 v2 = aa + ai[row]-1; 1116 v3 = aa + ai[row -1]-1; 1117 v4 = aa + ai[row -2]-1; 1118 v5 = aa + ai[row -3]-1; 1119 for (j=nz ; j>1; j-=2){ 1120 vi -= 2; 1121 i0 = vi[2]; 1122 i1 = vi[1]; 1123 tmp0 = tmps[i0]; 1124 tmp1 = tmps[i1]; 1125 v1 -= 2; 1126 v2 -= 2; 1127 v3 -= 2; 1128 v4 -= 2; 1129 v5 -= 2; 1130 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1131 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1132 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1133 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1134 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1135 } 1136 if (j==1){ 1137 tmp0 = tmps[*vi--]; 1138 sum1 -= *v1-- * tmp0; 1139 sum2 -= *v2-- * tmp0; 1140 sum3 -= *v3-- * tmp0; 1141 sum4 -= *v4-- * tmp0; 1142 sum5 -= *v5-- * tmp0; 1143 } 1144 1145 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1146 sum2 -= *v2-- * tmp0; 1147 sum3 -= *v3-- * tmp0; 1148 sum4 -= *v4-- * tmp0; 1149 sum5 -= *v5-- * tmp0; 1150 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1151 sum3 -= *v3-- * tmp0; 1152 sum4 -= *v4-- * tmp0; 1153 sum5 -= *v5-- * tmp0; 1154 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1155 sum4 -= *v4-- * tmp0; 1156 sum5 -= *v5-- * tmp0; 1157 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1158 sum5 -= *v5-- * tmp0; 1159 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1160 break; 1161 default: 1162 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1163 } 1164 } 1165 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1166 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1167 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1168 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1169 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1170 PetscFunctionReturn(0); 1171 } 1172 1173 /* ----------------------------------------------------------- */ 1174 #undef __FUNCT__ 1175 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_newdatastruct" 1176 PetscErrorCode MatSolve_SeqAIJ_Inode_newdatastruct(Mat A,Vec bb,Vec xx) 1177 { 1178 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1179 IS iscol = a->col,isrow = a->row; 1180 PetscErrorCode ierr; 1181 const PetscInt *r,*c,*rout,*cout; 1182 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 1183 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 1184 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 1185 PetscScalar sum1,sum2,sum3,sum4,sum5; 1186 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 1187 const PetscScalar *b; 1188 1189 PetscFunctionBegin; 1190 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 1191 node_max = a->inode.node_count; 1192 ns = a->inode.size; /* Node Size array */ 1193 1194 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1195 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1196 tmp = a->solve_work; 1197 1198 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1199 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1200 1201 /* forward solve the lower triangular */ 1202 tmps = tmp ; 1203 aa = a_a ; 1204 aj = a_j ; 1205 ad = a->diag; 1206 1207 for (i = 0,row = 0; i< node_max; ++i){ 1208 nsz = ns[i]; 1209 aii = ai[row]; 1210 v1 = aa + aii; 1211 vi = aj + aii; 1212 nz = ai[row+1]- ai[row]; 1213 1214 switch (nsz){ /* Each loop in 'case' is unrolled */ 1215 case 1 : 1216 sum1 = b[r[row]]; 1217 for(j=0; j<nz-1; j+=2){ 1218 i0 = vi[j]; 1219 i1 = vi[j+1]; 1220 tmp0 = tmps[i0]; 1221 tmp1 = tmps[i1]; 1222 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1; 1223 } 1224 if(j == nz-1){ 1225 tmp0 = tmps[vi[j]]; 1226 sum1 -= v1[j]*tmp0; 1227 } 1228 tmp[row++]=sum1; 1229 break; 1230 case 2: 1231 sum1 = b[r[row]]; 1232 sum2 = b[r[row+1]]; 1233 v2 = aa + ai[row+1]; 1234 1235 for(j=0; j<nz-1; j+=2){ 1236 i0 = vi[j]; 1237 i1 = vi[j+1]; 1238 tmp0 = tmps[i0]; 1239 tmp1 = tmps[i1]; 1240 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1241 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 1242 } 1243 if(j == nz-1){ 1244 tmp0 = tmps[vi[j]]; 1245 sum1 -= v1[j] *tmp0; 1246 sum2 -= v2[j] *tmp0; 1247 } 1248 sum2 -= v2[nz] * sum1; 1249 tmp[row ++]=sum1; 1250 tmp[row ++]=sum2; 1251 break; 1252 case 3: 1253 sum1 = b[r[row]]; 1254 sum2 = b[r[row+1]]; 1255 sum3 = b[r[row+2]]; 1256 v2 = aa + ai[row+1]; 1257 v3 = aa + ai[row+2]; 1258 1259 for (j=0; j<nz-1; j+=2){ 1260 i0 = vi[j]; 1261 i1 = vi[j+1]; 1262 tmp0 = tmps[i0]; 1263 tmp1 = tmps[i1]; 1264 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1265 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 1266 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 1267 } 1268 if (j == nz-1){ 1269 tmp0 = tmps[vi[j]]; 1270 sum1 -= v1[j] *tmp0; 1271 sum2 -= v2[j] *tmp0; 1272 sum3 -= v3[j] *tmp0; 1273 } 1274 sum2 -= v2[nz] * sum1; 1275 sum3 -= v3[nz] * sum1; 1276 sum3 -= v3[nz+1] * sum2; 1277 tmp[row ++]=sum1; 1278 tmp[row ++]=sum2; 1279 tmp[row ++]=sum3; 1280 break; 1281 1282 case 4: 1283 sum1 = b[r[row]]; 1284 sum2 = b[r[row+1]]; 1285 sum3 = b[r[row+2]]; 1286 sum4 = b[r[row+3]]; 1287 v2 = aa + ai[row+1]; 1288 v3 = aa + ai[row+2]; 1289 v4 = aa + ai[row+3]; 1290 1291 for (j=0; j<nz-1; j+=2){ 1292 i0 = vi[j]; 1293 i1 = vi[j+1]; 1294 tmp0 = tmps[i0]; 1295 tmp1 = tmps[i1]; 1296 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1297 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 1298 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 1299 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 1300 } 1301 if (j == nz-1){ 1302 tmp0 = tmps[vi[j]]; 1303 sum1 -= v1[j] *tmp0; 1304 sum2 -= v2[j] *tmp0; 1305 sum3 -= v3[j] *tmp0; 1306 sum4 -= v4[j] *tmp0; 1307 } 1308 sum2 -= v2[nz] * sum1; 1309 sum3 -= v3[nz] * sum1; 1310 sum4 -= v4[nz] * sum1; 1311 sum3 -= v3[nz+1] * sum2; 1312 sum4 -= v4[nz+1] * sum2; 1313 sum4 -= v4[nz+2] * sum3; 1314 1315 tmp[row ++]=sum1; 1316 tmp[row ++]=sum2; 1317 tmp[row ++]=sum3; 1318 tmp[row ++]=sum4; 1319 break; 1320 case 5: 1321 sum1 = b[r[row]]; 1322 sum2 = b[r[row+1]]; 1323 sum3 = b[r[row+2]]; 1324 sum4 = b[r[row+3]]; 1325 sum5 = b[r[row+4]]; 1326 v2 = aa + ai[row+1]; 1327 v3 = aa + ai[row+2]; 1328 v4 = aa + ai[row+3]; 1329 v5 = aa + ai[row+4]; 1330 1331 for (j=0; j<nz-1; j+=2){ 1332 i0 = vi[j]; 1333 i1 = vi[j+1]; 1334 tmp0 = tmps[i0]; 1335 tmp1 = tmps[i1]; 1336 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1337 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 1338 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 1339 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 1340 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1; 1341 } 1342 if (j == nz-1){ 1343 tmp0 = tmps[vi[j]]; 1344 sum1 -= v1[j] *tmp0; 1345 sum2 -= v2[j] *tmp0; 1346 sum3 -= v3[j] *tmp0; 1347 sum4 -= v4[j] *tmp0; 1348 sum5 -= v5[j] *tmp0; 1349 } 1350 1351 sum2 -= v2[nz] * sum1; 1352 sum3 -= v3[nz] * sum1; 1353 sum4 -= v4[nz] * sum1; 1354 sum5 -= v5[nz] * sum1; 1355 sum3 -= v3[nz+1] * sum2; 1356 sum4 -= v4[nz+1] * sum2; 1357 sum5 -= v5[nz+1] * sum2; 1358 sum4 -= v4[nz+2] * sum3; 1359 sum5 -= v5[nz+2] * sum3; 1360 sum5 -= v5[nz+3] * sum4; 1361 1362 tmp[row ++]=sum1; 1363 tmp[row ++]=sum2; 1364 tmp[row ++]=sum3; 1365 tmp[row ++]=sum4; 1366 tmp[row ++]=sum5; 1367 break; 1368 default: 1369 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1370 } 1371 } 1372 /* backward solve the upper triangular */ 1373 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 1374 nsz = ns[i]; 1375 aii = ad[row+1] + 1; 1376 v1 = aa + aii; 1377 vi = aj + aii; 1378 nz = ad[row]- ad[row+1] - 1; 1379 switch (nsz){ /* Each loop in 'case' is unrolled */ 1380 case 1 : 1381 sum1 = tmp[row]; 1382 1383 for(j=0 ; j<nz-1; j+=2){ 1384 i0 = vi[j]; 1385 i1 = vi[j+1]; 1386 tmp0 = tmps[i0]; 1387 tmp1 = tmps[i1]; 1388 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1389 } 1390 if (j == nz-1){ 1391 tmp0 = tmps[vi[j]]; 1392 sum1 -= v1[j]*tmp0; 1393 } 1394 x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 1395 break; 1396 case 2 : 1397 sum1 = tmp[row]; 1398 sum2 = tmp[row-1]; 1399 v2 = aa + ad[row] + 1; 1400 for (j=0 ; j<nz-1; j+=2){ 1401 i0 = vi[j]; 1402 i1 = vi[j+1]; 1403 tmp0 = tmps[i0]; 1404 tmp1 = tmps[i1]; 1405 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1406 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 1407 } 1408 if (j == nz-1){ 1409 tmp0 = tmps[vi[j]]; 1410 sum1 -= v1[j]* tmp0; 1411 sum2 -= v2[j+1]* tmp0; 1412 } 1413 1414 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 1415 sum2 -= v2[0] * tmp0; 1416 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 1417 break; 1418 case 3 : 1419 sum1 = tmp[row]; 1420 sum2 = tmp[row -1]; 1421 sum3 = tmp[row -2]; 1422 v2 = aa + ad[row] + 1; 1423 v3 = aa + ad[row -1] + 1; 1424 for (j=0 ; j<nz-1; j+=2){ 1425 i0 = vi[j]; 1426 i1 = vi[j+1]; 1427 tmp0 = tmps[i0]; 1428 tmp1 = tmps[i1]; 1429 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1430 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 1431 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 1432 } 1433 if (j== nz-1){ 1434 tmp0 = tmps[vi[j]]; 1435 sum1 -= v1[j] * tmp0; 1436 sum2 -= v2[j+1] * tmp0; 1437 sum3 -= v3[j+2] * tmp0; 1438 } 1439 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 1440 sum2 -= v2[0]* tmp0; 1441 sum3 -= v3[1] * tmp0; 1442 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 1443 sum3 -= v3[0]* tmp0; 1444 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 1445 1446 break; 1447 case 4 : 1448 sum1 = tmp[row]; 1449 sum2 = tmp[row -1]; 1450 sum3 = tmp[row -2]; 1451 sum4 = tmp[row -3]; 1452 v2 = aa + ad[row]+1; 1453 v3 = aa + ad[row -1]+1; 1454 v4 = aa + ad[row -2]+1; 1455 1456 for (j=0 ; j<nz-1; j+=2){ 1457 i0 = vi[j]; 1458 i1 = vi[j+1]; 1459 tmp0 = tmps[i0]; 1460 tmp1 = tmps[i1]; 1461 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1462 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 1463 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 1464 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 1465 } 1466 if (j== nz-1){ 1467 tmp0 = tmps[vi[j]]; 1468 sum1 -= v1[j] * tmp0; 1469 sum2 -= v2[j+1] * tmp0; 1470 sum3 -= v3[j+2] * tmp0; 1471 sum4 -= v4[j+3] * tmp0; 1472 } 1473 1474 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 1475 sum2 -= v2[0] * tmp0; 1476 sum3 -= v3[1] * tmp0; 1477 sum4 -= v4[2] * tmp0; 1478 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 1479 sum3 -= v3[0] * tmp0; 1480 sum4 -= v4[1] * tmp0; 1481 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 1482 sum4 -= v4[0] * tmp0; 1483 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 1484 break; 1485 case 5 : 1486 sum1 = tmp[row]; 1487 sum2 = tmp[row -1]; 1488 sum3 = tmp[row -2]; 1489 sum4 = tmp[row -3]; 1490 sum5 = tmp[row -4]; 1491 v2 = aa + ad[row]+1; 1492 v3 = aa + ad[row -1]+1; 1493 v4 = aa + ad[row -2]+1; 1494 v5 = aa + ad[row -3]+1; 1495 for (j=0 ; j<nz-1; j+=2){ 1496 i0 = vi[j]; 1497 i1 = vi[j+1]; 1498 tmp0 = tmps[i0]; 1499 tmp1 = tmps[i1]; 1500 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1501 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 1502 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 1503 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 1504 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1; 1505 } 1506 if (j==nz-1){ 1507 tmp0 = tmps[vi[j]]; 1508 sum1 -= v1[j] * tmp0; 1509 sum2 -= v2[j+1] * tmp0; 1510 sum3 -= v3[j+2] * tmp0; 1511 sum4 -= v4[j+3] * tmp0; 1512 sum5 -= v5[j+4] * tmp0; 1513 } 1514 1515 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 1516 sum2 -= v2[0] * tmp0; 1517 sum3 -= v3[1] * tmp0; 1518 sum4 -= v4[2] * tmp0; 1519 sum5 -= v5[3] * tmp0; 1520 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 1521 sum3 -= v3[0] * tmp0; 1522 sum4 -= v4[1] * tmp0; 1523 sum5 -= v5[2] * tmp0; 1524 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 1525 sum4 -= v4[0] * tmp0; 1526 sum5 -= v5[1] * tmp0; 1527 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 1528 sum5 -= v5[0] * tmp0; 1529 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--; 1530 break; 1531 default: 1532 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1533 } 1534 } 1535 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1536 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1537 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1538 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1539 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1540 PetscFunctionReturn(0); 1541 } 1542 1543 1544 /* 1545 Makes a longer coloring[] array and calls the usual code with that 1546 */ 1547 #undef __FUNCT__ 1548 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode" 1549 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 1550 { 1551 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 1552 PetscErrorCode ierr; 1553 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 1554 PetscInt *colorused,i; 1555 ISColoringValue *newcolor; 1556 1557 PetscFunctionBegin; 1558 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr); 1559 /* loop over inodes, marking a color for each column*/ 1560 row = 0; 1561 for (i=0; i<m; i++){ 1562 for (j=0; j<ns[i]; j++) { 1563 newcolor[row++] = coloring[i] + j*ncolors; 1564 } 1565 } 1566 1567 /* eliminate unneeded colors */ 1568 ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr); 1569 ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr); 1570 for (i=0; i<n; i++) { 1571 colorused[newcolor[i]] = 1; 1572 } 1573 1574 for (i=1; i<5*ncolors; i++) { 1575 colorused[i] += colorused[i-1]; 1576 } 1577 ncolors = colorused[5*ncolors-1]; 1578 for (i=0; i<n; i++) { 1579 newcolor[i] = colorused[newcolor[i]]-1; 1580 } 1581 ierr = PetscFree(colorused);CHKERRQ(ierr); 1582 ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); 1583 ierr = PetscFree(coloring);CHKERRQ(ierr); 1584 PetscFunctionReturn(0); 1585 } 1586 1587 #include "../src/mat/blockinvert.h" 1588 1589 #undef __FUNCT__ 1590 #define __FUNCT__ "MatSOR_SeqAIJ_Inode" 1591 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 1592 { 1593 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1594 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3; 1595 MatScalar *ibdiag,*bdiag; 1596 PetscScalar *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5; 1597 const MatScalar *v = a->a,*v1,*v2,*v3,*v4,*v5; 1598 PetscReal zeropivot = 1.0e-15, shift = 0.0; 1599 PetscErrorCode ierr; 1600 PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2; 1601 PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k; 1602 1603 PetscFunctionBegin; 1604 if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 1605 if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 1606 if (its > 1) { 1607 /* switch to non-inode version */ 1608 ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr); 1609 PetscFunctionReturn(0); 1610 } 1611 1612 if (!a->inode.ibdiagvalid) { 1613 if (!a->inode.ibdiag) { 1614 /* calculate space needed for diagonal blocks */ 1615 for (i=0; i<m; i++) { 1616 cnt += sizes[i]*sizes[i]; 1617 } 1618 a->inode.bdiagsize = cnt; 1619 ierr = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr); 1620 } 1621 1622 /* copy over the diagonal blocks and invert them */ 1623 ibdiag = a->inode.ibdiag; 1624 bdiag = a->inode.bdiag; 1625 cnt = 0; 1626 for (i=0, row = 0; i<m; i++) { 1627 for (j=0; j<sizes[i]; j++) { 1628 for (k=0; k<sizes[i]; k++) { 1629 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 1630 } 1631 } 1632 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 1633 1634 switch(sizes[i]) { 1635 case 1: 1636 /* Create matrix data structure */ 1637 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 1638 ibdiag[cnt] = 1.0/ibdiag[cnt]; 1639 break; 1640 case 2: 1641 ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr); 1642 break; 1643 case 3: 1644 ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr); 1645 break; 1646 case 4: 1647 ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr); 1648 break; 1649 case 5: 1650 ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,shift);CHKERRQ(ierr); 1651 break; 1652 default: 1653 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1654 } 1655 cnt += sizes[i]*sizes[i]; 1656 row += sizes[i]; 1657 } 1658 a->inode.ibdiagvalid = PETSC_TRUE; 1659 } 1660 ibdiag = a->inode.ibdiag; 1661 bdiag = a->inode.bdiag; 1662 1663 1664 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1665 if (flag & SOR_ZERO_INITIAL_GUESS) { 1666 PetscScalar *b; 1667 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1668 if (xx != bb) { 1669 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1670 } else { 1671 b = x; 1672 } 1673 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 1674 1675 for (i=0, row=0; i<m; i++) { 1676 sz = diag[row] - ii[row]; 1677 v1 = a->a + ii[row]; 1678 idx = a->j + ii[row]; 1679 1680 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 1681 switch (sizes[i]){ 1682 case 1: 1683 1684 sum1 = b[row]; 1685 for(n = 0; n<sz-1; n+=2) { 1686 i1 = idx[0]; 1687 i2 = idx[1]; 1688 idx += 2; 1689 tmp0 = x[i1]; 1690 tmp1 = x[i2]; 1691 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1692 } 1693 1694 if (n == sz-1){ 1695 tmp0 = x[*idx]; 1696 sum1 -= *v1 * tmp0; 1697 } 1698 x[row++] = sum1*(*ibdiag++); 1699 break; 1700 case 2: 1701 v2 = a->a + ii[row+1]; 1702 sum1 = b[row]; 1703 sum2 = b[row+1]; 1704 for(n = 0; n<sz-1; n+=2) { 1705 i1 = idx[0]; 1706 i2 = idx[1]; 1707 idx += 2; 1708 tmp0 = x[i1]; 1709 tmp1 = x[i2]; 1710 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1711 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1712 } 1713 1714 if (n == sz-1){ 1715 tmp0 = x[*idx]; 1716 sum1 -= v1[0] * tmp0; 1717 sum2 -= v2[0] * tmp0; 1718 } 1719 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 1720 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 1721 ibdiag += 4; 1722 break; 1723 case 3: 1724 v2 = a->a + ii[row+1]; 1725 v3 = a->a + ii[row+2]; 1726 sum1 = b[row]; 1727 sum2 = b[row+1]; 1728 sum3 = b[row+2]; 1729 for(n = 0; n<sz-1; n+=2) { 1730 i1 = idx[0]; 1731 i2 = idx[1]; 1732 idx += 2; 1733 tmp0 = x[i1]; 1734 tmp1 = x[i2]; 1735 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1736 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1737 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1738 } 1739 1740 if (n == sz-1){ 1741 tmp0 = x[*idx]; 1742 sum1 -= v1[0] * tmp0; 1743 sum2 -= v2[0] * tmp0; 1744 sum3 -= v3[0] * tmp0; 1745 } 1746 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 1747 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 1748 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 1749 ibdiag += 9; 1750 break; 1751 case 4: 1752 v2 = a->a + ii[row+1]; 1753 v3 = a->a + ii[row+2]; 1754 v4 = a->a + ii[row+3]; 1755 sum1 = b[row]; 1756 sum2 = b[row+1]; 1757 sum3 = b[row+2]; 1758 sum4 = b[row+3]; 1759 for(n = 0; n<sz-1; n+=2) { 1760 i1 = idx[0]; 1761 i2 = idx[1]; 1762 idx += 2; 1763 tmp0 = x[i1]; 1764 tmp1 = x[i2]; 1765 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1766 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1767 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1768 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1769 } 1770 1771 if (n == sz-1){ 1772 tmp0 = x[*idx]; 1773 sum1 -= v1[0] * tmp0; 1774 sum2 -= v2[0] * tmp0; 1775 sum3 -= v3[0] * tmp0; 1776 sum4 -= v4[0] * tmp0; 1777 } 1778 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 1779 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 1780 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 1781 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 1782 ibdiag += 16; 1783 break; 1784 case 5: 1785 v2 = a->a + ii[row+1]; 1786 v3 = a->a + ii[row+2]; 1787 v4 = a->a + ii[row+3]; 1788 v5 = a->a + ii[row+4]; 1789 sum1 = b[row]; 1790 sum2 = b[row+1]; 1791 sum3 = b[row+2]; 1792 sum4 = b[row+3]; 1793 sum5 = b[row+4]; 1794 for(n = 0; n<sz-1; n+=2) { 1795 i1 = idx[0]; 1796 i2 = idx[1]; 1797 idx += 2; 1798 tmp0 = x[i1]; 1799 tmp1 = x[i2]; 1800 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1801 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1802 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1803 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1804 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 1805 } 1806 1807 if (n == sz-1){ 1808 tmp0 = x[*idx]; 1809 sum1 -= v1[0] * tmp0; 1810 sum2 -= v2[0] * tmp0; 1811 sum3 -= v3[0] * tmp0; 1812 sum4 -= v4[0] * tmp0; 1813 sum5 -= v5[0] * tmp0; 1814 } 1815 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 1816 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 1817 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 1818 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 1819 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 1820 ibdiag += 25; 1821 break; 1822 default: 1823 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1824 } 1825 } 1826 1827 xb = x; 1828 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 1829 } else xb = b; 1830 if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) && 1831 (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1832 cnt = 0; 1833 for (i=0, row=0; i<m; i++) { 1834 1835 switch (sizes[i]){ 1836 case 1: 1837 x[row++] *= bdiag[cnt++]; 1838 break; 1839 case 2: 1840 x1 = x[row]; x2 = x[row+1]; 1841 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 1842 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 1843 x[row++] = tmp1; 1844 x[row++] = tmp2; 1845 cnt += 4; 1846 break; 1847 case 3: 1848 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 1849 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 1850 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 1851 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 1852 x[row++] = tmp1; 1853 x[row++] = tmp2; 1854 x[row++] = tmp3; 1855 cnt += 9; 1856 break; 1857 case 4: 1858 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 1859 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 1860 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 1861 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 1862 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 1863 x[row++] = tmp1; 1864 x[row++] = tmp2; 1865 x[row++] = tmp3; 1866 x[row++] = tmp4; 1867 cnt += 16; 1868 break; 1869 case 5: 1870 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 1871 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 1872 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 1873 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 1874 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 1875 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 1876 x[row++] = tmp1; 1877 x[row++] = tmp2; 1878 x[row++] = tmp3; 1879 x[row++] = tmp4; 1880 x[row++] = tmp5; 1881 cnt += 25; 1882 break; 1883 default: 1884 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1885 } 1886 } 1887 ierr = PetscLogFlops(m);CHKERRQ(ierr); 1888 } 1889 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 1890 1891 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 1892 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 1893 ibdiag -= sizes[i]*sizes[i]; 1894 sz = ii[row+1] - diag[row] - 1; 1895 v1 = a->a + diag[row] + 1; 1896 idx = a->j + diag[row] + 1; 1897 1898 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 1899 switch (sizes[i]){ 1900 case 1: 1901 1902 sum1 = xb[row]; 1903 for(n = 0; n<sz-1; n+=2) { 1904 i1 = idx[0]; 1905 i2 = idx[1]; 1906 idx += 2; 1907 tmp0 = x[i1]; 1908 tmp1 = x[i2]; 1909 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1910 } 1911 1912 if (n == sz-1){ 1913 tmp0 = x[*idx]; 1914 sum1 -= *v1*tmp0; 1915 } 1916 x[row--] = sum1*(*ibdiag); 1917 break; 1918 1919 case 2: 1920 1921 sum1 = xb[row]; 1922 sum2 = xb[row-1]; 1923 /* note that sum1 is associated with the second of the two rows */ 1924 v2 = a->a + diag[row-1] + 2; 1925 for(n = 0; n<sz-1; n+=2) { 1926 i1 = idx[0]; 1927 i2 = idx[1]; 1928 idx += 2; 1929 tmp0 = x[i1]; 1930 tmp1 = x[i2]; 1931 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1932 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1933 } 1934 1935 if (n == sz-1){ 1936 tmp0 = x[*idx]; 1937 sum1 -= *v1*tmp0; 1938 sum2 -= *v2*tmp0; 1939 } 1940 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 1941 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 1942 break; 1943 case 3: 1944 1945 sum1 = xb[row]; 1946 sum2 = xb[row-1]; 1947 sum3 = xb[row-2]; 1948 v2 = a->a + diag[row-1] + 2; 1949 v3 = a->a + diag[row-2] + 3; 1950 for(n = 0; n<sz-1; n+=2) { 1951 i1 = idx[0]; 1952 i2 = idx[1]; 1953 idx += 2; 1954 tmp0 = x[i1]; 1955 tmp1 = x[i2]; 1956 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1957 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1958 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1959 } 1960 1961 if (n == sz-1){ 1962 tmp0 = x[*idx]; 1963 sum1 -= *v1*tmp0; 1964 sum2 -= *v2*tmp0; 1965 sum3 -= *v3*tmp0; 1966 } 1967 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 1968 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 1969 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 1970 break; 1971 case 4: 1972 1973 sum1 = xb[row]; 1974 sum2 = xb[row-1]; 1975 sum3 = xb[row-2]; 1976 sum4 = xb[row-3]; 1977 v2 = a->a + diag[row-1] + 2; 1978 v3 = a->a + diag[row-2] + 3; 1979 v4 = a->a + diag[row-3] + 4; 1980 for(n = 0; n<sz-1; n+=2) { 1981 i1 = idx[0]; 1982 i2 = idx[1]; 1983 idx += 2; 1984 tmp0 = x[i1]; 1985 tmp1 = x[i2]; 1986 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1987 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1988 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1989 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1990 } 1991 1992 if (n == sz-1){ 1993 tmp0 = x[*idx]; 1994 sum1 -= *v1*tmp0; 1995 sum2 -= *v2*tmp0; 1996 sum3 -= *v3*tmp0; 1997 sum4 -= *v4*tmp0; 1998 } 1999 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 2000 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 2001 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 2002 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 2003 break; 2004 case 5: 2005 2006 sum1 = xb[row]; 2007 sum2 = xb[row-1]; 2008 sum3 = xb[row-2]; 2009 sum4 = xb[row-3]; 2010 sum5 = xb[row-4]; 2011 v2 = a->a + diag[row-1] + 2; 2012 v3 = a->a + diag[row-2] + 3; 2013 v4 = a->a + diag[row-3] + 4; 2014 v5 = a->a + diag[row-4] + 5; 2015 for(n = 0; n<sz-1; n+=2) { 2016 i1 = idx[0]; 2017 i2 = idx[1]; 2018 idx += 2; 2019 tmp0 = x[i1]; 2020 tmp1 = x[i2]; 2021 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2022 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2023 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2024 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2025 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2026 } 2027 2028 if (n == sz-1){ 2029 tmp0 = x[*idx]; 2030 sum1 -= *v1*tmp0; 2031 sum2 -= *v2*tmp0; 2032 sum3 -= *v3*tmp0; 2033 sum4 -= *v4*tmp0; 2034 sum5 -= *v5*tmp0; 2035 } 2036 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 2037 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 2038 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 2039 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 2040 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 2041 break; 2042 default: 2043 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2044 } 2045 } 2046 2047 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2048 } 2049 its--; 2050 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2051 if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);} 2052 } 2053 if (flag & SOR_EISENSTAT) { 2054 const PetscScalar *b; 2055 MatScalar *t = a->inode.ssor_work; 2056 2057 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2058 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2059 /* 2060 Apply (U + D)^-1 where D is now the block diagonal 2061 */ 2062 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 2063 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 2064 ibdiag -= sizes[i]*sizes[i]; 2065 sz = ii[row+1] - diag[row] - 1; 2066 v1 = a->a + diag[row] + 1; 2067 idx = a->j + diag[row] + 1; 2068 CHKMEMQ; 2069 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2070 switch (sizes[i]){ 2071 case 1: 2072 2073 sum1 = b[row]; 2074 for(n = 0; n<sz-1; n+=2) { 2075 i1 = idx[0]; 2076 i2 = idx[1]; 2077 idx += 2; 2078 tmp0 = x[i1]; 2079 tmp1 = x[i2]; 2080 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2081 } 2082 2083 if (n == sz-1){ 2084 tmp0 = x[*idx]; 2085 sum1 -= *v1*tmp0; 2086 } 2087 x[row] = sum1*(*ibdiag);row--; 2088 break; 2089 2090 case 2: 2091 2092 sum1 = b[row]; 2093 sum2 = b[row-1]; 2094 /* note that sum1 is associated with the second of the two rows */ 2095 v2 = a->a + diag[row-1] + 2; 2096 for(n = 0; n<sz-1; n+=2) { 2097 i1 = idx[0]; 2098 i2 = idx[1]; 2099 idx += 2; 2100 tmp0 = x[i1]; 2101 tmp1 = x[i2]; 2102 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2103 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2104 } 2105 2106 if (n == sz-1){ 2107 tmp0 = x[*idx]; 2108 sum1 -= *v1*tmp0; 2109 sum2 -= *v2*tmp0; 2110 } 2111 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 2112 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 2113 row -= 2; 2114 break; 2115 case 3: 2116 2117 sum1 = b[row]; 2118 sum2 = b[row-1]; 2119 sum3 = b[row-2]; 2120 v2 = a->a + diag[row-1] + 2; 2121 v3 = a->a + diag[row-2] + 3; 2122 for(n = 0; n<sz-1; n+=2) { 2123 i1 = idx[0]; 2124 i2 = idx[1]; 2125 idx += 2; 2126 tmp0 = x[i1]; 2127 tmp1 = x[i2]; 2128 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2129 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2130 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2131 } 2132 2133 if (n == sz-1){ 2134 tmp0 = x[*idx]; 2135 sum1 -= *v1*tmp0; 2136 sum2 -= *v2*tmp0; 2137 sum3 -= *v3*tmp0; 2138 } 2139 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 2140 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 2141 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 2142 row -= 3; 2143 break; 2144 case 4: 2145 2146 sum1 = b[row]; 2147 sum2 = b[row-1]; 2148 sum3 = b[row-2]; 2149 sum4 = b[row-3]; 2150 v2 = a->a + diag[row-1] + 2; 2151 v3 = a->a + diag[row-2] + 3; 2152 v4 = a->a + diag[row-3] + 4; 2153 for(n = 0; n<sz-1; n+=2) { 2154 i1 = idx[0]; 2155 i2 = idx[1]; 2156 idx += 2; 2157 tmp0 = x[i1]; 2158 tmp1 = x[i2]; 2159 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2160 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2161 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2162 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2163 } 2164 2165 if (n == sz-1){ 2166 tmp0 = x[*idx]; 2167 sum1 -= *v1*tmp0; 2168 sum2 -= *v2*tmp0; 2169 sum3 -= *v3*tmp0; 2170 sum4 -= *v4*tmp0; 2171 } 2172 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 2173 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 2174 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 2175 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 2176 row -= 4; 2177 break; 2178 case 5: 2179 2180 sum1 = b[row]; 2181 sum2 = b[row-1]; 2182 sum3 = b[row-2]; 2183 sum4 = b[row-3]; 2184 sum5 = b[row-4]; 2185 v2 = a->a + diag[row-1] + 2; 2186 v3 = a->a + diag[row-2] + 3; 2187 v4 = a->a + diag[row-3] + 4; 2188 v5 = a->a + diag[row-4] + 5; 2189 for(n = 0; n<sz-1; n+=2) { 2190 i1 = idx[0]; 2191 i2 = idx[1]; 2192 idx += 2; 2193 tmp0 = x[i1]; 2194 tmp1 = x[i2]; 2195 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2196 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2197 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2198 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2199 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2200 } 2201 2202 if (n == sz-1){ 2203 tmp0 = x[*idx]; 2204 sum1 -= *v1*tmp0; 2205 sum2 -= *v2*tmp0; 2206 sum3 -= *v3*tmp0; 2207 sum4 -= *v4*tmp0; 2208 sum5 -= *v5*tmp0; 2209 } 2210 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 2211 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 2212 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 2213 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 2214 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 2215 row -= 5; 2216 break; 2217 default: 2218 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2219 } 2220 CHKMEMQ; 2221 } 2222 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2223 2224 /* 2225 t = b - D x where D is the block diagonal 2226 */ 2227 cnt = 0; 2228 for (i=0, row=0; i<m; i++) { 2229 CHKMEMQ; 2230 switch (sizes[i]){ 2231 case 1: 2232 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 2233 break; 2234 case 2: 2235 x1 = x[row]; x2 = x[row+1]; 2236 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 2237 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 2238 t[row] = b[row] - tmp1; 2239 t[row+1] = b[row+1] - tmp2; row += 2; 2240 cnt += 4; 2241 break; 2242 case 3: 2243 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 2244 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 2245 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 2246 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 2247 t[row] = b[row] - tmp1; 2248 t[row+1] = b[row+1] - tmp2; 2249 t[row+2] = b[row+2] - tmp3; row += 3; 2250 cnt += 9; 2251 break; 2252 case 4: 2253 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 2254 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 2255 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 2256 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 2257 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 2258 t[row] = b[row] - tmp1; 2259 t[row+1] = b[row+1] - tmp2; 2260 t[row+2] = b[row+2] - tmp3; 2261 t[row+3] = b[row+3] - tmp4; row += 4; 2262 cnt += 16; 2263 break; 2264 case 5: 2265 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 2266 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 2267 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 2268 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 2269 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 2270 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 2271 t[row] = b[row] - tmp1; 2272 t[row+1] = b[row+1] - tmp2; 2273 t[row+2] = b[row+2] - tmp3; 2274 t[row+3] = b[row+3] - tmp4; 2275 t[row+4] = b[row+4] - tmp5;row += 5; 2276 cnt += 25; 2277 break; 2278 default: 2279 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2280 } 2281 CHKMEMQ; 2282 } 2283 ierr = PetscLogFlops(m);CHKERRQ(ierr); 2284 2285 2286 2287 /* 2288 Apply (L + D)^-1 where D is the block diagonal 2289 */ 2290 for (i=0, row=0; i<m; i++) { 2291 sz = diag[row] - ii[row]; 2292 v1 = a->a + ii[row]; 2293 idx = a->j + ii[row]; 2294 CHKMEMQ; 2295 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2296 switch (sizes[i]){ 2297 case 1: 2298 2299 sum1 = t[row]; 2300 for(n = 0; n<sz-1; n+=2) { 2301 i1 = idx[0]; 2302 i2 = idx[1]; 2303 idx += 2; 2304 tmp0 = t[i1]; 2305 tmp1 = t[i2]; 2306 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2307 } 2308 2309 if (n == sz-1){ 2310 tmp0 = t[*idx]; 2311 sum1 -= *v1 * tmp0; 2312 } 2313 x[row] += t[row] = sum1*(*ibdiag++); row++; 2314 break; 2315 case 2: 2316 v2 = a->a + ii[row+1]; 2317 sum1 = t[row]; 2318 sum2 = t[row+1]; 2319 for(n = 0; n<sz-1; n+=2) { 2320 i1 = idx[0]; 2321 i2 = idx[1]; 2322 idx += 2; 2323 tmp0 = t[i1]; 2324 tmp1 = t[i2]; 2325 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2326 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2327 } 2328 2329 if (n == sz-1){ 2330 tmp0 = t[*idx]; 2331 sum1 -= v1[0] * tmp0; 2332 sum2 -= v2[0] * tmp0; 2333 } 2334 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2335 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2336 ibdiag += 4; row += 2; 2337 break; 2338 case 3: 2339 v2 = a->a + ii[row+1]; 2340 v3 = a->a + ii[row+2]; 2341 sum1 = t[row]; 2342 sum2 = t[row+1]; 2343 sum3 = t[row+2]; 2344 for(n = 0; n<sz-1; n+=2) { 2345 i1 = idx[0]; 2346 i2 = idx[1]; 2347 idx += 2; 2348 tmp0 = t[i1]; 2349 tmp1 = t[i2]; 2350 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2351 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2352 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2353 } 2354 2355 if (n == sz-1){ 2356 tmp0 = t[*idx]; 2357 sum1 -= v1[0] * tmp0; 2358 sum2 -= v2[0] * tmp0; 2359 sum3 -= v3[0] * tmp0; 2360 } 2361 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2362 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2363 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2364 ibdiag += 9; row += 3; 2365 break; 2366 case 4: 2367 v2 = a->a + ii[row+1]; 2368 v3 = a->a + ii[row+2]; 2369 v4 = a->a + ii[row+3]; 2370 sum1 = t[row]; 2371 sum2 = t[row+1]; 2372 sum3 = t[row+2]; 2373 sum4 = t[row+3]; 2374 for(n = 0; n<sz-1; n+=2) { 2375 i1 = idx[0]; 2376 i2 = idx[1]; 2377 idx += 2; 2378 tmp0 = t[i1]; 2379 tmp1 = t[i2]; 2380 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2381 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2382 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2383 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2384 } 2385 2386 if (n == sz-1){ 2387 tmp0 = t[*idx]; 2388 sum1 -= v1[0] * tmp0; 2389 sum2 -= v2[0] * tmp0; 2390 sum3 -= v3[0] * tmp0; 2391 sum4 -= v4[0] * tmp0; 2392 } 2393 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2394 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2395 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2396 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2397 ibdiag += 16; row += 4; 2398 break; 2399 case 5: 2400 v2 = a->a + ii[row+1]; 2401 v3 = a->a + ii[row+2]; 2402 v4 = a->a + ii[row+3]; 2403 v5 = a->a + ii[row+4]; 2404 sum1 = t[row]; 2405 sum2 = t[row+1]; 2406 sum3 = t[row+2]; 2407 sum4 = t[row+3]; 2408 sum5 = t[row+4]; 2409 for(n = 0; n<sz-1; n+=2) { 2410 i1 = idx[0]; 2411 i2 = idx[1]; 2412 idx += 2; 2413 tmp0 = t[i1]; 2414 tmp1 = t[i2]; 2415 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2416 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2417 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2418 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2419 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2420 } 2421 2422 if (n == sz-1){ 2423 tmp0 = t[*idx]; 2424 sum1 -= v1[0] * tmp0; 2425 sum2 -= v2[0] * tmp0; 2426 sum3 -= v3[0] * tmp0; 2427 sum4 -= v4[0] * tmp0; 2428 sum5 -= v5[0] * tmp0; 2429 } 2430 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2431 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2432 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2433 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2434 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2435 ibdiag += 25; row += 5; 2436 break; 2437 default: 2438 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2439 } 2440 CHKMEMQ; 2441 } 2442 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2443 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2444 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2445 } 2446 PetscFunctionReturn(0); 2447 } 2448 2449 #undef __FUNCT__ 2450 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode" 2451 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 2452 { 2453 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2454 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 2455 const MatScalar *bdiag = a->inode.bdiag; 2456 const PetscScalar *b; 2457 PetscErrorCode ierr; 2458 PetscInt m = a->inode.node_count,cnt = 0,i,row; 2459 const PetscInt *sizes = a->inode.size; 2460 2461 PetscFunctionBegin; 2462 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2463 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2464 cnt = 0; 2465 for (i=0, row=0; i<m; i++) { 2466 switch (sizes[i]){ 2467 case 1: 2468 x[row] = b[row]*bdiag[cnt++];row++; 2469 break; 2470 case 2: 2471 x1 = b[row]; x2 = b[row+1]; 2472 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 2473 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 2474 x[row++] = tmp1; 2475 x[row++] = tmp2; 2476 cnt += 4; 2477 break; 2478 case 3: 2479 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 2480 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 2481 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 2482 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 2483 x[row++] = tmp1; 2484 x[row++] = tmp2; 2485 x[row++] = tmp3; 2486 cnt += 9; 2487 break; 2488 case 4: 2489 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 2490 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 2491 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 2492 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 2493 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 2494 x[row++] = tmp1; 2495 x[row++] = tmp2; 2496 x[row++] = tmp3; 2497 x[row++] = tmp4; 2498 cnt += 16; 2499 break; 2500 case 5: 2501 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 2502 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 2503 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 2504 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 2505 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 2506 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 2507 x[row++] = tmp1; 2508 x[row++] = tmp2; 2509 x[row++] = tmp3; 2510 x[row++] = tmp4; 2511 x[row++] = tmp5; 2512 cnt += 25; 2513 break; 2514 default: 2515 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2516 } 2517 } 2518 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 2519 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2520 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2521 PetscFunctionReturn(0); 2522 } 2523 2524 /* 2525 samestructure indicates that the matrix has not changed its nonzero structure so we 2526 do not need to recompute the inodes 2527 */ 2528 #undef __FUNCT__ 2529 #define __FUNCT__ "Mat_CheckInode" 2530 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure) 2531 { 2532 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2533 PetscErrorCode ierr; 2534 PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size; 2535 PetscTruth flag; 2536 2537 PetscFunctionBegin; 2538 if (!a->inode.use) PetscFunctionReturn(0); 2539 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 2540 2541 2542 m = A->rmap->n; 2543 if (a->inode.size) {ns = a->inode.size;} 2544 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 2545 2546 i = 0; 2547 node_count = 0; 2548 idx = a->j; 2549 ii = a->i; 2550 while (i < m){ /* For each row */ 2551 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 2552 /* Limits the number of elements in a node to 'a->inode.limit' */ 2553 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 2554 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 2555 if (nzy != nzx) break; 2556 idy += nzx; /* Same nonzero pattern */ 2557 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 2558 if (!flag) break; 2559 } 2560 ns[node_count++] = blk_size; 2561 idx += blk_size*nzx; 2562 i = j; 2563 } 2564 /* If not enough inodes found,, do not use inode version of the routines */ 2565 if (!a->inode.size && m && node_count > .9*m) { 2566 ierr = PetscFree(ns);CHKERRQ(ierr); 2567 a->inode.node_count = 0; 2568 a->inode.size = PETSC_NULL; 2569 a->inode.use = PETSC_FALSE; 2570 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 2571 } else { 2572 A->ops->mult = MatMult_SeqAIJ_Inode; 2573 A->ops->sor = MatSOR_SeqAIJ_Inode; 2574 A->ops->multadd = MatMultAdd_SeqAIJ_Inode; 2575 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 2576 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 2577 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 2578 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 2579 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 2580 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 2581 a->inode.node_count = node_count; 2582 a->inode.size = ns; 2583 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 2584 } 2585 PetscFunctionReturn(0); 2586 } 2587 2588 #define MatGetRow_FactoredLU(cols,nzl,nzu,nz,ai,aj,adiag,row) { \ 2589 PetscInt __k, *__vi; \ 2590 __vi = aj + ai[row]; \ 2591 for(__k=0;__k<nzl;__k++) cols[__k] = __vi[__k]; \ 2592 __vi = aj + adiag[row]; \ 2593 cols[nzl] = __vi[0];\ 2594 __vi = aj + adiag[row+1]+1;\ 2595 for(__k=0;__k<nzu;__k++) cols[nzl+1+__k] = __vi[__k];} 2596 2597 #undef __FUNCT__ 2598 #define __FUNCT__ "Mat_CheckInode_FactorLU" 2599 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscTruth samestructure) 2600 { 2601 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2602 PetscErrorCode ierr; 2603 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; 2604 PetscInt *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag; 2605 PetscTruth flag; 2606 2607 PetscFunctionBegin; 2608 if (!a->inode.use) PetscFunctionReturn(0); 2609 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 2610 2611 m = A->rmap->n; 2612 if (a->inode.size) {ns = a->inode.size;} 2613 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 2614 2615 i = 0; 2616 node_count = 0; 2617 while (i < m){ /* For each row */ 2618 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ 2619 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ 2620 nzx = nzl1 + nzu1 + 1; 2621 ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr); 2622 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); 2623 2624 /* Limits the number of elements in a node to 'a->inode.limit' */ 2625 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 2626 nzl2 = ai[j+1] - ai[j]; 2627 nzu2 = adiag[j] - adiag[j+1] - 1; 2628 nzy = nzl2 + nzu2 + 1; 2629 if( nzy != nzx) break; 2630 ierr = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr); 2631 MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j); 2632 ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 2633 if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;} 2634 ierr = PetscFree(cols2);CHKERRQ(ierr); 2635 } 2636 ns[node_count++] = blk_size; 2637 ierr = PetscFree(cols1);CHKERRQ(ierr); 2638 i = j; 2639 } 2640 /* If not enough inodes found,, do not use inode version of the routines */ 2641 if (!a->inode.size && m && node_count > .9*m) { 2642 ierr = PetscFree(ns);CHKERRQ(ierr); 2643 a->inode.node_count = 0; 2644 a->inode.size = PETSC_NULL; 2645 a->inode.use = PETSC_FALSE; 2646 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 2647 } else { 2648 A->ops->mult = 0; 2649 A->ops->sor = 0; 2650 A->ops->multadd = 0; 2651 A->ops->getrowij = 0; 2652 A->ops->restorerowij = 0; 2653 A->ops->getcolumnij = 0; 2654 A->ops->restorecolumnij = 0; 2655 A->ops->coloringpatch = 0; 2656 A->ops->multdiagonalblock = 0; 2657 a->inode.node_count = node_count; 2658 a->inode.size = ns; 2659 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 2660 } 2661 PetscFunctionReturn(0); 2662 } 2663 2664 /* 2665 This is really ugly. if inodes are used this replaces the 2666 permutations with ones that correspond to rows/cols of the matrix 2667 rather then inode blocks 2668 */ 2669 #undef __FUNCT__ 2670 #define __FUNCT__ "MatInodeAdjustForInodes" 2671 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 2672 { 2673 PetscErrorCode ierr,(*f)(Mat,IS*,IS*); 2674 2675 PetscFunctionBegin; 2676 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr); 2677 if (f) { 2678 ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr); 2679 } 2680 PetscFunctionReturn(0); 2681 } 2682 2683 EXTERN_C_BEGIN 2684 #undef __FUNCT__ 2685 #define __FUNCT__ "MatAdjustForInodes_SeqAIJ_Inode" 2686 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 2687 { 2688 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 2689 PetscErrorCode ierr; 2690 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 2691 const PetscInt *ridx,*cidx; 2692 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 2693 PetscInt nslim_col,*ns_col; 2694 IS ris = *rperm,cis = *cperm; 2695 2696 PetscFunctionBegin; 2697 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 2698 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 2699 2700 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 2701 ierr = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 2702 ierr = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr); 2703 2704 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 2705 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 2706 2707 /* Form the inode structure for the rows of permuted matric using inv perm*/ 2708 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 2709 2710 /* Construct the permutations for rows*/ 2711 for (i=0,row = 0; i<nslim_row; ++i){ 2712 indx = ridx[i]; 2713 start_val = tns[indx]; 2714 end_val = tns[indx + 1]; 2715 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 2716 } 2717 2718 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 2719 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 2720 2721 /* Construct permutations for columns */ 2722 for (i=0,col=0; i<nslim_col; ++i){ 2723 indx = cidx[i]; 2724 start_val = tns[indx]; 2725 end_val = tns[indx + 1]; 2726 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 2727 } 2728 2729 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr); 2730 ierr = ISSetPermutation(*rperm);CHKERRQ(ierr); 2731 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr); 2732 ierr = ISSetPermutation(*cperm);CHKERRQ(ierr); 2733 2734 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 2735 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 2736 2737 ierr = PetscFree(ns_col);CHKERRQ(ierr); 2738 ierr = PetscFree2(permr,permc);CHKERRQ(ierr); 2739 ierr = ISDestroy(cis);CHKERRQ(ierr); 2740 ierr = ISDestroy(ris);CHKERRQ(ierr); 2741 ierr = PetscFree(tns);CHKERRQ(ierr); 2742 PetscFunctionReturn(0); 2743 } 2744 EXTERN_C_END 2745 2746 #undef __FUNCT__ 2747 #define __FUNCT__ "MatInodeGetInodeSizes" 2748 /*@C 2749 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 2750 2751 Collective on Mat 2752 2753 Input Parameter: 2754 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 2755 2756 Output Parameter: 2757 + node_count - no of inodes present in the matrix. 2758 . sizes - an array of size node_count,with sizes of each inode. 2759 - limit - the max size used to generate the inodes. 2760 2761 Level: advanced 2762 2763 Notes: This routine returns some internal storage information 2764 of the matrix, it is intended to be used by advanced users. 2765 It should be called after the matrix is assembled. 2766 The contents of the sizes[] array should not be changed. 2767 PETSC_NULL may be passed for information not requested. 2768 2769 .keywords: matrix, seqaij, get, inode 2770 2771 .seealso: MatGetInfo() 2772 @*/ 2773 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 2774 { 2775 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 2776 2777 PetscFunctionBegin; 2778 if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 2779 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr); 2780 if (f) { 2781 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 2782 } 2783 PetscFunctionReturn(0); 2784 } 2785 2786 EXTERN_C_BEGIN 2787 #undef __FUNCT__ 2788 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode" 2789 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 2790 { 2791 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2792 2793 PetscFunctionBegin; 2794 if (node_count) *node_count = a->inode.node_count; 2795 if (sizes) *sizes = a->inode.size; 2796 if (limit) *limit = a->inode.limit; 2797 PetscFunctionReturn(0); 2798 } 2799 EXTERN_C_END 2800