1 #define PETSCMAT_DLL 2 3 /* 4 This file provides high performance routines for the Inode format (compressed sparse row) 5 by taking advantage of rows with identical nonzero structure (I-nodes). 6 */ 7 #include "../src/mat/impls/aij/seq/aij.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "Mat_CreateColInode" 11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns) 12 { 13 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 16 17 PetscFunctionBegin; 18 n = A->cmap->n; 19 m = A->rmap->n; 20 ns_row = a->inode.size; 21 22 min_mn = (m < n) ? m : n; 23 if (!ns) { 24 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++); 25 for(; count+1 < n; count++,i++); 26 if (count < n) { 27 i++; 28 } 29 *size = i; 30 PetscFunctionReturn(0); 31 } 32 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr); 33 34 /* Use the same row structure wherever feasible. */ 35 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 36 ns_col[i] = ns_row[i]; 37 } 38 39 /* if m < n; pad up the remainder with inode_limit */ 40 for(; count+1 < n; count++,i++) { 41 ns_col[i] = 1; 42 } 43 /* The last node is the odd ball. padd it up with the remaining rows; */ 44 if (count < n) { 45 ns_col[i] = n - count; 46 i++; 47 } else if (count > n) { 48 /* Adjust for the over estimation */ 49 ns_col[i-1] += n - count; 50 } 51 *size = i; 52 *ns = ns_col; 53 PetscFunctionReturn(0); 54 } 55 56 57 /* 58 This builds symmetric version of nonzero structure, 59 */ 60 #undef __FUNCT__ 61 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric" 62 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 63 { 64 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 65 PetscErrorCode ierr; 66 PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n; 67 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j; 68 69 PetscFunctionBegin; 70 nslim_row = a->inode.node_count; 71 m = A->rmap->n; 72 n = A->cmap->n; 73 if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 74 75 /* Use the row_inode as column_inode */ 76 nslim_col = nslim_row; 77 ns_col = ns_row; 78 79 /* allocate space for reformated inode structure */ 80 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 81 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 82 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 83 84 for (i1=0,col=0; i1<nslim_col; ++i1){ 85 nsz = ns_col[i1]; 86 for (i2=0; i2<nsz; ++i2,++col) 87 tvc[col] = i1; 88 } 89 /* allocate space for row pointers */ 90 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 91 *iia = ia; 92 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 93 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 94 95 /* determine the number of columns in each row */ 96 ia[0] = oshift; 97 for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) { 98 99 j = aj + ai[row] + ishift; 100 jmax = aj + ai[row+1] + ishift; 101 i2 = 0; 102 col = *j++ + ishift; 103 i2 = tvc[col]; 104 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 105 ia[i1+1]++; 106 ia[i2+1]++; 107 i2++; /* Start col of next node */ 108 while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j; 109 i2 = tvc[col]; 110 } 111 if(i2 == i1) ia[i2+1]++; /* now the diagonal element */ 112 } 113 114 /* shift ia[i] to point to next row */ 115 for (i1=1; i1<nslim_row+1; i1++) { 116 row = ia[i1-1]; 117 ia[i1] += row; 118 work[i1-1] = row - oshift; 119 } 120 121 /* allocate space for column pointers */ 122 nz = ia[nslim_row] + (!ishift); 123 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 124 *jja = ja; 125 126 /* loop over lower triangular part putting into ja */ 127 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 128 j = aj + ai[row] + ishift; 129 jmax = aj + ai[row+1] + ishift; 130 i2 = 0; /* Col inode index */ 131 col = *j++ + ishift; 132 i2 = tvc[col]; 133 while (i2<i1 && j<jmax) { 134 ja[work[i2]++] = i1 + oshift; 135 ja[work[i1]++] = i2 + oshift; 136 ++i2; 137 while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */ 138 i2 = tvc[col]; 139 } 140 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 141 142 } 143 ierr = PetscFree(work);CHKERRQ(ierr); 144 ierr = PetscFree(tns);CHKERRQ(ierr); 145 ierr = PetscFree(tvc);CHKERRQ(ierr); 146 PetscFunctionReturn(0); 147 } 148 149 /* 150 This builds nonsymmetric version of nonzero structure, 151 */ 152 #undef __FUNCT__ 153 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric" 154 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 155 { 156 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 157 PetscErrorCode ierr; 158 PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col; 159 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 160 161 PetscFunctionBegin; 162 nslim_row = a->inode.node_count; 163 n = A->cmap->n; 164 165 /* Create The column_inode for this matrix */ 166 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 167 168 /* allocate space for reformated column_inode structure */ 169 ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 170 ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 171 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 172 173 for (i1=0,col=0; i1<nslim_col; ++i1){ 174 nsz = ns_col[i1]; 175 for (i2=0; i2<nsz; ++i2,++col) 176 tvc[col] = i1; 177 } 178 /* allocate space for row pointers */ 179 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 180 *iia = ia; 181 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 182 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 183 184 /* determine the number of columns in each row */ 185 ia[0] = oshift; 186 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 187 j = aj + ai[row] + ishift; 188 col = *j++ + ishift; 189 i2 = tvc[col]; 190 nz = ai[row+1] - ai[row]; 191 while (nz-- > 0) { /* off-diagonal elemets */ 192 ia[i1+1]++; 193 i2++; /* Start col of next node */ 194 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 195 if (nz > 0) i2 = tvc[col]; 196 } 197 } 198 199 /* shift ia[i] to point to next row */ 200 for (i1=1; i1<nslim_row+1; i1++) { 201 row = ia[i1-1]; 202 ia[i1] += row; 203 work[i1-1] = row - oshift; 204 } 205 206 /* allocate space for column pointers */ 207 nz = ia[nslim_row] + (!ishift); 208 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 209 *jja = ja; 210 211 /* loop over matrix putting into ja */ 212 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 213 j = aj + ai[row] + ishift; 214 i2 = 0; /* Col inode index */ 215 col = *j++ + ishift; 216 i2 = tvc[col]; 217 nz = ai[row+1] - ai[row]; 218 while (nz-- > 0) { 219 ja[work[i1]++] = i2 + oshift; 220 ++i2; 221 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 222 if (nz > 0) i2 = tvc[col]; 223 } 224 } 225 ierr = PetscFree(ns_col);CHKERRQ(ierr); 226 ierr = PetscFree(work);CHKERRQ(ierr); 227 ierr = PetscFree(tns);CHKERRQ(ierr); 228 ierr = PetscFree(tvc);CHKERRQ(ierr); 229 PetscFunctionReturn(0); 230 } 231 232 #undef __FUNCT__ 233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode" 234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 235 { 236 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 237 PetscErrorCode ierr; 238 239 PetscFunctionBegin; 240 *n = a->inode.node_count; 241 if (!ia) PetscFunctionReturn(0); 242 if (!blockcompressed) { 243 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 244 } else if (symmetric) { 245 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 246 } else { 247 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 248 } 249 PetscFunctionReturn(0); 250 } 251 252 #undef __FUNCT__ 253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode" 254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 255 { 256 PetscErrorCode ierr; 257 258 PetscFunctionBegin; 259 if (!ia) PetscFunctionReturn(0); 260 261 if (!blockcompressed) { 262 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 263 } else { 264 ierr = PetscFree(*ia);CHKERRQ(ierr); 265 ierr = PetscFree(*ja);CHKERRQ(ierr); 266 } 267 268 PetscFunctionReturn(0); 269 } 270 271 /* ----------------------------------------------------------- */ 272 273 #undef __FUNCT__ 274 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric" 275 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 276 { 277 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 278 PetscErrorCode ierr; 279 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 280 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 281 282 PetscFunctionBegin; 283 nslim_row = a->inode.node_count; 284 n = A->cmap->n; 285 286 /* Create The column_inode for this matrix */ 287 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 288 289 /* allocate space for reformated column_inode structure */ 290 ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 291 ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 292 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 293 294 for (i1=0,col=0; i1<nslim_col; ++i1){ 295 nsz = ns_col[i1]; 296 for (i2=0; i2<nsz; ++i2,++col) 297 tvc[col] = i1; 298 } 299 /* allocate space for column pointers */ 300 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 301 *iia = ia; 302 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 303 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 304 305 /* determine the number of columns in each row */ 306 ia[0] = oshift; 307 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 308 j = aj + ai[row] + ishift; 309 col = *j++ + ishift; 310 i2 = tvc[col]; 311 nz = ai[row+1] - ai[row]; 312 while (nz-- > 0) { /* off-diagonal elemets */ 313 /* ia[i1+1]++; */ 314 ia[i2+1]++; 315 i2++; 316 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 317 if (nz > 0) i2 = tvc[col]; 318 } 319 } 320 321 /* shift ia[i] to point to next col */ 322 for (i1=1; i1<nslim_col+1; i1++) { 323 col = ia[i1-1]; 324 ia[i1] += col; 325 work[i1-1] = col - oshift; 326 } 327 328 /* allocate space for column pointers */ 329 nz = ia[nslim_col] + (!ishift); 330 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 331 *jja = ja; 332 333 /* loop over matrix putting into ja */ 334 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 335 j = aj + ai[row] + ishift; 336 i2 = 0; /* Col inode index */ 337 col = *j++ + ishift; 338 i2 = tvc[col]; 339 nz = ai[row+1] - ai[row]; 340 while (nz-- > 0) { 341 /* ja[work[i1]++] = i2 + oshift; */ 342 ja[work[i2]++] = i1 + oshift; 343 i2++; 344 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 345 if (nz > 0) i2 = tvc[col]; 346 } 347 } 348 ierr = PetscFree(ns_col);CHKERRQ(ierr); 349 ierr = PetscFree(work);CHKERRQ(ierr); 350 ierr = PetscFree(tns);CHKERRQ(ierr); 351 ierr = PetscFree(tvc);CHKERRQ(ierr); 352 PetscFunctionReturn(0); 353 } 354 355 #undef __FUNCT__ 356 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode" 357 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 358 { 359 PetscErrorCode ierr; 360 361 PetscFunctionBegin; 362 ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr); 363 if (!ia) PetscFunctionReturn(0); 364 365 if (!blockcompressed) { 366 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 367 } else if (symmetric) { 368 /* Since the indices are symmetric it does'nt matter */ 369 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 370 } else { 371 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 372 } 373 PetscFunctionReturn(0); 374 } 375 376 #undef __FUNCT__ 377 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode" 378 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 379 { 380 PetscErrorCode ierr; 381 382 PetscFunctionBegin; 383 if (!ia) PetscFunctionReturn(0); 384 if (!blockcompressed) { 385 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 386 } else { 387 ierr = PetscFree(*ia);CHKERRQ(ierr); 388 ierr = PetscFree(*ja);CHKERRQ(ierr); 389 } 390 PetscFunctionReturn(0); 391 } 392 393 /* ----------------------------------------------------------- */ 394 395 #undef __FUNCT__ 396 #define __FUNCT__ "MatMult_SeqAIJ_Inode" 397 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 398 { 399 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 400 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 401 PetscScalar *y; 402 const PetscScalar *x; 403 const MatScalar *v1,*v2,*v3,*v4,*v5; 404 PetscErrorCode ierr; 405 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0; 406 407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 409 #endif 410 411 PetscFunctionBegin; 412 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 413 node_max = a->inode.node_count; 414 ns = a->inode.size; /* Node Size array */ 415 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 416 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 417 idx = a->j; 418 v1 = a->a; 419 ii = a->i; 420 421 for (i = 0,row = 0; i< node_max; ++i){ 422 nsz = ns[i]; 423 n = ii[1] - ii[0]; 424 nonzerorow += (n>0)*nsz; 425 ii += nsz; 426 sz = n; /* No of non zeros in this row */ 427 /* Switch on the size of Node */ 428 switch (nsz){ /* Each loop in 'case' is unrolled */ 429 case 1 : 430 sum1 = 0.; 431 432 for(n = 0; n< sz-1; n+=2) { 433 i1 = idx[0]; /* The instructions are ordered to */ 434 i2 = idx[1]; /* make the compiler's job easy */ 435 idx += 2; 436 tmp0 = x[i1]; 437 tmp1 = x[i2]; 438 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 439 } 440 441 if (n == sz-1){ /* Take care of the last nonzero */ 442 tmp0 = x[*idx++]; 443 sum1 += *v1++ * tmp0; 444 } 445 y[row++]=sum1; 446 break; 447 case 2: 448 sum1 = 0.; 449 sum2 = 0.; 450 v2 = v1 + n; 451 452 for (n = 0; n< sz-1; n+=2) { 453 i1 = idx[0]; 454 i2 = idx[1]; 455 idx += 2; 456 tmp0 = x[i1]; 457 tmp1 = x[i2]; 458 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 459 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 460 } 461 if (n == sz-1){ 462 tmp0 = x[*idx++]; 463 sum1 += *v1++ * tmp0; 464 sum2 += *v2++ * tmp0; 465 } 466 y[row++]=sum1; 467 y[row++]=sum2; 468 v1 =v2; /* Since the next block to be processed starts there*/ 469 idx +=sz; 470 break; 471 case 3: 472 sum1 = 0.; 473 sum2 = 0.; 474 sum3 = 0.; 475 v2 = v1 + n; 476 v3 = v2 + n; 477 478 for (n = 0; n< sz-1; n+=2) { 479 i1 = idx[0]; 480 i2 = idx[1]; 481 idx += 2; 482 tmp0 = x[i1]; 483 tmp1 = x[i2]; 484 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 485 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 486 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 487 } 488 if (n == sz-1){ 489 tmp0 = x[*idx++]; 490 sum1 += *v1++ * tmp0; 491 sum2 += *v2++ * tmp0; 492 sum3 += *v3++ * tmp0; 493 } 494 y[row++]=sum1; 495 y[row++]=sum2; 496 y[row++]=sum3; 497 v1 =v3; /* Since the next block to be processed starts there*/ 498 idx +=2*sz; 499 break; 500 case 4: 501 sum1 = 0.; 502 sum2 = 0.; 503 sum3 = 0.; 504 sum4 = 0.; 505 v2 = v1 + n; 506 v3 = v2 + n; 507 v4 = v3 + n; 508 509 for (n = 0; n< sz-1; n+=2) { 510 i1 = idx[0]; 511 i2 = idx[1]; 512 idx += 2; 513 tmp0 = x[i1]; 514 tmp1 = x[i2]; 515 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 516 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 517 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 518 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 519 } 520 if (n == sz-1){ 521 tmp0 = x[*idx++]; 522 sum1 += *v1++ * tmp0; 523 sum2 += *v2++ * tmp0; 524 sum3 += *v3++ * tmp0; 525 sum4 += *v4++ * tmp0; 526 } 527 y[row++]=sum1; 528 y[row++]=sum2; 529 y[row++]=sum3; 530 y[row++]=sum4; 531 v1 =v4; /* Since the next block to be processed starts there*/ 532 idx +=3*sz; 533 break; 534 case 5: 535 sum1 = 0.; 536 sum2 = 0.; 537 sum3 = 0.; 538 sum4 = 0.; 539 sum5 = 0.; 540 v2 = v1 + n; 541 v3 = v2 + n; 542 v4 = v3 + n; 543 v5 = v4 + n; 544 545 for (n = 0; n<sz-1; n+=2) { 546 i1 = idx[0]; 547 i2 = idx[1]; 548 idx += 2; 549 tmp0 = x[i1]; 550 tmp1 = x[i2]; 551 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 552 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 553 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 554 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 555 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 556 } 557 if (n == sz-1){ 558 tmp0 = x[*idx++]; 559 sum1 += *v1++ * tmp0; 560 sum2 += *v2++ * tmp0; 561 sum3 += *v3++ * tmp0; 562 sum4 += *v4++ * tmp0; 563 sum5 += *v5++ * tmp0; 564 } 565 y[row++]=sum1; 566 y[row++]=sum2; 567 y[row++]=sum3; 568 y[row++]=sum4; 569 y[row++]=sum5; 570 v1 =v5; /* Since the next block to be processed starts there */ 571 idx +=4*sz; 572 break; 573 default : 574 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 575 } 576 } 577 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 578 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 579 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 580 PetscFunctionReturn(0); 581 } 582 /* ----------------------------------------------------------- */ 583 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 584 #undef __FUNCT__ 585 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode" 586 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 587 { 588 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 589 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 590 MatScalar *v1,*v2,*v3,*v4,*v5; 591 PetscScalar *x,*y,*z,*zt; 592 PetscErrorCode ierr; 593 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; 594 595 PetscFunctionBegin; 596 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 597 node_max = a->inode.node_count; 598 ns = a->inode.size; /* Node Size array */ 599 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 600 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 601 if (zz != yy) { 602 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 603 } else { 604 z = y; 605 } 606 zt = z; 607 608 idx = a->j; 609 v1 = a->a; 610 ii = a->i; 611 612 for (i = 0,row = 0; i< node_max; ++i){ 613 nsz = ns[i]; 614 n = ii[1] - ii[0]; 615 ii += nsz; 616 sz = n; /* No of non zeros in this row */ 617 /* Switch on the size of Node */ 618 switch (nsz){ /* Each loop in 'case' is unrolled */ 619 case 1 : 620 sum1 = *zt++; 621 622 for(n = 0; n< sz-1; n+=2) { 623 i1 = idx[0]; /* The instructions are ordered to */ 624 i2 = idx[1]; /* make the compiler's job easy */ 625 idx += 2; 626 tmp0 = x[i1]; 627 tmp1 = x[i2]; 628 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 629 } 630 631 if(n == sz-1){ /* Take care of the last nonzero */ 632 tmp0 = x[*idx++]; 633 sum1 += *v1++ * tmp0; 634 } 635 y[row++]=sum1; 636 break; 637 case 2: 638 sum1 = *zt++; 639 sum2 = *zt++; 640 v2 = v1 + n; 641 642 for(n = 0; n< sz-1; n+=2) { 643 i1 = idx[0]; 644 i2 = idx[1]; 645 idx += 2; 646 tmp0 = x[i1]; 647 tmp1 = x[i2]; 648 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 649 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 650 } 651 if(n == sz-1){ 652 tmp0 = x[*idx++]; 653 sum1 += *v1++ * tmp0; 654 sum2 += *v2++ * tmp0; 655 } 656 y[row++]=sum1; 657 y[row++]=sum2; 658 v1 =v2; /* Since the next block to be processed starts there*/ 659 idx +=sz; 660 break; 661 case 3: 662 sum1 = *zt++; 663 sum2 = *zt++; 664 sum3 = *zt++; 665 v2 = v1 + n; 666 v3 = v2 + n; 667 668 for (n = 0; n< sz-1; n+=2) { 669 i1 = idx[0]; 670 i2 = idx[1]; 671 idx += 2; 672 tmp0 = x[i1]; 673 tmp1 = x[i2]; 674 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 675 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 676 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 677 } 678 if (n == sz-1){ 679 tmp0 = x[*idx++]; 680 sum1 += *v1++ * tmp0; 681 sum2 += *v2++ * tmp0; 682 sum3 += *v3++ * tmp0; 683 } 684 y[row++]=sum1; 685 y[row++]=sum2; 686 y[row++]=sum3; 687 v1 =v3; /* Since the next block to be processed starts there*/ 688 idx +=2*sz; 689 break; 690 case 4: 691 sum1 = *zt++; 692 sum2 = *zt++; 693 sum3 = *zt++; 694 sum4 = *zt++; 695 v2 = v1 + n; 696 v3 = v2 + n; 697 v4 = v3 + n; 698 699 for (n = 0; n< sz-1; n+=2) { 700 i1 = idx[0]; 701 i2 = idx[1]; 702 idx += 2; 703 tmp0 = x[i1]; 704 tmp1 = x[i2]; 705 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 706 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 707 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 708 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 709 } 710 if (n == sz-1){ 711 tmp0 = x[*idx++]; 712 sum1 += *v1++ * tmp0; 713 sum2 += *v2++ * tmp0; 714 sum3 += *v3++ * tmp0; 715 sum4 += *v4++ * tmp0; 716 } 717 y[row++]=sum1; 718 y[row++]=sum2; 719 y[row++]=sum3; 720 y[row++]=sum4; 721 v1 =v4; /* Since the next block to be processed starts there*/ 722 idx +=3*sz; 723 break; 724 case 5: 725 sum1 = *zt++; 726 sum2 = *zt++; 727 sum3 = *zt++; 728 sum4 = *zt++; 729 sum5 = *zt++; 730 v2 = v1 + n; 731 v3 = v2 + n; 732 v4 = v3 + n; 733 v5 = v4 + n; 734 735 for (n = 0; n<sz-1; n+=2) { 736 i1 = idx[0]; 737 i2 = idx[1]; 738 idx += 2; 739 tmp0 = x[i1]; 740 tmp1 = x[i2]; 741 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 742 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 743 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 744 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 745 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 746 } 747 if(n == sz-1){ 748 tmp0 = x[*idx++]; 749 sum1 += *v1++ * tmp0; 750 sum2 += *v2++ * tmp0; 751 sum3 += *v3++ * tmp0; 752 sum4 += *v4++ * tmp0; 753 sum5 += *v5++ * tmp0; 754 } 755 y[row++]=sum1; 756 y[row++]=sum2; 757 y[row++]=sum3; 758 y[row++]=sum4; 759 y[row++]=sum5; 760 v1 =v5; /* Since the next block to be processed starts there */ 761 idx +=4*sz; 762 break; 763 default : 764 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 765 } 766 } 767 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 768 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 769 if (zz != yy) { 770 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 771 } 772 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 773 PetscFunctionReturn(0); 774 } 775 776 /* ----------------------------------------------------------- */ 777 #undef __FUNCT__ 778 #define __FUNCT__ "MatSolve_SeqAIJ_Inode" 779 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 780 { 781 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 782 IS iscol = a->col,isrow = a->row; 783 PetscErrorCode ierr; 784 const PetscInt *r,*c,*rout,*cout; 785 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 786 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 787 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 788 PetscScalar sum1,sum2,sum3,sum4,sum5; 789 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 790 const PetscScalar *b; 791 792 PetscFunctionBegin; 793 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 794 node_max = a->inode.node_count; 795 ns = a->inode.size; /* Node Size array */ 796 797 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 798 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 799 tmp = a->solve_work; 800 801 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 802 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 803 804 /* forward solve the lower triangular */ 805 tmps = tmp ; 806 aa = a_a ; 807 aj = a_j ; 808 ad = a->diag; 809 810 for (i = 0,row = 0; i< node_max; ++i){ 811 nsz = ns[i]; 812 aii = ai[row]; 813 v1 = aa + aii; 814 vi = aj + aii; 815 nz = ad[row]- aii; 816 817 switch (nsz){ /* Each loop in 'case' is unrolled */ 818 case 1 : 819 sum1 = b[*r++]; 820 for(j=0; j<nz-1; j+=2){ 821 i0 = vi[0]; 822 i1 = vi[1]; 823 vi +=2; 824 tmp0 = tmps[i0]; 825 tmp1 = tmps[i1]; 826 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 827 } 828 if(j == nz-1){ 829 tmp0 = tmps[*vi++]; 830 sum1 -= *v1++ *tmp0; 831 } 832 tmp[row ++]=sum1; 833 break; 834 case 2: 835 sum1 = b[*r++]; 836 sum2 = b[*r++]; 837 v2 = aa + ai[row+1]; 838 839 for(j=0; j<nz-1; j+=2){ 840 i0 = vi[0]; 841 i1 = vi[1]; 842 vi +=2; 843 tmp0 = tmps[i0]; 844 tmp1 = tmps[i1]; 845 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 846 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 847 } 848 if(j == nz-1){ 849 tmp0 = tmps[*vi++]; 850 sum1 -= *v1++ *tmp0; 851 sum2 -= *v2++ *tmp0; 852 } 853 sum2 -= *v2++ * sum1; 854 tmp[row ++]=sum1; 855 tmp[row ++]=sum2; 856 break; 857 case 3: 858 sum1 = b[*r++]; 859 sum2 = b[*r++]; 860 sum3 = b[*r++]; 861 v2 = aa + ai[row+1]; 862 v3 = aa + ai[row+2]; 863 864 for (j=0; j<nz-1; j+=2){ 865 i0 = vi[0]; 866 i1 = vi[1]; 867 vi +=2; 868 tmp0 = tmps[i0]; 869 tmp1 = tmps[i1]; 870 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 871 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 872 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 873 } 874 if (j == nz-1){ 875 tmp0 = tmps[*vi++]; 876 sum1 -= *v1++ *tmp0; 877 sum2 -= *v2++ *tmp0; 878 sum3 -= *v3++ *tmp0; 879 } 880 sum2 -= *v2++ * sum1; 881 sum3 -= *v3++ * sum1; 882 sum3 -= *v3++ * sum2; 883 tmp[row ++]=sum1; 884 tmp[row ++]=sum2; 885 tmp[row ++]=sum3; 886 break; 887 888 case 4: 889 sum1 = b[*r++]; 890 sum2 = b[*r++]; 891 sum3 = b[*r++]; 892 sum4 = b[*r++]; 893 v2 = aa + ai[row+1]; 894 v3 = aa + ai[row+2]; 895 v4 = aa + ai[row+3]; 896 897 for (j=0; j<nz-1; j+=2){ 898 i0 = vi[0]; 899 i1 = vi[1]; 900 vi +=2; 901 tmp0 = tmps[i0]; 902 tmp1 = tmps[i1]; 903 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 904 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 905 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 906 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 907 } 908 if (j == nz-1){ 909 tmp0 = tmps[*vi++]; 910 sum1 -= *v1++ *tmp0; 911 sum2 -= *v2++ *tmp0; 912 sum3 -= *v3++ *tmp0; 913 sum4 -= *v4++ *tmp0; 914 } 915 sum2 -= *v2++ * sum1; 916 sum3 -= *v3++ * sum1; 917 sum4 -= *v4++ * sum1; 918 sum3 -= *v3++ * sum2; 919 sum4 -= *v4++ * sum2; 920 sum4 -= *v4++ * sum3; 921 922 tmp[row ++]=sum1; 923 tmp[row ++]=sum2; 924 tmp[row ++]=sum3; 925 tmp[row ++]=sum4; 926 break; 927 case 5: 928 sum1 = b[*r++]; 929 sum2 = b[*r++]; 930 sum3 = b[*r++]; 931 sum4 = b[*r++]; 932 sum5 = b[*r++]; 933 v2 = aa + ai[row+1]; 934 v3 = aa + ai[row+2]; 935 v4 = aa + ai[row+3]; 936 v5 = aa + ai[row+4]; 937 938 for (j=0; j<nz-1; j+=2){ 939 i0 = vi[0]; 940 i1 = vi[1]; 941 vi +=2; 942 tmp0 = tmps[i0]; 943 tmp1 = tmps[i1]; 944 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 945 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 946 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 947 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 948 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 949 } 950 if (j == nz-1){ 951 tmp0 = tmps[*vi++]; 952 sum1 -= *v1++ *tmp0; 953 sum2 -= *v2++ *tmp0; 954 sum3 -= *v3++ *tmp0; 955 sum4 -= *v4++ *tmp0; 956 sum5 -= *v5++ *tmp0; 957 } 958 959 sum2 -= *v2++ * sum1; 960 sum3 -= *v3++ * sum1; 961 sum4 -= *v4++ * sum1; 962 sum5 -= *v5++ * sum1; 963 sum3 -= *v3++ * sum2; 964 sum4 -= *v4++ * sum2; 965 sum5 -= *v5++ * sum2; 966 sum4 -= *v4++ * sum3; 967 sum5 -= *v5++ * sum3; 968 sum5 -= *v5++ * sum4; 969 970 tmp[row ++]=sum1; 971 tmp[row ++]=sum2; 972 tmp[row ++]=sum3; 973 tmp[row ++]=sum4; 974 tmp[row ++]=sum5; 975 break; 976 default: 977 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 978 } 979 } 980 /* backward solve the upper triangular */ 981 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 982 nsz = ns[i]; 983 aii = ai[row+1] -1; 984 v1 = aa + aii; 985 vi = aj + aii; 986 nz = aii- ad[row]; 987 switch (nsz){ /* Each loop in 'case' is unrolled */ 988 case 1 : 989 sum1 = tmp[row]; 990 991 for(j=nz ; j>1; j-=2){ 992 vi -=2; 993 i0 = vi[2]; 994 i1 = vi[1]; 995 tmp0 = tmps[i0]; 996 tmp1 = tmps[i1]; 997 v1 -= 2; 998 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 999 } 1000 if (j==1){ 1001 tmp0 = tmps[*vi--]; 1002 sum1 -= *v1-- * tmp0; 1003 } 1004 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1005 break; 1006 case 2 : 1007 sum1 = tmp[row]; 1008 sum2 = tmp[row -1]; 1009 v2 = aa + ai[row]-1; 1010 for (j=nz ; j>1; j-=2){ 1011 vi -=2; 1012 i0 = vi[2]; 1013 i1 = vi[1]; 1014 tmp0 = tmps[i0]; 1015 tmp1 = tmps[i1]; 1016 v1 -= 2; 1017 v2 -= 2; 1018 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1019 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1020 } 1021 if (j==1){ 1022 tmp0 = tmps[*vi--]; 1023 sum1 -= *v1-- * tmp0; 1024 sum2 -= *v2-- * tmp0; 1025 } 1026 1027 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1028 sum2 -= *v2-- * tmp0; 1029 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1030 break; 1031 case 3 : 1032 sum1 = tmp[row]; 1033 sum2 = tmp[row -1]; 1034 sum3 = tmp[row -2]; 1035 v2 = aa + ai[row]-1; 1036 v3 = aa + ai[row -1]-1; 1037 for (j=nz ; j>1; j-=2){ 1038 vi -=2; 1039 i0 = vi[2]; 1040 i1 = vi[1]; 1041 tmp0 = tmps[i0]; 1042 tmp1 = tmps[i1]; 1043 v1 -= 2; 1044 v2 -= 2; 1045 v3 -= 2; 1046 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1047 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1048 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1049 } 1050 if (j==1){ 1051 tmp0 = tmps[*vi--]; 1052 sum1 -= *v1-- * tmp0; 1053 sum2 -= *v2-- * tmp0; 1054 sum3 -= *v3-- * tmp0; 1055 } 1056 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1057 sum2 -= *v2-- * tmp0; 1058 sum3 -= *v3-- * tmp0; 1059 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1060 sum3 -= *v3-- * tmp0; 1061 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1062 1063 break; 1064 case 4 : 1065 sum1 = tmp[row]; 1066 sum2 = tmp[row -1]; 1067 sum3 = tmp[row -2]; 1068 sum4 = tmp[row -3]; 1069 v2 = aa + ai[row]-1; 1070 v3 = aa + ai[row -1]-1; 1071 v4 = aa + ai[row -2]-1; 1072 1073 for (j=nz ; j>1; j-=2){ 1074 vi -=2; 1075 i0 = vi[2]; 1076 i1 = vi[1]; 1077 tmp0 = tmps[i0]; 1078 tmp1 = tmps[i1]; 1079 v1 -= 2; 1080 v2 -= 2; 1081 v3 -= 2; 1082 v4 -= 2; 1083 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1084 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1085 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1086 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1087 } 1088 if (j==1){ 1089 tmp0 = tmps[*vi--]; 1090 sum1 -= *v1-- * tmp0; 1091 sum2 -= *v2-- * tmp0; 1092 sum3 -= *v3-- * tmp0; 1093 sum4 -= *v4-- * tmp0; 1094 } 1095 1096 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1097 sum2 -= *v2-- * tmp0; 1098 sum3 -= *v3-- * tmp0; 1099 sum4 -= *v4-- * tmp0; 1100 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1101 sum3 -= *v3-- * tmp0; 1102 sum4 -= *v4-- * tmp0; 1103 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1104 sum4 -= *v4-- * tmp0; 1105 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1106 break; 1107 case 5 : 1108 sum1 = tmp[row]; 1109 sum2 = tmp[row -1]; 1110 sum3 = tmp[row -2]; 1111 sum4 = tmp[row -3]; 1112 sum5 = tmp[row -4]; 1113 v2 = aa + ai[row]-1; 1114 v3 = aa + ai[row -1]-1; 1115 v4 = aa + ai[row -2]-1; 1116 v5 = aa + ai[row -3]-1; 1117 for (j=nz ; j>1; j-=2){ 1118 vi -= 2; 1119 i0 = vi[2]; 1120 i1 = vi[1]; 1121 tmp0 = tmps[i0]; 1122 tmp1 = tmps[i1]; 1123 v1 -= 2; 1124 v2 -= 2; 1125 v3 -= 2; 1126 v4 -= 2; 1127 v5 -= 2; 1128 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1129 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1130 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1131 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1132 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1133 } 1134 if (j==1){ 1135 tmp0 = tmps[*vi--]; 1136 sum1 -= *v1-- * tmp0; 1137 sum2 -= *v2-- * tmp0; 1138 sum3 -= *v3-- * tmp0; 1139 sum4 -= *v4-- * tmp0; 1140 sum5 -= *v5-- * tmp0; 1141 } 1142 1143 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1144 sum2 -= *v2-- * tmp0; 1145 sum3 -= *v3-- * tmp0; 1146 sum4 -= *v4-- * tmp0; 1147 sum5 -= *v5-- * tmp0; 1148 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1149 sum3 -= *v3-- * tmp0; 1150 sum4 -= *v4-- * tmp0; 1151 sum5 -= *v5-- * tmp0; 1152 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1153 sum4 -= *v4-- * tmp0; 1154 sum5 -= *v5-- * tmp0; 1155 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1156 sum5 -= *v5-- * tmp0; 1157 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1158 break; 1159 default: 1160 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1161 } 1162 } 1163 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1164 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1165 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1166 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1167 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1168 PetscFunctionReturn(0); 1169 } 1170 1171 /* ----------------------------------------------------------- */ 1172 #undef __FUNCT__ 1173 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_newdatastruct" 1174 PetscErrorCode MatSolve_SeqAIJ_Inode_newdatastruct(Mat A,Vec bb,Vec xx) 1175 { 1176 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1177 IS iscol = a->col,isrow = a->row; 1178 PetscErrorCode ierr; 1179 const PetscInt *r,*c,*rout,*cout; 1180 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 1181 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 1182 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 1183 PetscScalar sum1,sum2,sum3,sum4,sum5; 1184 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 1185 const PetscScalar *b; 1186 1187 PetscFunctionBegin; 1188 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 1189 node_max = a->inode.node_count; 1190 ns = a->inode.size; /* Node Size array */ 1191 1192 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1193 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1194 tmp = a->solve_work; 1195 1196 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1197 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1198 1199 /* forward solve the lower triangular */ 1200 tmps = tmp ; 1201 aa = a_a ; 1202 aj = a_j ; 1203 ad = a->diag; 1204 1205 for (i = 0,row = 0; i< node_max; ++i){ 1206 nsz = ns[i]; 1207 aii = ai[row]; 1208 v1 = aa + aii; 1209 vi = aj + aii; 1210 nz = ai[row+1]- ai[row]; 1211 1212 switch (nsz){ /* Each loop in 'case' is unrolled */ 1213 case 1 : 1214 sum1 = b[r[row]]; 1215 for(j=0; j<nz-1; j+=2){ 1216 i0 = vi[j]; 1217 i1 = vi[j+1]; 1218 tmp0 = tmps[i0]; 1219 tmp1 = tmps[i1]; 1220 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1; 1221 } 1222 if(j == nz-1){ 1223 tmp0 = tmps[vi[j]]; 1224 sum1 -= v1[j]*tmp0; 1225 } 1226 tmp[row++]=sum1; 1227 break; 1228 case 2: 1229 sum1 = b[r[row]]; 1230 sum2 = b[r[row+1]]; 1231 v2 = aa + ai[row+1]; 1232 1233 for(j=0; j<nz-1; j+=2){ 1234 i0 = vi[j]; 1235 i1 = vi[j+1]; 1236 tmp0 = tmps[i0]; 1237 tmp1 = tmps[i1]; 1238 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1239 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 1240 } 1241 if(j == nz-1){ 1242 tmp0 = tmps[vi[j]]; 1243 sum1 -= v1[j] *tmp0; 1244 sum2 -= v2[j] *tmp0; 1245 } 1246 sum2 -= v2[nz] * sum1; 1247 tmp[row ++]=sum1; 1248 tmp[row ++]=sum2; 1249 break; 1250 case 3: 1251 sum1 = b[r[row]]; 1252 sum2 = b[r[row+1]]; 1253 sum3 = b[r[row+2]]; 1254 v2 = aa + ai[row+1]; 1255 v3 = aa + ai[row+2]; 1256 1257 for (j=0; j<nz-1; j+=2){ 1258 i0 = vi[j]; 1259 i1 = vi[j+1]; 1260 tmp0 = tmps[i0]; 1261 tmp1 = tmps[i1]; 1262 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1263 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 1264 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 1265 } 1266 if (j == nz-1){ 1267 tmp0 = tmps[vi[j]]; 1268 sum1 -= v1[j] *tmp0; 1269 sum2 -= v2[j] *tmp0; 1270 sum3 -= v3[j] *tmp0; 1271 } 1272 sum2 -= v2[nz] * sum1; 1273 sum3 -= v3[nz] * sum1; 1274 sum3 -= v3[nz+1] * sum2; 1275 tmp[row ++]=sum1; 1276 tmp[row ++]=sum2; 1277 tmp[row ++]=sum3; 1278 break; 1279 1280 case 4: 1281 sum1 = b[r[row]]; 1282 sum2 = b[r[row+1]]; 1283 sum3 = b[r[row+2]]; 1284 sum4 = b[r[row+3]]; 1285 v2 = aa + ai[row+1]; 1286 v3 = aa + ai[row+2]; 1287 v4 = aa + ai[row+3]; 1288 1289 for (j=0; j<nz-1; j+=2){ 1290 i0 = vi[j]; 1291 i1 = vi[j+1]; 1292 tmp0 = tmps[i0]; 1293 tmp1 = tmps[i1]; 1294 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1295 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 1296 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 1297 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 1298 } 1299 if (j == nz-1){ 1300 tmp0 = tmps[vi[j]]; 1301 sum1 -= v1[j] *tmp0; 1302 sum2 -= v2[j] *tmp0; 1303 sum3 -= v3[j] *tmp0; 1304 sum4 -= v4[j] *tmp0; 1305 } 1306 sum2 -= v2[nz] * sum1; 1307 sum3 -= v3[nz] * sum1; 1308 sum4 -= v4[nz] * sum1; 1309 sum3 -= v3[nz+1] * sum2; 1310 sum4 -= v4[nz+1] * sum2; 1311 sum4 -= v4[nz+2] * sum3; 1312 1313 tmp[row ++]=sum1; 1314 tmp[row ++]=sum2; 1315 tmp[row ++]=sum3; 1316 tmp[row ++]=sum4; 1317 break; 1318 case 5: 1319 sum1 = b[r[row]]; 1320 sum2 = b[r[row+1]]; 1321 sum3 = b[r[row+2]]; 1322 sum4 = b[r[row+3]]; 1323 sum5 = b[r[row+4]]; 1324 v2 = aa + ai[row+1]; 1325 v3 = aa + ai[row+2]; 1326 v4 = aa + ai[row+3]; 1327 v5 = aa + ai[row+4]; 1328 1329 for (j=0; j<nz-1; j+=2){ 1330 i0 = vi[j]; 1331 i1 = vi[j+1]; 1332 tmp0 = tmps[i0]; 1333 tmp1 = tmps[i1]; 1334 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1335 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 1336 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 1337 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 1338 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1; 1339 } 1340 if (j == nz-1){ 1341 tmp0 = tmps[vi[j]]; 1342 sum1 -= v1[j] *tmp0; 1343 sum2 -= v2[j] *tmp0; 1344 sum3 -= v3[j] *tmp0; 1345 sum4 -= v4[j] *tmp0; 1346 sum5 -= v5[j] *tmp0; 1347 } 1348 1349 sum2 -= v2[nz] * sum1; 1350 sum3 -= v3[nz] * sum1; 1351 sum4 -= v4[nz] * sum1; 1352 sum5 -= v5[nz] * sum1; 1353 sum3 -= v3[nz+1] * sum2; 1354 sum4 -= v4[nz+1] * sum2; 1355 sum5 -= v5[nz+1] * sum2; 1356 sum4 -= v4[nz+2] * sum3; 1357 sum5 -= v5[nz+2] * sum3; 1358 sum5 -= v5[nz+3] * sum4; 1359 1360 tmp[row ++]=sum1; 1361 tmp[row ++]=sum2; 1362 tmp[row ++]=sum3; 1363 tmp[row ++]=sum4; 1364 tmp[row ++]=sum5; 1365 break; 1366 default: 1367 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1368 } 1369 } 1370 /* backward solve the upper triangular */ 1371 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 1372 nsz = ns[i]; 1373 aii = ad[row+1] + 1; 1374 v1 = aa + aii; 1375 vi = aj + aii; 1376 nz = ad[row]- ad[row+1] - 1; 1377 switch (nsz){ /* Each loop in 'case' is unrolled */ 1378 case 1 : 1379 sum1 = tmp[row]; 1380 1381 for(j=0 ; j<nz-1; j+=2){ 1382 i0 = vi[j]; 1383 i1 = vi[j+1]; 1384 tmp0 = tmps[i0]; 1385 tmp1 = tmps[i1]; 1386 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1387 } 1388 if (j == nz-1){ 1389 tmp0 = tmps[vi[j]]; 1390 sum1 -= v1[j]*tmp0; 1391 } 1392 x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 1393 break; 1394 case 2 : 1395 sum1 = tmp[row]; 1396 sum2 = tmp[row-1]; 1397 v2 = aa + ad[row] + 1; 1398 for (j=0 ; j<nz-1; j+=2){ 1399 i0 = vi[j]; 1400 i1 = vi[j+1]; 1401 tmp0 = tmps[i0]; 1402 tmp1 = tmps[i1]; 1403 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1404 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 1405 } 1406 if (j == nz-1){ 1407 tmp0 = tmps[vi[j]]; 1408 sum1 -= v1[j]* tmp0; 1409 sum2 -= v2[j+1]* tmp0; 1410 } 1411 1412 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 1413 sum2 -= v2[0] * tmp0; 1414 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 1415 break; 1416 case 3 : 1417 sum1 = tmp[row]; 1418 sum2 = tmp[row -1]; 1419 sum3 = tmp[row -2]; 1420 v2 = aa + ad[row] + 1; 1421 v3 = aa + ad[row -1] + 1; 1422 for (j=0 ; j<nz-1; j+=2){ 1423 i0 = vi[j]; 1424 i1 = vi[j+1]; 1425 tmp0 = tmps[i0]; 1426 tmp1 = tmps[i1]; 1427 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1428 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 1429 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 1430 } 1431 if (j== nz-1){ 1432 tmp0 = tmps[vi[j]]; 1433 sum1 -= v1[j] * tmp0; 1434 sum2 -= v2[j+1] * tmp0; 1435 sum3 -= v3[j+2] * tmp0; 1436 } 1437 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 1438 sum2 -= v2[0]* tmp0; 1439 sum3 -= v3[1] * tmp0; 1440 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 1441 sum3 -= v3[0]* tmp0; 1442 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 1443 1444 break; 1445 case 4 : 1446 sum1 = tmp[row]; 1447 sum2 = tmp[row -1]; 1448 sum3 = tmp[row -2]; 1449 sum4 = tmp[row -3]; 1450 v2 = aa + ad[row]+1; 1451 v3 = aa + ad[row -1]+1; 1452 v4 = aa + ad[row -2]+1; 1453 1454 for (j=0 ; j<nz-1; j+=2){ 1455 i0 = vi[j]; 1456 i1 = vi[j+1]; 1457 tmp0 = tmps[i0]; 1458 tmp1 = tmps[i1]; 1459 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1460 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 1461 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 1462 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 1463 } 1464 if (j== nz-1){ 1465 tmp0 = tmps[vi[j]]; 1466 sum1 -= v1[j] * tmp0; 1467 sum2 -= v2[j+1] * tmp0; 1468 sum3 -= v3[j+2] * tmp0; 1469 sum4 -= v4[j+3] * tmp0; 1470 } 1471 1472 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 1473 sum2 -= v2[0] * tmp0; 1474 sum3 -= v3[1] * tmp0; 1475 sum4 -= v4[2] * tmp0; 1476 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 1477 sum3 -= v3[0] * tmp0; 1478 sum4 -= v4[1] * tmp0; 1479 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 1480 sum4 -= v4[0] * tmp0; 1481 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 1482 break; 1483 case 5 : 1484 sum1 = tmp[row]; 1485 sum2 = tmp[row -1]; 1486 sum3 = tmp[row -2]; 1487 sum4 = tmp[row -3]; 1488 sum5 = tmp[row -4]; 1489 v2 = aa + ad[row]+1; 1490 v3 = aa + ad[row -1]+1; 1491 v4 = aa + ad[row -2]+1; 1492 v5 = aa + ad[row -3]+1; 1493 for (j=0 ; j<nz-1; j+=2){ 1494 i0 = vi[j]; 1495 i1 = vi[j+1]; 1496 tmp0 = tmps[i0]; 1497 tmp1 = tmps[i1]; 1498 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 1499 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 1500 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 1501 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 1502 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1; 1503 } 1504 if (j==nz-1){ 1505 tmp0 = tmps[vi[j]]; 1506 sum1 -= v1[j] * tmp0; 1507 sum2 -= v2[j+1] * tmp0; 1508 sum3 -= v3[j+2] * tmp0; 1509 sum4 -= v4[j+3] * tmp0; 1510 sum5 -= v5[j+4] * tmp0; 1511 } 1512 1513 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 1514 sum2 -= v2[0] * tmp0; 1515 sum3 -= v3[1] * tmp0; 1516 sum4 -= v4[2] * tmp0; 1517 sum5 -= v5[3] * tmp0; 1518 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 1519 sum3 -= v3[0] * tmp0; 1520 sum4 -= v4[1] * tmp0; 1521 sum5 -= v5[2] * tmp0; 1522 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 1523 sum4 -= v4[0] * tmp0; 1524 sum5 -= v5[1] * tmp0; 1525 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 1526 sum5 -= v5[0] * tmp0; 1527 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--; 1528 break; 1529 default: 1530 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1531 } 1532 } 1533 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1534 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1535 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1536 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1537 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1538 PetscFunctionReturn(0); 1539 } 1540 1541 1542 /* 1543 Makes a longer coloring[] array and calls the usual code with that 1544 */ 1545 #undef __FUNCT__ 1546 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode" 1547 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 1548 { 1549 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 1550 PetscErrorCode ierr; 1551 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 1552 PetscInt *colorused,i; 1553 ISColoringValue *newcolor; 1554 1555 PetscFunctionBegin; 1556 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr); 1557 /* loop over inodes, marking a color for each column*/ 1558 row = 0; 1559 for (i=0; i<m; i++){ 1560 for (j=0; j<ns[i]; j++) { 1561 newcolor[row++] = coloring[i] + j*ncolors; 1562 } 1563 } 1564 1565 /* eliminate unneeded colors */ 1566 ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr); 1567 ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr); 1568 for (i=0; i<n; i++) { 1569 colorused[newcolor[i]] = 1; 1570 } 1571 1572 for (i=1; i<5*ncolors; i++) { 1573 colorused[i] += colorused[i-1]; 1574 } 1575 ncolors = colorused[5*ncolors-1]; 1576 for (i=0; i<n; i++) { 1577 newcolor[i] = colorused[newcolor[i]]-1; 1578 } 1579 ierr = PetscFree(colorused);CHKERRQ(ierr); 1580 ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); 1581 ierr = PetscFree(coloring);CHKERRQ(ierr); 1582 PetscFunctionReturn(0); 1583 } 1584 1585 #include "../src/mat/blockinvert.h" 1586 1587 #undef __FUNCT__ 1588 #define __FUNCT__ "MatSOR_SeqAIJ_Inode" 1589 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 1590 { 1591 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1592 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3; 1593 MatScalar *ibdiag,*bdiag; 1594 PetscScalar *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5; 1595 const MatScalar *v = a->a,*v1,*v2,*v3,*v4,*v5; 1596 PetscReal zeropivot = 1.0e-15, shift = 0.0; 1597 PetscErrorCode ierr; 1598 PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2; 1599 PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k; 1600 1601 PetscFunctionBegin; 1602 if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 1603 if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 1604 if (its > 1) { 1605 /* switch to non-inode version */ 1606 ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr); 1607 PetscFunctionReturn(0); 1608 } 1609 1610 if (!a->inode.ibdiagvalid) { 1611 if (!a->inode.ibdiag) { 1612 /* calculate space needed for diagonal blocks */ 1613 for (i=0; i<m; i++) { 1614 cnt += sizes[i]*sizes[i]; 1615 } 1616 a->inode.bdiagsize = cnt; 1617 ierr = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr); 1618 } 1619 1620 /* copy over the diagonal blocks and invert them */ 1621 ibdiag = a->inode.ibdiag; 1622 bdiag = a->inode.bdiag; 1623 cnt = 0; 1624 for (i=0, row = 0; i<m; i++) { 1625 for (j=0; j<sizes[i]; j++) { 1626 for (k=0; k<sizes[i]; k++) { 1627 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 1628 } 1629 } 1630 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 1631 1632 switch(sizes[i]) { 1633 case 1: 1634 /* Create matrix data structure */ 1635 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 1636 ibdiag[cnt] = 1.0/ibdiag[cnt]; 1637 break; 1638 case 2: 1639 ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr); 1640 break; 1641 case 3: 1642 ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr); 1643 break; 1644 case 4: 1645 ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr); 1646 break; 1647 case 5: 1648 ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,shift);CHKERRQ(ierr); 1649 break; 1650 default: 1651 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1652 } 1653 cnt += sizes[i]*sizes[i]; 1654 row += sizes[i]; 1655 } 1656 a->inode.ibdiagvalid = PETSC_TRUE; 1657 } 1658 ibdiag = a->inode.ibdiag; 1659 bdiag = a->inode.bdiag; 1660 1661 1662 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1663 if (flag & SOR_ZERO_INITIAL_GUESS) { 1664 PetscScalar *b; 1665 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1666 if (xx != bb) { 1667 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1668 } else { 1669 b = x; 1670 } 1671 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 1672 1673 for (i=0, row=0; i<m; i++) { 1674 sz = diag[row] - ii[row]; 1675 v1 = a->a + ii[row]; 1676 idx = a->j + ii[row]; 1677 1678 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 1679 switch (sizes[i]){ 1680 case 1: 1681 1682 sum1 = b[row]; 1683 for(n = 0; n<sz-1; n+=2) { 1684 i1 = idx[0]; 1685 i2 = idx[1]; 1686 idx += 2; 1687 tmp0 = x[i1]; 1688 tmp1 = x[i2]; 1689 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1690 } 1691 1692 if (n == sz-1){ 1693 tmp0 = x[*idx]; 1694 sum1 -= *v1 * tmp0; 1695 } 1696 x[row++] = sum1*(*ibdiag++); 1697 break; 1698 case 2: 1699 v2 = a->a + ii[row+1]; 1700 sum1 = b[row]; 1701 sum2 = b[row+1]; 1702 for(n = 0; n<sz-1; n+=2) { 1703 i1 = idx[0]; 1704 i2 = idx[1]; 1705 idx += 2; 1706 tmp0 = x[i1]; 1707 tmp1 = x[i2]; 1708 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1709 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1710 } 1711 1712 if (n == sz-1){ 1713 tmp0 = x[*idx]; 1714 sum1 -= v1[0] * tmp0; 1715 sum2 -= v2[0] * tmp0; 1716 } 1717 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 1718 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 1719 ibdiag += 4; 1720 break; 1721 case 3: 1722 v2 = a->a + ii[row+1]; 1723 v3 = a->a + ii[row+2]; 1724 sum1 = b[row]; 1725 sum2 = b[row+1]; 1726 sum3 = b[row+2]; 1727 for(n = 0; n<sz-1; n+=2) { 1728 i1 = idx[0]; 1729 i2 = idx[1]; 1730 idx += 2; 1731 tmp0 = x[i1]; 1732 tmp1 = x[i2]; 1733 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1734 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1735 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1736 } 1737 1738 if (n == sz-1){ 1739 tmp0 = x[*idx]; 1740 sum1 -= v1[0] * tmp0; 1741 sum2 -= v2[0] * tmp0; 1742 sum3 -= v3[0] * tmp0; 1743 } 1744 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 1745 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 1746 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 1747 ibdiag += 9; 1748 break; 1749 case 4: 1750 v2 = a->a + ii[row+1]; 1751 v3 = a->a + ii[row+2]; 1752 v4 = a->a + ii[row+3]; 1753 sum1 = b[row]; 1754 sum2 = b[row+1]; 1755 sum3 = b[row+2]; 1756 sum4 = b[row+3]; 1757 for(n = 0; n<sz-1; n+=2) { 1758 i1 = idx[0]; 1759 i2 = idx[1]; 1760 idx += 2; 1761 tmp0 = x[i1]; 1762 tmp1 = x[i2]; 1763 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1764 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1765 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1766 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1767 } 1768 1769 if (n == sz-1){ 1770 tmp0 = x[*idx]; 1771 sum1 -= v1[0] * tmp0; 1772 sum2 -= v2[0] * tmp0; 1773 sum3 -= v3[0] * tmp0; 1774 sum4 -= v4[0] * tmp0; 1775 } 1776 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 1777 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 1778 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 1779 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 1780 ibdiag += 16; 1781 break; 1782 case 5: 1783 v2 = a->a + ii[row+1]; 1784 v3 = a->a + ii[row+2]; 1785 v4 = a->a + ii[row+3]; 1786 v5 = a->a + ii[row+4]; 1787 sum1 = b[row]; 1788 sum2 = b[row+1]; 1789 sum3 = b[row+2]; 1790 sum4 = b[row+3]; 1791 sum5 = b[row+4]; 1792 for(n = 0; n<sz-1; n+=2) { 1793 i1 = idx[0]; 1794 i2 = idx[1]; 1795 idx += 2; 1796 tmp0 = x[i1]; 1797 tmp1 = x[i2]; 1798 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1799 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1800 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1801 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1802 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 1803 } 1804 1805 if (n == sz-1){ 1806 tmp0 = x[*idx]; 1807 sum1 -= v1[0] * tmp0; 1808 sum2 -= v2[0] * tmp0; 1809 sum3 -= v3[0] * tmp0; 1810 sum4 -= v4[0] * tmp0; 1811 sum5 -= v5[0] * tmp0; 1812 } 1813 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 1814 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 1815 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 1816 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 1817 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 1818 ibdiag += 25; 1819 break; 1820 default: 1821 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1822 } 1823 } 1824 1825 xb = x; 1826 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 1827 } else xb = b; 1828 if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) && 1829 (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1830 cnt = 0; 1831 for (i=0, row=0; i<m; i++) { 1832 1833 switch (sizes[i]){ 1834 case 1: 1835 x[row++] *= bdiag[cnt++]; 1836 break; 1837 case 2: 1838 x1 = x[row]; x2 = x[row+1]; 1839 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 1840 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 1841 x[row++] = tmp1; 1842 x[row++] = tmp2; 1843 cnt += 4; 1844 break; 1845 case 3: 1846 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 1847 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 1848 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 1849 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 1850 x[row++] = tmp1; 1851 x[row++] = tmp2; 1852 x[row++] = tmp3; 1853 cnt += 9; 1854 break; 1855 case 4: 1856 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 1857 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 1858 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 1859 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 1860 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 1861 x[row++] = tmp1; 1862 x[row++] = tmp2; 1863 x[row++] = tmp3; 1864 x[row++] = tmp4; 1865 cnt += 16; 1866 break; 1867 case 5: 1868 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 1869 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 1870 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 1871 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 1872 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 1873 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 1874 x[row++] = tmp1; 1875 x[row++] = tmp2; 1876 x[row++] = tmp3; 1877 x[row++] = tmp4; 1878 x[row++] = tmp5; 1879 cnt += 25; 1880 break; 1881 default: 1882 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1883 } 1884 } 1885 ierr = PetscLogFlops(m);CHKERRQ(ierr); 1886 } 1887 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 1888 1889 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 1890 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 1891 ibdiag -= sizes[i]*sizes[i]; 1892 sz = ii[row+1] - diag[row] - 1; 1893 v1 = a->a + diag[row] + 1; 1894 idx = a->j + diag[row] + 1; 1895 1896 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 1897 switch (sizes[i]){ 1898 case 1: 1899 1900 sum1 = xb[row]; 1901 for(n = 0; n<sz-1; n+=2) { 1902 i1 = idx[0]; 1903 i2 = idx[1]; 1904 idx += 2; 1905 tmp0 = x[i1]; 1906 tmp1 = x[i2]; 1907 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1908 } 1909 1910 if (n == sz-1){ 1911 tmp0 = x[*idx]; 1912 sum1 -= *v1*tmp0; 1913 } 1914 x[row--] = sum1*(*ibdiag); 1915 break; 1916 1917 case 2: 1918 1919 sum1 = xb[row]; 1920 sum2 = xb[row-1]; 1921 /* note that sum1 is associated with the second of the two rows */ 1922 v2 = a->a + diag[row-1] + 2; 1923 for(n = 0; n<sz-1; n+=2) { 1924 i1 = idx[0]; 1925 i2 = idx[1]; 1926 idx += 2; 1927 tmp0 = x[i1]; 1928 tmp1 = x[i2]; 1929 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1930 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1931 } 1932 1933 if (n == sz-1){ 1934 tmp0 = x[*idx]; 1935 sum1 -= *v1*tmp0; 1936 sum2 -= *v2*tmp0; 1937 } 1938 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 1939 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 1940 break; 1941 case 3: 1942 1943 sum1 = xb[row]; 1944 sum2 = xb[row-1]; 1945 sum3 = xb[row-2]; 1946 v2 = a->a + diag[row-1] + 2; 1947 v3 = a->a + diag[row-2] + 3; 1948 for(n = 0; n<sz-1; n+=2) { 1949 i1 = idx[0]; 1950 i2 = idx[1]; 1951 idx += 2; 1952 tmp0 = x[i1]; 1953 tmp1 = x[i2]; 1954 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1955 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1956 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1957 } 1958 1959 if (n == sz-1){ 1960 tmp0 = x[*idx]; 1961 sum1 -= *v1*tmp0; 1962 sum2 -= *v2*tmp0; 1963 sum3 -= *v3*tmp0; 1964 } 1965 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 1966 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 1967 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 1968 break; 1969 case 4: 1970 1971 sum1 = xb[row]; 1972 sum2 = xb[row-1]; 1973 sum3 = xb[row-2]; 1974 sum4 = xb[row-3]; 1975 v2 = a->a + diag[row-1] + 2; 1976 v3 = a->a + diag[row-2] + 3; 1977 v4 = a->a + diag[row-3] + 4; 1978 for(n = 0; n<sz-1; n+=2) { 1979 i1 = idx[0]; 1980 i2 = idx[1]; 1981 idx += 2; 1982 tmp0 = x[i1]; 1983 tmp1 = x[i2]; 1984 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1985 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1986 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1987 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1988 } 1989 1990 if (n == sz-1){ 1991 tmp0 = x[*idx]; 1992 sum1 -= *v1*tmp0; 1993 sum2 -= *v2*tmp0; 1994 sum3 -= *v3*tmp0; 1995 sum4 -= *v4*tmp0; 1996 } 1997 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 1998 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 1999 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 2000 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 2001 break; 2002 case 5: 2003 2004 sum1 = xb[row]; 2005 sum2 = xb[row-1]; 2006 sum3 = xb[row-2]; 2007 sum4 = xb[row-3]; 2008 sum5 = xb[row-4]; 2009 v2 = a->a + diag[row-1] + 2; 2010 v3 = a->a + diag[row-2] + 3; 2011 v4 = a->a + diag[row-3] + 4; 2012 v5 = a->a + diag[row-4] + 5; 2013 for(n = 0; n<sz-1; n+=2) { 2014 i1 = idx[0]; 2015 i2 = idx[1]; 2016 idx += 2; 2017 tmp0 = x[i1]; 2018 tmp1 = x[i2]; 2019 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2020 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2021 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2022 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2023 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2024 } 2025 2026 if (n == sz-1){ 2027 tmp0 = x[*idx]; 2028 sum1 -= *v1*tmp0; 2029 sum2 -= *v2*tmp0; 2030 sum3 -= *v3*tmp0; 2031 sum4 -= *v4*tmp0; 2032 sum5 -= *v5*tmp0; 2033 } 2034 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 2035 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 2036 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 2037 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 2038 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 2039 break; 2040 default: 2041 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2042 } 2043 } 2044 2045 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2046 } 2047 its--; 2048 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2049 if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);} 2050 } 2051 if (flag & SOR_EISENSTAT) { 2052 const PetscScalar *b; 2053 MatScalar *t = a->inode.ssor_work; 2054 2055 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2056 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2057 /* 2058 Apply (U + D)^-1 where D is now the block diagonal 2059 */ 2060 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 2061 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 2062 ibdiag -= sizes[i]*sizes[i]; 2063 sz = ii[row+1] - diag[row] - 1; 2064 v1 = a->a + diag[row] + 1; 2065 idx = a->j + diag[row] + 1; 2066 CHKMEMQ; 2067 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2068 switch (sizes[i]){ 2069 case 1: 2070 2071 sum1 = b[row]; 2072 for(n = 0; n<sz-1; n+=2) { 2073 i1 = idx[0]; 2074 i2 = idx[1]; 2075 idx += 2; 2076 tmp0 = x[i1]; 2077 tmp1 = x[i2]; 2078 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2079 } 2080 2081 if (n == sz-1){ 2082 tmp0 = x[*idx]; 2083 sum1 -= *v1*tmp0; 2084 } 2085 x[row] = sum1*(*ibdiag);row--; 2086 break; 2087 2088 case 2: 2089 2090 sum1 = b[row]; 2091 sum2 = b[row-1]; 2092 /* note that sum1 is associated with the second of the two rows */ 2093 v2 = a->a + diag[row-1] + 2; 2094 for(n = 0; n<sz-1; n+=2) { 2095 i1 = idx[0]; 2096 i2 = idx[1]; 2097 idx += 2; 2098 tmp0 = x[i1]; 2099 tmp1 = x[i2]; 2100 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2101 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2102 } 2103 2104 if (n == sz-1){ 2105 tmp0 = x[*idx]; 2106 sum1 -= *v1*tmp0; 2107 sum2 -= *v2*tmp0; 2108 } 2109 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 2110 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 2111 row -= 2; 2112 break; 2113 case 3: 2114 2115 sum1 = b[row]; 2116 sum2 = b[row-1]; 2117 sum3 = b[row-2]; 2118 v2 = a->a + diag[row-1] + 2; 2119 v3 = a->a + diag[row-2] + 3; 2120 for(n = 0; n<sz-1; n+=2) { 2121 i1 = idx[0]; 2122 i2 = idx[1]; 2123 idx += 2; 2124 tmp0 = x[i1]; 2125 tmp1 = x[i2]; 2126 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2127 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2128 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2129 } 2130 2131 if (n == sz-1){ 2132 tmp0 = x[*idx]; 2133 sum1 -= *v1*tmp0; 2134 sum2 -= *v2*tmp0; 2135 sum3 -= *v3*tmp0; 2136 } 2137 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 2138 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 2139 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 2140 row -= 3; 2141 break; 2142 case 4: 2143 2144 sum1 = b[row]; 2145 sum2 = b[row-1]; 2146 sum3 = b[row-2]; 2147 sum4 = b[row-3]; 2148 v2 = a->a + diag[row-1] + 2; 2149 v3 = a->a + diag[row-2] + 3; 2150 v4 = a->a + diag[row-3] + 4; 2151 for(n = 0; n<sz-1; n+=2) { 2152 i1 = idx[0]; 2153 i2 = idx[1]; 2154 idx += 2; 2155 tmp0 = x[i1]; 2156 tmp1 = x[i2]; 2157 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2158 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2159 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2160 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2161 } 2162 2163 if (n == sz-1){ 2164 tmp0 = x[*idx]; 2165 sum1 -= *v1*tmp0; 2166 sum2 -= *v2*tmp0; 2167 sum3 -= *v3*tmp0; 2168 sum4 -= *v4*tmp0; 2169 } 2170 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 2171 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 2172 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 2173 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 2174 row -= 4; 2175 break; 2176 case 5: 2177 2178 sum1 = b[row]; 2179 sum2 = b[row-1]; 2180 sum3 = b[row-2]; 2181 sum4 = b[row-3]; 2182 sum5 = b[row-4]; 2183 v2 = a->a + diag[row-1] + 2; 2184 v3 = a->a + diag[row-2] + 3; 2185 v4 = a->a + diag[row-3] + 4; 2186 v5 = a->a + diag[row-4] + 5; 2187 for(n = 0; n<sz-1; n+=2) { 2188 i1 = idx[0]; 2189 i2 = idx[1]; 2190 idx += 2; 2191 tmp0 = x[i1]; 2192 tmp1 = x[i2]; 2193 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2194 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2195 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2196 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2197 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2198 } 2199 2200 if (n == sz-1){ 2201 tmp0 = x[*idx]; 2202 sum1 -= *v1*tmp0; 2203 sum2 -= *v2*tmp0; 2204 sum3 -= *v3*tmp0; 2205 sum4 -= *v4*tmp0; 2206 sum5 -= *v5*tmp0; 2207 } 2208 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 2209 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 2210 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 2211 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 2212 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 2213 row -= 5; 2214 break; 2215 default: 2216 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2217 } 2218 CHKMEMQ; 2219 } 2220 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2221 2222 /* 2223 t = b - D x where D is the block diagonal 2224 */ 2225 cnt = 0; 2226 for (i=0, row=0; i<m; i++) { 2227 CHKMEMQ; 2228 switch (sizes[i]){ 2229 case 1: 2230 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 2231 break; 2232 case 2: 2233 x1 = x[row]; x2 = x[row+1]; 2234 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 2235 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 2236 t[row] = b[row] - tmp1; 2237 t[row+1] = b[row+1] - tmp2; row += 2; 2238 cnt += 4; 2239 break; 2240 case 3: 2241 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 2242 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 2243 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 2244 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 2245 t[row] = b[row] - tmp1; 2246 t[row+1] = b[row+1] - tmp2; 2247 t[row+2] = b[row+2] - tmp3; row += 3; 2248 cnt += 9; 2249 break; 2250 case 4: 2251 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 2252 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 2253 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 2254 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 2255 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 2256 t[row] = b[row] - tmp1; 2257 t[row+1] = b[row+1] - tmp2; 2258 t[row+2] = b[row+2] - tmp3; 2259 t[row+3] = b[row+3] - tmp4; row += 4; 2260 cnt += 16; 2261 break; 2262 case 5: 2263 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 2264 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 2265 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 2266 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 2267 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 2268 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 2269 t[row] = b[row] - tmp1; 2270 t[row+1] = b[row+1] - tmp2; 2271 t[row+2] = b[row+2] - tmp3; 2272 t[row+3] = b[row+3] - tmp4; 2273 t[row+4] = b[row+4] - tmp5;row += 5; 2274 cnt += 25; 2275 break; 2276 default: 2277 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2278 } 2279 CHKMEMQ; 2280 } 2281 ierr = PetscLogFlops(m);CHKERRQ(ierr); 2282 2283 2284 2285 /* 2286 Apply (L + D)^-1 where D is the block diagonal 2287 */ 2288 for (i=0, row=0; i<m; i++) { 2289 sz = diag[row] - ii[row]; 2290 v1 = a->a + ii[row]; 2291 idx = a->j + ii[row]; 2292 CHKMEMQ; 2293 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2294 switch (sizes[i]){ 2295 case 1: 2296 2297 sum1 = t[row]; 2298 for(n = 0; n<sz-1; n+=2) { 2299 i1 = idx[0]; 2300 i2 = idx[1]; 2301 idx += 2; 2302 tmp0 = t[i1]; 2303 tmp1 = t[i2]; 2304 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2305 } 2306 2307 if (n == sz-1){ 2308 tmp0 = t[*idx]; 2309 sum1 -= *v1 * tmp0; 2310 } 2311 x[row] += t[row] = sum1*(*ibdiag++); row++; 2312 break; 2313 case 2: 2314 v2 = a->a + ii[row+1]; 2315 sum1 = t[row]; 2316 sum2 = t[row+1]; 2317 for(n = 0; n<sz-1; n+=2) { 2318 i1 = idx[0]; 2319 i2 = idx[1]; 2320 idx += 2; 2321 tmp0 = t[i1]; 2322 tmp1 = t[i2]; 2323 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2324 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2325 } 2326 2327 if (n == sz-1){ 2328 tmp0 = t[*idx]; 2329 sum1 -= v1[0] * tmp0; 2330 sum2 -= v2[0] * tmp0; 2331 } 2332 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2333 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2334 ibdiag += 4; row += 2; 2335 break; 2336 case 3: 2337 v2 = a->a + ii[row+1]; 2338 v3 = a->a + ii[row+2]; 2339 sum1 = t[row]; 2340 sum2 = t[row+1]; 2341 sum3 = t[row+2]; 2342 for(n = 0; n<sz-1; n+=2) { 2343 i1 = idx[0]; 2344 i2 = idx[1]; 2345 idx += 2; 2346 tmp0 = t[i1]; 2347 tmp1 = t[i2]; 2348 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2349 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2350 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2351 } 2352 2353 if (n == sz-1){ 2354 tmp0 = t[*idx]; 2355 sum1 -= v1[0] * tmp0; 2356 sum2 -= v2[0] * tmp0; 2357 sum3 -= v3[0] * tmp0; 2358 } 2359 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2360 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2361 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2362 ibdiag += 9; row += 3; 2363 break; 2364 case 4: 2365 v2 = a->a + ii[row+1]; 2366 v3 = a->a + ii[row+2]; 2367 v4 = a->a + ii[row+3]; 2368 sum1 = t[row]; 2369 sum2 = t[row+1]; 2370 sum3 = t[row+2]; 2371 sum4 = t[row+3]; 2372 for(n = 0; n<sz-1; n+=2) { 2373 i1 = idx[0]; 2374 i2 = idx[1]; 2375 idx += 2; 2376 tmp0 = t[i1]; 2377 tmp1 = t[i2]; 2378 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2379 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2380 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2381 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2382 } 2383 2384 if (n == sz-1){ 2385 tmp0 = t[*idx]; 2386 sum1 -= v1[0] * tmp0; 2387 sum2 -= v2[0] * tmp0; 2388 sum3 -= v3[0] * tmp0; 2389 sum4 -= v4[0] * tmp0; 2390 } 2391 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2392 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2393 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2394 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2395 ibdiag += 16; row += 4; 2396 break; 2397 case 5: 2398 v2 = a->a + ii[row+1]; 2399 v3 = a->a + ii[row+2]; 2400 v4 = a->a + ii[row+3]; 2401 v5 = a->a + ii[row+4]; 2402 sum1 = t[row]; 2403 sum2 = t[row+1]; 2404 sum3 = t[row+2]; 2405 sum4 = t[row+3]; 2406 sum5 = t[row+4]; 2407 for(n = 0; n<sz-1; n+=2) { 2408 i1 = idx[0]; 2409 i2 = idx[1]; 2410 idx += 2; 2411 tmp0 = t[i1]; 2412 tmp1 = t[i2]; 2413 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2414 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2415 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2416 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2417 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2418 } 2419 2420 if (n == sz-1){ 2421 tmp0 = t[*idx]; 2422 sum1 -= v1[0] * tmp0; 2423 sum2 -= v2[0] * tmp0; 2424 sum3 -= v3[0] * tmp0; 2425 sum4 -= v4[0] * tmp0; 2426 sum5 -= v5[0] * tmp0; 2427 } 2428 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2429 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2430 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2431 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2432 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2433 ibdiag += 25; row += 5; 2434 break; 2435 default: 2436 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2437 } 2438 CHKMEMQ; 2439 } 2440 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2441 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2442 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2443 } 2444 PetscFunctionReturn(0); 2445 } 2446 2447 #undef __FUNCT__ 2448 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode" 2449 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 2450 { 2451 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2452 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 2453 const MatScalar *bdiag = a->inode.bdiag; 2454 const PetscScalar *b; 2455 PetscErrorCode ierr; 2456 PetscInt m = a->inode.node_count,cnt = 0,i,row; 2457 const PetscInt *sizes = a->inode.size; 2458 2459 PetscFunctionBegin; 2460 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2461 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2462 cnt = 0; 2463 for (i=0, row=0; i<m; i++) { 2464 switch (sizes[i]){ 2465 case 1: 2466 x[row] = b[row]*bdiag[cnt++];row++; 2467 break; 2468 case 2: 2469 x1 = b[row]; x2 = b[row+1]; 2470 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 2471 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 2472 x[row++] = tmp1; 2473 x[row++] = tmp2; 2474 cnt += 4; 2475 break; 2476 case 3: 2477 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 2478 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 2479 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 2480 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 2481 x[row++] = tmp1; 2482 x[row++] = tmp2; 2483 x[row++] = tmp3; 2484 cnt += 9; 2485 break; 2486 case 4: 2487 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 2488 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 2489 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 2490 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 2491 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 2492 x[row++] = tmp1; 2493 x[row++] = tmp2; 2494 x[row++] = tmp3; 2495 x[row++] = tmp4; 2496 cnt += 16; 2497 break; 2498 case 5: 2499 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 2500 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 2501 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 2502 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 2503 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 2504 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 2505 x[row++] = tmp1; 2506 x[row++] = tmp2; 2507 x[row++] = tmp3; 2508 x[row++] = tmp4; 2509 x[row++] = tmp5; 2510 cnt += 25; 2511 break; 2512 default: 2513 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2514 } 2515 } 2516 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 2517 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2518 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2519 PetscFunctionReturn(0); 2520 } 2521 2522 /* 2523 samestructure indicates that the matrix has not changed its nonzero structure so we 2524 do not need to recompute the inodes 2525 */ 2526 #undef __FUNCT__ 2527 #define __FUNCT__ "Mat_CheckInode" 2528 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure) 2529 { 2530 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2531 PetscErrorCode ierr; 2532 PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size; 2533 PetscTruth flag; 2534 2535 PetscFunctionBegin; 2536 if (!a->inode.use) PetscFunctionReturn(0); 2537 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 2538 2539 2540 m = A->rmap->n; 2541 if (a->inode.size) {ns = a->inode.size;} 2542 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 2543 2544 i = 0; 2545 node_count = 0; 2546 idx = a->j; 2547 ii = a->i; 2548 while (i < m){ /* For each row */ 2549 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 2550 /* Limits the number of elements in a node to 'a->inode.limit' */ 2551 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 2552 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 2553 if (nzy != nzx) break; 2554 idy += nzx; /* Same nonzero pattern */ 2555 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 2556 if (!flag) break; 2557 } 2558 ns[node_count++] = blk_size; 2559 idx += blk_size*nzx; 2560 i = j; 2561 } 2562 /* If not enough inodes found,, do not use inode version of the routines */ 2563 if (!a->inode.size && m && node_count > .9*m) { 2564 ierr = PetscFree(ns);CHKERRQ(ierr); 2565 a->inode.node_count = 0; 2566 a->inode.size = PETSC_NULL; 2567 a->inode.use = PETSC_FALSE; 2568 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 2569 } else { 2570 A->ops->mult = MatMult_SeqAIJ_Inode; 2571 A->ops->sor = MatSOR_SeqAIJ_Inode; 2572 A->ops->multadd = MatMultAdd_SeqAIJ_Inode; 2573 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 2574 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 2575 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 2576 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 2577 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 2578 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 2579 a->inode.node_count = node_count; 2580 a->inode.size = ns; 2581 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 2582 } 2583 PetscFunctionReturn(0); 2584 } 2585 2586 #define MatGetRow_FactoredLU(cols,nzl,nzu,nz,ai,aj,adiag,row) { \ 2587 PetscInt __k, *__vi; \ 2588 __vi = aj + ai[row]; \ 2589 for(__k=0;__k<nzl;__k++) cols[__k] = __vi[__k]; \ 2590 __vi = aj + adiag[row]; \ 2591 cols[nzl] = __vi[0];\ 2592 __vi = aj + adiag[row+1]+1;\ 2593 for(__k=0;__k<nzu;__k++) cols[nzl+1+__k] = __vi[__k];} 2594 2595 #undef __FUNCT__ 2596 #define __FUNCT__ "Mat_CheckInode_FactorLU" 2597 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscTruth samestructure) 2598 { 2599 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2600 PetscErrorCode ierr; 2601 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; 2602 PetscInt *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag; 2603 PetscTruth flag; 2604 2605 PetscFunctionBegin; 2606 if (!a->inode.use) PetscFunctionReturn(0); 2607 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 2608 2609 m = A->rmap->n; 2610 if (a->inode.size) {ns = a->inode.size;} 2611 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 2612 2613 i = 0; 2614 node_count = 0; 2615 while (i < m){ /* For each row */ 2616 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ 2617 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ 2618 nzx = nzl1 + nzu1 + 1; 2619 ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr); 2620 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); 2621 2622 /* Limits the number of elements in a node to 'a->inode.limit' */ 2623 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 2624 nzl2 = ai[j+1] - ai[j]; 2625 nzu2 = adiag[j] - adiag[j+1] - 1; 2626 nzy = nzl2 + nzu2 + 1; 2627 if( nzy != nzx) break; 2628 ierr = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr); 2629 MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j); 2630 ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 2631 if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;} 2632 ierr = PetscFree(cols2);CHKERRQ(ierr); 2633 } 2634 ns[node_count++] = blk_size; 2635 ierr = PetscFree(cols1);CHKERRQ(ierr); 2636 i = j; 2637 } 2638 /* If not enough inodes found,, do not use inode version of the routines */ 2639 if (!a->inode.size && m && node_count > .9*m) { 2640 ierr = PetscFree(ns);CHKERRQ(ierr); 2641 a->inode.node_count = 0; 2642 a->inode.size = PETSC_NULL; 2643 a->inode.use = PETSC_FALSE; 2644 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 2645 } else { 2646 A->ops->mult = 0; 2647 A->ops->sor = 0; 2648 A->ops->multadd = 0; 2649 A->ops->getrowij = 0; 2650 A->ops->restorerowij = 0; 2651 A->ops->getcolumnij = 0; 2652 A->ops->restorecolumnij = 0; 2653 A->ops->coloringpatch = 0; 2654 A->ops->multdiagonalblock = 0; 2655 a->inode.node_count = node_count; 2656 a->inode.size = ns; 2657 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 2658 } 2659 PetscFunctionReturn(0); 2660 } 2661 2662 /* 2663 This is really ugly. if inodes are used this replaces the 2664 permutations with ones that correspond to rows/cols of the matrix 2665 rather then inode blocks 2666 */ 2667 #undef __FUNCT__ 2668 #define __FUNCT__ "MatInodeAdjustForInodes" 2669 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 2670 { 2671 PetscErrorCode ierr,(*f)(Mat,IS*,IS*); 2672 2673 PetscFunctionBegin; 2674 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr); 2675 if (f) { 2676 ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr); 2677 } 2678 PetscFunctionReturn(0); 2679 } 2680 2681 EXTERN_C_BEGIN 2682 #undef __FUNCT__ 2683 #define __FUNCT__ "MatAdjustForInodes_SeqAIJ_Inode" 2684 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 2685 { 2686 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 2687 PetscErrorCode ierr; 2688 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 2689 const PetscInt *ridx,*cidx; 2690 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 2691 PetscInt nslim_col,*ns_col; 2692 IS ris = *rperm,cis = *cperm; 2693 2694 PetscFunctionBegin; 2695 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 2696 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 2697 2698 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 2699 ierr = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 2700 ierr = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr); 2701 2702 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 2703 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 2704 2705 /* Form the inode structure for the rows of permuted matric using inv perm*/ 2706 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 2707 2708 /* Construct the permutations for rows*/ 2709 for (i=0,row = 0; i<nslim_row; ++i){ 2710 indx = ridx[i]; 2711 start_val = tns[indx]; 2712 end_val = tns[indx + 1]; 2713 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 2714 } 2715 2716 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 2717 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 2718 2719 /* Construct permutations for columns */ 2720 for (i=0,col=0; i<nslim_col; ++i){ 2721 indx = cidx[i]; 2722 start_val = tns[indx]; 2723 end_val = tns[indx + 1]; 2724 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 2725 } 2726 2727 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr); 2728 ierr = ISSetPermutation(*rperm);CHKERRQ(ierr); 2729 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr); 2730 ierr = ISSetPermutation(*cperm);CHKERRQ(ierr); 2731 2732 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 2733 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 2734 2735 ierr = PetscFree(ns_col);CHKERRQ(ierr); 2736 ierr = PetscFree2(permr,permc);CHKERRQ(ierr); 2737 ierr = ISDestroy(cis);CHKERRQ(ierr); 2738 ierr = ISDestroy(ris);CHKERRQ(ierr); 2739 ierr = PetscFree(tns);CHKERRQ(ierr); 2740 PetscFunctionReturn(0); 2741 } 2742 EXTERN_C_END 2743 2744 #undef __FUNCT__ 2745 #define __FUNCT__ "MatInodeGetInodeSizes" 2746 /*@C 2747 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 2748 2749 Collective on Mat 2750 2751 Input Parameter: 2752 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 2753 2754 Output Parameter: 2755 + node_count - no of inodes present in the matrix. 2756 . sizes - an array of size node_count,with sizes of each inode. 2757 - limit - the max size used to generate the inodes. 2758 2759 Level: advanced 2760 2761 Notes: This routine returns some internal storage information 2762 of the matrix, it is intended to be used by advanced users. 2763 It should be called after the matrix is assembled. 2764 The contents of the sizes[] array should not be changed. 2765 PETSC_NULL may be passed for information not requested. 2766 2767 .keywords: matrix, seqaij, get, inode 2768 2769 .seealso: MatGetInfo() 2770 @*/ 2771 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 2772 { 2773 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 2774 2775 PetscFunctionBegin; 2776 if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 2777 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr); 2778 if (f) { 2779 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 2780 } 2781 PetscFunctionReturn(0); 2782 } 2783 2784 EXTERN_C_BEGIN 2785 #undef __FUNCT__ 2786 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode" 2787 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 2788 { 2789 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2790 2791 PetscFunctionBegin; 2792 if (node_count) *node_count = a->inode.node_count; 2793 if (sizes) *sizes = a->inode.size; 2794 if (limit) *limit = a->inode.limit; 2795 PetscFunctionReturn(0); 2796 } 2797 EXTERN_C_END 2798