Lines Matching defs:row
80 PetscInt i, row, slice_id, row_in_slice;
82 /* one thread per row. */
83 row = blockIdx.x * blockDim.x + threadIdx.x;
84 if (row < nrows) {
85 slice_id = row / sliceheight;
86 row_in_slice = row % sliceheight;
89 y[row] = sum;
95 PetscInt i, row, slice_id, row_in_slice;
97 /* one thread per row. */
98 row = blockIdx.x * blockDim.x + threadIdx.x;
99 if (row < nrows) {
100 slice_id = row / sliceheight;
101 row_in_slice = row % sliceheight;
104 z[row] = y[row] + sum;
115 PetscInt i, row, slice_id = blockIdx.x;
122 row = slice_id * sliceheight + threadIdx.x % sliceheight;
123 if (row < nrows) {
128 /* transpose layout to reduce each row using warp shfl */
136 if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) y[row] = shared[0][threadIdx.x];
144 PetscInt i, row, slice_id = blockIdx.x;
151 row = slice_id * sliceheight + threadIdx.x % sliceheight;
152 if (row < nrows) {
157 /* transpose layout to reduce each row using warp shfl */
165 if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) z[row] = y[row] + shared[0][threadIdx.x];
192 PetscInt gid, row, start_slice, cid;
212 row = slice_id * sliceheight + threadIdx.x % sliceheight;
213 if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
216 if (row < nrows && gid < totalentries && write) atomAdd(y[row], t);
219 row = start_slice * sliceheight + threadIdx.x % sliceheight;
220 if (row < nrows) t += aval[gid] * x[acolidx[gid]];
226 /* transpose layout to reduce each row using warp shfl */
234 if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(y[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
247 PetscInt gid, row, start_slice, cid;
267 row = slice_id * sliceheight + threadIdx.x % sliceheight;
268 if (row < nrows && gid < totalentries) t = aval[gid] * x[acolidx[gid]];
271 if (row < nrows && gid < totalentries && write) atomAdd(z[row], t);
274 row = start_slice * sliceheight + threadIdx.x % sliceheight;
275 if (row < nrows) t += aval[gid] * x[acolidx[gid]];
281 /* transpose layout to reduce each row using warp shfl */
289 if (row < nrows && threadIdx.y == 0 && threadIdx.x < sliceheight) atomAdd(z[row], shared[threadIdx.x]); /* shared[0][threadIdx.x] */
300 PetscInt i, row, slice_id;
302 row = slice_id * sliceheight + threadIdx.x % sliceheight;
304 if (row < nrows) {
309 if (row < nrows && threadIdx.x < sliceheight) y[row] = t;
315 PetscInt i, row, slice_id;
317 row = slice_id * sliceheight + threadIdx.x % sliceheight;
319 if (row < nrows) {
324 if (row < nrows && threadIdx.x < sliceheight) z[row] = y[row] + t;
334 PetscInt i, row, slice_id, row_in_slice;
335 /* multiple threads per row. */
336 row = blockIdx.x * blockDim.x + threadIdx.x;
337 if (row < nrows) {
338 slice_id = row / sliceheight;
339 row_in_slice = row % sliceheight;
354 y[row] = shared[threadIdx.x];
362 PetscInt i, row, slice_id, row_in_slice;
363 /* multiple threads per row. */
364 row = blockIdx.x * blockDim.x + threadIdx.x;
365 if (row < nrows) {
366 slice_id = row / sliceheight;
367 row_in_slice = row % sliceheight;
380 y[row] = shared[threadIdx.x];
388 PetscInt i, row, slice_id, row_in_slice;
389 /* multiple threads per row. */
390 row = blockIdx.x * blockDim.x + threadIdx.x;
391 if (row < nrows) {
392 slice_id = row / sliceheight;
393 row_in_slice = row % sliceheight;
404 y[row] = shared[threadIdx.x];
412 PetscInt i, row, slice_id, row_in_slice;
413 /* multiple threads per row. */
414 row = blockIdx.x * blockDim.x + threadIdx.x;
415 if (row < nrows) {
416 slice_id = row / sliceheight;
417 row_in_slice = row % sliceheight;
426 y[row] = shared[threadIdx.x];
434 PetscInt i, row, slice_id, row_in_slice;
435 /* multiple threads per row. */
436 row = blockIdx.x * blockDim.x + threadIdx.x;
437 if (row < nrows) {
438 slice_id = row / sliceheight;
439 row_in_slice = row % sliceheight;
446 y[row] = shared[threadIdx.x];
454 PetscInt i, row, slice_id, row_in_slice;
455 /* multiple threads per row. */
456 row = blockIdx.x * blockDim.x + threadIdx.x;
457 if (row < nrows) {
458 slice_id = row / sliceheight;
459 row_in_slice = row % sliceheight;
474 z[row] = y[row] + shared[threadIdx.x];
482 PetscInt i, row, slice_id, row_in_slice;
483 /* multiple threads per row. */
484 row = blockIdx.x * blockDim.x + threadIdx.x;
485 if (row < nrows) {
486 slice_id = row / sliceheight;
487 row_in_slice = row % sliceheight;
500 z[row] = y[row] + shared[threadIdx.x];
508 PetscInt i, row, slice_id, row_in_slice;
509 /* multiple threads per row. */
510 row = blockIdx.x * blockDim.x + threadIdx.x;
511 if (row < nrows) {
512 slice_id = row / sliceheight;
513 row_in_slice = row % sliceheight;
524 z[row] = y[row] + shared[threadIdx.x];
532 PetscInt i, row, slice_id, row_in_slice;
533 /* multiple threads per row. */
534 row = blockIdx.x * blockDim.x + threadIdx.x;
535 if (row < nrows) {
536 slice_id = row / sliceheight;
537 row_in_slice = row % sliceheight;
546 z[row] = y[row] + shared[threadIdx.x];
554 PetscInt i, row, slice_id, row_in_slice;
555 /* multiple threads per row. */
556 row = blockIdx.x * blockDim.x + threadIdx.x;
557 if (row < nrows) {
558 slice_id = row / sliceheight;
559 row_in_slice = row % sliceheight;
566 z[row] = y[row] + shared[threadIdx.x];