seq/seqviennacl/aijviennacl.cxx

e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp/*
e4a0ef16SKarl Rupp    Defines the basic matrix operations for the AIJ (compressed row)
e4a0ef16SKarl Rupp  matrix storage format.
e4a0ef16SKarl Rupp*/
e4a0ef16SKarl Rupp
aaa7dc30SBarry Smith#include <petscconf.h>
aaa7dc30SBarry Smith#include <../src/mat/impls/aij/seq/aij.h>          /*I "petscmat.h" I*/
aaa7dc30SBarry Smith#include <petscbt.h>
aaa7dc30SBarry Smith#include <../src/vec/vec/impls/dvecimpl.h>
aaa7dc30SBarry Smith#include <petsc-private/vecimpl.h>
e4a0ef16SKarl Rupp
aaa7dc30SBarry Smith#include <../src/mat/impls/aij/seq/seqviennacl/viennaclmatimpl.h>
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp#include <algorithm>
e4a0ef16SKarl Rupp#include <vector>
e4a0ef16SKarl Rupp#include <string>
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp#include "viennacl/linalg/prod.hpp"
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp#undef __FUNCT__
e4a0ef16SKarl Rupp#define __FUNCT__ "MatViennaCLCopyToGPU"
e4a0ef16SKarl RuppPetscErrorCode MatViennaCLCopyToGPU(Mat A)
e4a0ef16SKarl Rupp{
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp  Mat_SeqAIJViennaCL *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
e4a0ef16SKarl Rupp  Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
e4a0ef16SKarl Rupp  PetscErrorCode     ierr;
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp  PetscFunctionBegin;
67c87b7fSKarl Rupp  if (A->rmap->n > 0 && A->cmap->n > 0) { //some OpenCL SDKs have issues with buffers of size 0
e4a0ef16SKarl Rupp    if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED || A->valid_GPU_matrix == PETSC_VIENNACL_CPU) {
e4a0ef16SKarl Rupp      ierr = PetscLogEventBegin(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp      try {
1fc5b511SKarl Rupp        ierr = PetscObjectSetFromOptions_ViennaCL((PetscObject)A);CHKERRQ(ierr); /* Allows to set device type before allocating any objects */
e4a0ef16SKarl Rupp        if (a->compressedrow.use) {
*a3430c56SKarl Rupp          if (!viennaclstruct->compressed_mat) viennaclstruct->compressed_mat = new ViennaCLCompressedAIJMatrix();
e4a0ef16SKarl Rupp
*a3430c56SKarl Rupp          // Since PetscInt is different from cl_uint, we have to convert:
*a3430c56SKarl Rupp          viennacl::backend::mem_handle dummy;
e4a0ef16SKarl Rupp
*a3430c56SKarl Rupp          viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, a->compressedrow.nrows+1);
*a3430c56SKarl Rupp          for (PetscInt i=0; i<=a->compressedrow.nrows; ++i)
*a3430c56SKarl Rupp            row_buffer.set(i, (a->compressedrow.i)[i]);
e4a0ef16SKarl Rupp
*a3430c56SKarl Rupp          viennacl::backend::typesafe_host_array<unsigned int> row_indices; row_indices.raw_resize(dummy, a->compressedrow.nrows);
*a3430c56SKarl Rupp          for (PetscInt i=0; i<a->compressedrow.nrows; ++i)
*a3430c56SKarl Rupp            row_indices.set(i, (a->compressedrow.rindex)[i]);
*a3430c56SKarl Rupp
*a3430c56SKarl Rupp          viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
*a3430c56SKarl Rupp          for (PetscInt i=0; i<a->nz; ++i)
*a3430c56SKarl Rupp            col_buffer.set(i, (a->j)[i]);
*a3430c56SKarl Rupp
*a3430c56SKarl Rupp          viennaclstruct->compressed_mat->set(row_buffer.get(), row_indices.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->compressedrow.nrows, a->nz);
e4a0ef16SKarl Rupp        } else {
*a3430c56SKarl Rupp          if (!viennaclstruct->mat) viennaclstruct->mat = new ViennaCLAIJMatrix();
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp          // Since PetscInt is in general different from cl_uint, we have to convert:
e4a0ef16SKarl Rupp          viennacl::backend::mem_handle dummy;
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp          viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(dummy, A->rmap->n+1);
e4a0ef16SKarl Rupp          for (PetscInt i=0; i<=A->rmap->n; ++i)
e4a0ef16SKarl Rupp            row_buffer.set(i, (a->i)[i]);
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp          viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(dummy, a->nz);
e4a0ef16SKarl Rupp          for (PetscInt i=0; i<a->nz; ++i)
e4a0ef16SKarl Rupp            col_buffer.set(i, (a->j)[i]);
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp          viennaclstruct->mat->set(row_buffer.get(), col_buffer.get(), a->a, A->rmap->n, A->cmap->n, a->nz);
e4a0ef16SKarl Rupp        }
4cf1874eSKarl Rupp        ViennaCLWaitForGPU();
4076e183SKarl Rupp      } catch(std::exception const & ex) {
4076e183SKarl Rupp        SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
e4a0ef16SKarl Rupp      }
e4a0ef16SKarl Rupp
*a3430c56SKarl Rupp      // Create temporary vector for v += A*x:
*a3430c56SKarl Rupp      if (viennaclstruct->tempvec) {
*a3430c56SKarl Rupp        if (viennaclstruct->tempvec->size() != static_cast<std::size_t>(a->nz)) {
*a3430c56SKarl Rupp          delete (ViennaCLVector*)viennaclstruct->tempvec;
*a3430c56SKarl Rupp          viennaclstruct->tempvec = new ViennaCLVector(a->nz);
*a3430c56SKarl Rupp        } else {
*a3430c56SKarl Rupp          viennaclstruct->tempvec->clear();
*a3430c56SKarl Rupp        }
*a3430c56SKarl Rupp      } else {
*a3430c56SKarl Rupp        viennaclstruct->tempvec = new ViennaCLVector(a->nz);
*a3430c56SKarl Rupp      }
*a3430c56SKarl Rupp
e4a0ef16SKarl Rupp      A->valid_GPU_matrix = PETSC_VIENNACL_BOTH;
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp      ierr = PetscLogEventEnd(MAT_ViennaCLCopyToGPU,A,0,0,0);CHKERRQ(ierr);
e4a0ef16SKarl Rupp    }
67c87b7fSKarl Rupp  }
e4a0ef16SKarl Rupp  PetscFunctionReturn(0);
e4a0ef16SKarl Rupp}
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp#undef __FUNCT__
e4a0ef16SKarl Rupp#define __FUNCT__ "MatViennaCLCopyFromGPU"
0d73d530SKarl RuppPetscErrorCode MatViennaCLCopyFromGPU(Mat A, const ViennaCLAIJMatrix *Agpu)
e4a0ef16SKarl Rupp{
e4a0ef16SKarl Rupp  Mat_SeqAIJ         *a              = (Mat_SeqAIJ*)A->data;
e4a0ef16SKarl Rupp  PetscInt           m               = A->rmap->n;
e4a0ef16SKarl Rupp  PetscErrorCode     ierr;
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp  PetscFunctionBegin;
e4a0ef16SKarl Rupp  if (A->valid_GPU_matrix == PETSC_VIENNACL_UNALLOCATED) {
e4a0ef16SKarl Rupp    try {
e4a0ef16SKarl Rupp      if (a->compressedrow.use) {
e4a0ef16SKarl Rupp        SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL: Cannot handle row compression for GPU matrices");
e4a0ef16SKarl Rupp      } else {
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp        if ((PetscInt)Agpu->size1() != m) SETERRQ2(PETSC_COMM_WORLD, PETSC_ERR_ARG_SIZ, "GPU matrix has %d rows, should be %d", Agpu->size1(), m);
e4a0ef16SKarl Rupp        a->nz           = Agpu->nnz();
e4a0ef16SKarl Rupp        a->maxnz        = a->nz; /* Since we allocate exactly the right amount */
e4a0ef16SKarl Rupp        A->preallocated = PETSC_TRUE;
e4a0ef16SKarl Rupp        if (a->singlemalloc) {
e4a0ef16SKarl Rupp          if (a->a) {ierr = PetscFree3(a->a,a->j,a->i);CHKERRQ(ierr);}
e4a0ef16SKarl Rupp        } else {
e4a0ef16SKarl Rupp          if (a->i) {ierr = PetscFree(a->i);CHKERRQ(ierr);}
e4a0ef16SKarl Rupp          if (a->j) {ierr = PetscFree(a->j);CHKERRQ(ierr);}
e4a0ef16SKarl Rupp          if (a->a) {ierr = PetscFree(a->a);CHKERRQ(ierr);}
e4a0ef16SKarl Rupp        }
dcca6d9dSJed Brown        ierr = PetscMalloc3(a->nz,&a->a,a->nz,&a->j,m+1,&a->i);CHKERRQ(ierr);
f7daeb2aSKarl Rupp        ierr = PetscLogObjectMemory((PetscObject)A, a->nz*(sizeof(PetscScalar)+sizeof(PetscInt))+(m+1)*sizeof(PetscInt));CHKERRQ(ierr);
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp        a->singlemalloc = PETSC_TRUE;
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp        /* Setup row lengths */
e4a0ef16SKarl Rupp        if (a->imax) {ierr = PetscFree2(a->imax,a->ilen);CHKERRQ(ierr);}
dcca6d9dSJed Brown        ierr = PetscMalloc2(m,&a->imax,m,&a->ilen);CHKERRQ(ierr);
f7daeb2aSKarl Rupp        ierr = PetscLogObjectMemory((PetscObject)A, 2*m*sizeof(PetscInt));CHKERRQ(ierr);
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp        /* Copy data back from GPU */
e4a0ef16SKarl Rupp        viennacl::backend::typesafe_host_array<unsigned int> row_buffer; row_buffer.raw_resize(Agpu->handle1(), Agpu->size1() + 1);
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp        // copy row array
e4a0ef16SKarl Rupp        viennacl::backend::memory_read(Agpu->handle1(), 0, row_buffer.raw_size(), row_buffer.get());
e4a0ef16SKarl Rupp        (a->i)[0] = row_buffer[0];
e4a0ef16SKarl Rupp        for (PetscInt i = 0; i < (PetscInt)Agpu->size1(); ++i) {
e4a0ef16SKarl Rupp          (a->i)[i+1] = row_buffer[i+1];
e4a0ef16SKarl Rupp          a->imax[i]  = a->ilen[i] = a->i[i+1] - a->i[i];  //Set imax[] and ilen[] arrays at the same time as i[] for better cache reuse
e4a0ef16SKarl Rupp        }
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp        // copy column indices
e4a0ef16SKarl Rupp        viennacl::backend::typesafe_host_array<unsigned int> col_buffer; col_buffer.raw_resize(Agpu->handle2(), Agpu->nnz());
e4a0ef16SKarl Rupp        viennacl::backend::memory_read(Agpu->handle2(), 0, col_buffer.raw_size(), col_buffer.get());
e4a0ef16SKarl Rupp        for (PetscInt i=0; i < (PetscInt)Agpu->nnz(); ++i)
e4a0ef16SKarl Rupp          (a->j)[i] = col_buffer[i];
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp        // copy nonzero entries directly to destination (no conversion required)
e4a0ef16SKarl Rupp        viennacl::backend::memory_read(Agpu->handle(), 0, sizeof(PetscScalar)*Agpu->nnz(), a->a);
e4a0ef16SKarl Rupp
4cf1874eSKarl Rupp        ViennaCLWaitForGPU();
023073b3SKarl Rupp        /* TODO: Once a->diag is moved out of MatAssemblyEnd(), invalidate it here. */
e4a0ef16SKarl Rupp      }
4076e183SKarl Rupp    } catch(std::exception const & ex) {
4076e183SKarl Rupp      SETERRQ1(PETSC_COMM_SELF, PETSC_ERR_LIB, "ViennaCL error: %s", ex.what());
e4a0ef16SKarl Rupp    }
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp    /* This assembly prevents resetting the flag to PETSC_VIENNACL_CPU and recopying */
e4a0ef16SKarl Rupp    ierr = MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
e4a0ef16SKarl Rupp    ierr = MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp    A->valid_GPU_matrix = PETSC_VIENNACL_BOTH;
e4a0ef16SKarl Rupp  } else {
e4a0ef16SKarl Rupp    SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "ViennaCL error: Only valid for unallocated GPU matrices");
e4a0ef16SKarl Rupp  }
e4a0ef16SKarl Rupp  PetscFunctionReturn(0);
e4a0ef16SKarl Rupp}
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp#undef __FUNCT__
e4a0ef16SKarl Rupp#define __FUNCT__ "MatGetVecs_SeqAIJViennaCL"
e4a0ef16SKarl RuppPetscErrorCode MatGetVecs_SeqAIJViennaCL(Mat mat, Vec *right, Vec *left)
e4a0ef16SKarl Rupp{
e4a0ef16SKarl Rupp  PetscErrorCode ierr;
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp  PetscFunctionBegin;
e4a0ef16SKarl Rupp  if (right) {
e4a0ef16SKarl Rupp    ierr = VecCreate(PetscObjectComm((PetscObject)mat),right);CHKERRQ(ierr);
e4a0ef16SKarl Rupp    ierr = VecSetSizes(*right,mat->cmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
e4a0ef16SKarl Rupp    ierr = VecSetBlockSize(*right,mat->rmap->bs);CHKERRQ(ierr);
e4a0ef16SKarl Rupp    ierr = VecSetType(*right,VECSEQVIENNACL);CHKERRQ(ierr);
e4a0ef16SKarl Rupp    ierr = PetscLayoutReference(mat->cmap,&(*right)->map);CHKERRQ(ierr);
e4a0ef16SKarl Rupp  }
e4a0ef16SKarl Rupp  if (left) {
e4a0ef16SKarl Rupp    ierr = VecCreate(PetscObjectComm((PetscObject)mat),left);CHKERRQ(ierr);
e4a0ef16SKarl Rupp    ierr = VecSetSizes(*left,mat->rmap->n,PETSC_DETERMINE);CHKERRQ(ierr);
e4a0ef16SKarl Rupp    ierr = VecSetBlockSize(*left,mat->rmap->bs);CHKERRQ(ierr);
e4a0ef16SKarl Rupp    ierr = VecSetType(*left,VECSEQVIENNACL);CHKERRQ(ierr);
e4a0ef16SKarl Rupp    ierr = PetscLayoutReference(mat->rmap,&(*left)->map);CHKERRQ(ierr);
e4a0ef16SKarl Rupp  }
e4a0ef16SKarl Rupp  PetscFunctionReturn(0);
e4a0ef16SKarl Rupp}
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp#undef __FUNCT__
e4a0ef16SKarl Rupp#define __FUNCT__ "MatMult_SeqAIJViennaCL"
e4a0ef16SKarl RuppPetscErrorCode MatMult_SeqAIJViennaCL(Mat A,Vec xx,Vec yy)
e4a0ef16SKarl Rupp{
e4a0ef16SKarl Rupp  Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
e4a0ef16SKarl Rupp  PetscErrorCode       ierr;
e4a0ef16SKarl Rupp  Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
0d73d530SKarl Rupp  const ViennaCLVector *xgpu=NULL;
0d73d530SKarl Rupp  ViennaCLVector       *ygpu=NULL;
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp  PetscFunctionBegin;
67c87b7fSKarl Rupp  if (A->rmap->n > 0 && A->cmap->n > 0) {
e4a0ef16SKarl Rupp    ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
e4a0ef16SKarl Rupp    ierr = VecViennaCLGetArrayWrite(yy,&ygpu);CHKERRQ(ierr);
e4a0ef16SKarl Rupp    try {
e4a0ef16SKarl Rupp      *ygpu = viennacl::linalg::prod(*viennaclstruct->mat,*xgpu);
4cf1874eSKarl Rupp      ViennaCLWaitForGPU();
4076e183SKarl Rupp    } catch (std::exception const & ex) {
4076e183SKarl Rupp      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
e4a0ef16SKarl Rupp    }
e4a0ef16SKarl Rupp    ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
e4a0ef16SKarl Rupp    ierr = VecViennaCLRestoreArrayWrite(yy,&ygpu);CHKERRQ(ierr);
e4a0ef16SKarl Rupp    ierr = PetscLogFlops(2.0*a->nz - viennaclstruct->mat->nnz());CHKERRQ(ierr);
67c87b7fSKarl Rupp  }
e4a0ef16SKarl Rupp  PetscFunctionReturn(0);
e4a0ef16SKarl Rupp}
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp#undef __FUNCT__
e4a0ef16SKarl Rupp#define __FUNCT__ "MatMultAdd_SeqAIJViennaCL"
e4a0ef16SKarl RuppPetscErrorCode MatMultAdd_SeqAIJViennaCL(Mat A,Vec xx,Vec yy,Vec zz)
e4a0ef16SKarl Rupp{
e4a0ef16SKarl Rupp  Mat_SeqAIJ           *a = (Mat_SeqAIJ*)A->data;
e4a0ef16SKarl Rupp  PetscErrorCode       ierr;
e4a0ef16SKarl Rupp  Mat_SeqAIJViennaCL   *viennaclstruct = (Mat_SeqAIJViennaCL*)A->spptr;
0d73d530SKarl Rupp  const ViennaCLVector *xgpu=NULL,*ygpu=NULL;
0d73d530SKarl Rupp  ViennaCLVector       *zgpu=NULL;
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp  PetscFunctionBegin;
67c87b7fSKarl Rupp  if (A->rmap->n > 0 && A->cmap->n > 0) {
e4a0ef16SKarl Rupp    try {
e4a0ef16SKarl Rupp      ierr = VecViennaCLGetArrayRead(xx,&xgpu);CHKERRQ(ierr);
e4a0ef16SKarl Rupp      ierr = VecViennaCLGetArrayRead(yy,&ygpu);CHKERRQ(ierr);
e4a0ef16SKarl Rupp      ierr = VecViennaCLGetArrayWrite(zz,&zgpu);CHKERRQ(ierr);
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp      if (a->compressedrow.use) {
*a3430c56SKarl Rupp        ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->compressed_mat, *xgpu);
e4a0ef16SKarl Rupp        *zgpu = *ygpu + temp;
4cf1874eSKarl Rupp        ViennaCLWaitForGPU();
e4a0ef16SKarl Rupp      } else {
*a3430c56SKarl Rupp        if (zz == xx || zz == yy) { //temporary required
*a3430c56SKarl Rupp          ViennaCLVector temp = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
*a3430c56SKarl Rupp          *zgpu = *ygpu;
*a3430c56SKarl Rupp          *zgpu += temp;
*a3430c56SKarl Rupp          ViennaCLWaitForGPU();
*a3430c56SKarl Rupp        } else {
*a3430c56SKarl Rupp          *viennaclstruct->tempvec = viennacl::linalg::prod(*viennaclstruct->mat, *xgpu);
*a3430c56SKarl Rupp          *zgpu = *ygpu + *viennaclstruct->tempvec;
4cf1874eSKarl Rupp          ViennaCLWaitForGPU();
e4a0ef16SKarl Rupp        }
e4a0ef16SKarl Rupp      }
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp      ierr = VecViennaCLRestoreArrayRead(xx,&xgpu);CHKERRQ(ierr);
e4a0ef16SKarl Rupp      ierr = VecViennaCLRestoreArrayRead(yy,&ygpu);CHKERRQ(ierr);
e4a0ef16SKarl Rupp      ierr = VecViennaCLRestoreArrayWrite(zz,&zgpu);CHKERRQ(ierr);
e4a0ef16SKarl Rupp
4076e183SKarl Rupp    } catch(std::exception const & ex) {
4076e183SKarl Rupp      SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
e4a0ef16SKarl Rupp    }
e4a0ef16SKarl Rupp    ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
67c87b7fSKarl Rupp  }
e4a0ef16SKarl Rupp  PetscFunctionReturn(0);
e4a0ef16SKarl Rupp}
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp#undef __FUNCT__
e4a0ef16SKarl Rupp#define __FUNCT__ "MatAssemblyEnd_SeqAIJViennaCL"
e4a0ef16SKarl RuppPetscErrorCode MatAssemblyEnd_SeqAIJViennaCL(Mat A,MatAssemblyType mode)
e4a0ef16SKarl Rupp{
e4a0ef16SKarl Rupp  PetscErrorCode ierr;
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp  PetscFunctionBegin;
e4a0ef16SKarl Rupp  ierr = MatAssemblyEnd_SeqAIJ(A,mode);CHKERRQ(ierr);
e4a0ef16SKarl Rupp  ierr = MatViennaCLCopyToGPU(A);CHKERRQ(ierr);
e4a0ef16SKarl Rupp  if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(0);
e4a0ef16SKarl Rupp  A->ops->mult    = MatMult_SeqAIJViennaCL;
e4a0ef16SKarl Rupp  A->ops->multadd = MatMultAdd_SeqAIJViennaCL;
e4a0ef16SKarl Rupp  PetscFunctionReturn(0);
e4a0ef16SKarl Rupp}
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp/* --------------------------------------------------------------------------------*/
e4a0ef16SKarl Rupp#undef __FUNCT__
e4a0ef16SKarl Rupp#define __FUNCT__ "MatCreateSeqAIJViennaCL"
e4a0ef16SKarl Rupp/*@
e4a0ef16SKarl Rupp   MatCreateSeqAIJViennaCL - Creates a sparse matrix in AIJ (compressed row) format
19fddfadSKarl Rupp   (the default parallel PETSc format).  This matrix will ultimately be pushed down
e4a0ef16SKarl Rupp   to GPUs and use the ViennaCL library for calculations. For good matrix
e4a0ef16SKarl Rupp   assembly performance the user should preallocate the matrix storage by setting
e4a0ef16SKarl Rupp   the parameter nz (or the array nnz).  By setting these parameters accurately,
e4a0ef16SKarl Rupp   performance during matrix assembly can be increased substantially.
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp   Collective on MPI_Comm
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp   Input Parameters:
e4a0ef16SKarl Rupp+  comm - MPI communicator, set to PETSC_COMM_SELF
e4a0ef16SKarl Rupp.  m - number of rows
e4a0ef16SKarl Rupp.  n - number of columns
e4a0ef16SKarl Rupp.  nz - number of nonzeros per row (same for all rows)
e4a0ef16SKarl Rupp-  nnz - array containing the number of nonzeros in the various rows
e4a0ef16SKarl Rupp         (possibly different for each row) or NULL
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp   Output Parameter:
e4a0ef16SKarl Rupp.  A - the matrix
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
e4a0ef16SKarl Rupp   MatXXXXSetPreallocation() paradigm instead of this routine directly.
e4a0ef16SKarl Rupp   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp   Notes:
e4a0ef16SKarl Rupp   If nnz is given then nz is ignored
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp   The AIJ format (also called the Yale sparse matrix format or
e4a0ef16SKarl Rupp   compressed row storage), is fully compatible with standard Fortran 77
e4a0ef16SKarl Rupp   storage.  That is, the stored row and column indices can begin at
e4a0ef16SKarl Rupp   either one (as in Fortran) or zero.  See the users' manual for details.
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp   Specify the preallocated storage with either nz or nnz (not both).
e4a0ef16SKarl Rupp   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
e4a0ef16SKarl Rupp   allocation.  For large problems you MUST preallocate memory or you
e4a0ef16SKarl Rupp   will get TERRIBLE performance, see the users' manual chapter on matrices.
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp   Level: intermediate
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp.seealso: MatCreate(), MatCreateAIJ(), MatCreateAIJCUSP(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ()
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp@*/
e4a0ef16SKarl RuppPetscErrorCode  MatCreateSeqAIJViennaCL(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat *A)
e4a0ef16SKarl Rupp{
e4a0ef16SKarl Rupp  PetscErrorCode ierr;
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp  PetscFunctionBegin;
e4a0ef16SKarl Rupp  ierr = MatCreate(comm,A);CHKERRQ(ierr);
e4a0ef16SKarl Rupp  ierr = MatSetSizes(*A,m,n,m,n);CHKERRQ(ierr);
e4a0ef16SKarl Rupp  ierr = MatSetType(*A,MATSEQAIJVIENNACL);CHKERRQ(ierr);
e4a0ef16SKarl Rupp  ierr = MatSeqAIJSetPreallocation_SeqAIJ(*A,nz,(PetscInt*)nnz);CHKERRQ(ierr);
e4a0ef16SKarl Rupp  PetscFunctionReturn(0);
e4a0ef16SKarl Rupp}
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp#undef __FUNCT__
e4a0ef16SKarl Rupp#define __FUNCT__ "MatDestroy_SeqAIJViennaCL"
e4a0ef16SKarl RuppPetscErrorCode MatDestroy_SeqAIJViennaCL(Mat A)
e4a0ef16SKarl Rupp{
e4a0ef16SKarl Rupp  PetscErrorCode ierr;
e4a0ef16SKarl Rupp  Mat_SeqAIJViennaCL *viennaclcontainer = (Mat_SeqAIJViennaCL*)A->spptr;
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp  PetscFunctionBegin;
e4a0ef16SKarl Rupp  try {
*a3430c56SKarl Rupp    if (!viennaclcontainer->tempvec)        delete viennaclcontainer->tempvec;
*a3430c56SKarl Rupp    if (!viennaclcontainer->mat)            delete viennaclcontainer->mat;
*a3430c56SKarl Rupp    if (!viennaclcontainer->compressed_mat) delete viennaclcontainer->compressed_mat;
e4a0ef16SKarl Rupp    delete viennaclcontainer;
e4a0ef16SKarl Rupp    A->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED;
4076e183SKarl Rupp  } catch(std::exception const & ex) {
4076e183SKarl Rupp    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"ViennaCL error: %s", ex.what());
e4a0ef16SKarl Rupp  }
e4a0ef16SKarl Rupp  /* this next line is because MatDestroy tries to PetscFree spptr if it is not zero, and PetscFree only works if the memory was allocated with PetscNew or PetscMalloc, which don't call the constructor */
e4a0ef16SKarl Rupp  A->spptr = 0;
e4a0ef16SKarl Rupp  ierr     = MatDestroy_SeqAIJ(A);CHKERRQ(ierr);
e4a0ef16SKarl Rupp  PetscFunctionReturn(0);
e4a0ef16SKarl Rupp}
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp#undef __FUNCT__
e4a0ef16SKarl Rupp#define __FUNCT__ "MatCreate_SeqAIJViennaCL"
e4a0ef16SKarl RuppPETSC_EXTERN PetscErrorCode MatCreate_SeqAIJViennaCL(Mat B)
e4a0ef16SKarl Rupp{
e4a0ef16SKarl Rupp  PetscErrorCode ierr;
e4a0ef16SKarl Rupp  Mat_SeqAIJ     *aij;
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp  PetscFunctionBegin;
e4a0ef16SKarl Rupp  ierr            = MatCreate_SeqAIJ(B);CHKERRQ(ierr);
e4a0ef16SKarl Rupp  aij             = (Mat_SeqAIJ*)B->data;
e4a0ef16SKarl Rupp  aij->inode.use  = PETSC_FALSE;
e4a0ef16SKarl Rupp  B->ops->mult    = MatMult_SeqAIJViennaCL;
e4a0ef16SKarl Rupp  B->ops->multadd = MatMultAdd_SeqAIJViennaCL;
e4a0ef16SKarl Rupp  B->spptr        = new Mat_SeqAIJViennaCL();
e4a0ef16SKarl Rupp
*a3430c56SKarl Rupp  ((Mat_SeqAIJViennaCL*)B->spptr)->tempvec        = NULL;
*a3430c56SKarl Rupp  ((Mat_SeqAIJViennaCL*)B->spptr)->mat            = NULL;
*a3430c56SKarl Rupp  ((Mat_SeqAIJViennaCL*)B->spptr)->compressed_mat = NULL;
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp  B->ops->assemblyend    = MatAssemblyEnd_SeqAIJViennaCL;
e4a0ef16SKarl Rupp  B->ops->destroy        = MatDestroy_SeqAIJViennaCL;
e4a0ef16SKarl Rupp  B->ops->getvecs        = MatGetVecs_SeqAIJViennaCL;
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp  ierr = PetscObjectChangeTypeName((PetscObject)B,MATSEQAIJVIENNACL);CHKERRQ(ierr);
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp  B->valid_GPU_matrix = PETSC_VIENNACL_UNALLOCATED;
e4a0ef16SKarl Rupp  PetscFunctionReturn(0);
e4a0ef16SKarl Rupp}
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp/*M
e4a0ef16SKarl Rupp   MATSEQAIJVIENNACL - MATAIJVIENNACL = "aijviennacl" = "seqaijviennacl" - A matrix type to be used for sparse matrices.
e4a0ef16SKarl Rupp
   A matrix type whose data resides on GPUs. These matrices are in CSR format by
e4a0ef16SKarl Rupp   default. All matrix calculations are performed using the ViennaCL library.
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp   Options Database Keys:
e4a0ef16SKarl Rupp+  -mat_type aijviennacl - sets the matrix type to "seqaijviennacl" during a call to MatSetFromOptions()
e4a0ef16SKarl Rupp.  -mat_viennacl_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
e4a0ef16SKarl Rupp-  -mat_viennacl_mult_storage_format csr - sets the storage format of matrices for MatMult during a call to MatSetFromOptions().
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp  Level: beginner
e4a0ef16SKarl Rupp
e4a0ef16SKarl Rupp.seealso: MatCreateSeqAIJViennaCL(), MATAIJVIENNACL, MatCreateAIJViennaCL()
e4a0ef16SKarl RuppM*/
e4a0ef16SKarl Rupp