xref: /libCEED/rust/libceed-sys/c-src/backends/magma/ceed-magma.c (revision 137a0714ac1fb268372008240a605d5d12b125f4)
1 // Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at
2 // the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights
3 // reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 
17 #include <ceed-impl.h>
18 #include <string.h>
19 #include "magma.h"
20 
21 typedef struct {
22     CeedScalar *array;
23     CeedScalar *darray;
24     int  own_;
25     int down_;
26 } CeedVector_Magma;
27 
28 typedef struct {
29     CeedInt *indices;
30     CeedInt *dindices;
31     int  own_;
32     int down_;            // cover a case where we own Device memory
33 } CeedElemRestriction_Magma;
34 
35 typedef struct {
36     CeedVector etmp;
37     CeedVector qdata;
38 } CeedOperator_Magma;
39 
40 // *****************************************************************************
41 // * Initialize vector vec (after free mem) with values from array based on cmode
42 // *   CEED_COPY_VALUES: memory is allocated in vec->array_allocated, made equal
43 // *                     to array, and data is copied (not store passed pointer)
44 // *   CEED_OWN_POINTER: vec->data->array_allocated and vec->data->array = array
45 // *   CEED_USE_POINTER: vec->data->array = array (can modify; no ownership)
46 // * mtype: CEED_MEM_HOST or CEED_MEM_DEVICE
47 // *****************************************************************************
48 static int CeedVectorSetArray_Magma(CeedVector vec, CeedMemType mtype,
49                                     CeedCopyMode cmode, CeedScalar *array) {
50     CeedVector_Magma *impl = vec->data;
51     int ierr;
52 
53     // If own data, free the "old" data, e.g., as it may be of different size
54     if (impl->own_){
55         magma_free( impl->darray );
56         magma_free_pinned( impl->array );
57         impl->darray = NULL;
58         impl->array  = NULL;
59         impl->own_ = 0;
60         impl->down_= 0;
61     }
62 
63     if (mtype == CEED_MEM_HOST) {
64         // memory is on the host; own_ = 0
65         switch (cmode) {
66         case CEED_COPY_VALUES:
67             ierr = magma_malloc( (void**)&impl->darray,
68                                  vec->length * sizeof(CeedScalar)); CeedChk(ierr);
69             ierr = magma_malloc_pinned( (void**)&impl->array,
70                                         vec->length * sizeof(CeedScalar)); CeedChk(ierr);
71             impl->own_ = 1;
72 
73             if (array != NULL)
74                 magma_setvector(vec->length, sizeof(array[0]),
75                                 array, 1, impl->darray, 1);
76             break;
77         case CEED_OWN_POINTER:
78             ierr = magma_malloc( (void**)&impl->darray,
79                                  vec->length * sizeof(CeedScalar)); CeedChk(ierr);
80             // TODO: possible problem here is if we are passed non-pinned memory;
81             //       (as we own it, lter in destroy, we use free for pinned memory).
82             impl->array = array;
83             impl->own_ = 1;
84 
85             if (array != NULL)
86                 magma_setvector(vec->length, sizeof(array[0]),
87                                 array, 1, impl->darray, 1);
88             break;
89         case CEED_USE_POINTER:
90             ierr = magma_malloc( (void**)&impl->darray,
91                                  vec->length * sizeof(CeedScalar)); CeedChk(ierr);
92             magma_setvector(vec->length, sizeof(array[0]),
93                             array, 1, impl->darray, 1);
94             impl->array  = array;
95         }
96     } else if (mtype == CEED_MEM_DEVICE) {
97         // memory is on the device; own = 0
98         switch (cmode) {
99         case CEED_COPY_VALUES:
100             ierr = magma_malloc( (void**)&impl->darray,
101                                 vec->length * sizeof(CeedScalar)); CeedChk(ierr);
102             ierr = magma_malloc_pinned( (void**)&impl->array,
103                                         vec->length * sizeof(CeedScalar)); CeedChk(ierr);
104             impl->own_ = 1;
105 
106             if (array)
107                 magma_copyvector(vec->length, sizeof(array[0]),
108                                  array, 1, impl->darray, 1);
109             break;
110         case CEED_OWN_POINTER:
111             impl->darray = array;
112             ierr = magma_malloc_pinned( (void**)&impl->array,
113                                         vec->length * sizeof(CeedScalar)); CeedChk(ierr);
114             impl->own_ = 1;
115 
116             break;
117         case CEED_USE_POINTER:
118             impl->darray = array;
119             impl->array  = NULL;
120         }
121 
122     } else
123         return CeedError(vec->ceed, 1, "Only MemType = HOST or DEVICE supported");
124 
125     return 0;
126 }
127 
128 // *****************************************************************************
129 // * Give data pointer from vector vec to array (on HOST or DEVICE)
130 // *****************************************************************************
131 static int CeedVectorGetArray_Magma(CeedVector vec, CeedMemType mtype,
132                                     CeedScalar **array) {
133     CeedVector_Magma *impl = vec->data;
134     int ierr;
135 
136     if (mtype == CEED_MEM_HOST) {
137         if (impl->own_) {
138             // data is owned so GPU had the most up-to-date version; copy it
139             magma_getvector(vec->length, sizeof(*array[0]),
140                             impl->darray, 1, impl->array, 1);
141         } else if (impl->array == NULL) {
142             // Vector doesn't own the data and was set on GPU
143             if (impl->darray == NULL) {
144                 // call was made just to allocate memory
145                 ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
146                 CeedChk(ierr);
147             } else
148                 return CeedError(vec->ceed, 1, "Can not access DEVICE vector on HOST");
149         }
150         *array = impl->array;
151     } else if (mtype == CEED_MEM_DEVICE) {
152         if (impl->darray == NULL){
153             // Vector doesn't own the data and was set on the CPU
154             if (impl->array == NULL) {
155                 // call was made just to allocate memory
156                 ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
157                 CeedChk(ierr);
158             } else
159                 return CeedError(vec->ceed, 1, "Can not access HOST vector on DEVICE");
160         }
161         *array = impl->darray;
162     } else
163         return CeedError(vec->ceed, 1, "Can only provide to HOST or DEVICE memory");
164 
165     return 0;
166 }
167 
168 // *****************************************************************************
169 // * Give data pointer from vector vec to array (on HOST or DEVICE) to read it
170 // *****************************************************************************
171 static int CeedVectorGetArrayRead_Magma(CeedVector vec, CeedMemType mtype,
172                                         const CeedScalar **array) {
173     CeedVector_Magma *impl = vec->data;
174     int ierr;
175 
176     if (mtype == CEED_MEM_HOST) {
177         if (impl->own_) {
178             // data is owned so GPU had the most up-to-date version; copy it
179             magma_getvector(vec->length, sizeof(*array[0]),
180                             impl->darray, 1, impl->array, 1);
181         } else if (impl->array == NULL) {
182             // Vector doesn't own the data and was set on GPU
183             if (impl->darray == NULL) {
184                 // call was made just to allocate memory
185                 ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
186                 CeedChk(ierr);
187             } else
188                 return CeedError(vec->ceed, 1, "Can not access DEVICE vector on HOST");
189         }
190         *array = impl->array;
191     } else if (mtype == CEED_MEM_DEVICE) {
192         if (impl->darray == NULL){
193             // Vector doesn't own the data and was set on the CPU
194             if (impl->array == NULL) {
195                 // call was made just to allocate memory
196                 ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
197                 CeedChk(ierr);
198             } else
199                 return CeedError(vec->ceed, 1, "Can not access HOST vector on DEVICE");
200         }
201         *array = impl->darray;
202     } else
203         return CeedError(vec->ceed, 1, "Can only provide to HOST or DEVICE memory");
204 
205     return 0;
206 }
207 
208 // *****************************************************************************
209 // * There is no mtype here for array so it is not clear if we restore from HOST
210 // * memory or from DEVICE memory. We assume that it is CPU memory because if
211 // * it was GPU memory we would not call this routine at all.
212 // * Restore vector vec with values from array, where array received its values
213 // * from vec and possibly modified them.
214 // *****************************************************************************
215 static int CeedVectorRestoreArray_Magma(CeedVector vec, CeedScalar **array) {
216     CeedVector_Magma *impl = vec->data;
217 
218     // Check if the array is a CPU pointer
219     if (*array == impl->array) {
220         // Update device, if the device pointer is not NULL
221         if (impl->darray != NULL) {
222             magma_setvector(vec->length, sizeof(*array[0]),
223                             *array, 1, impl->darray, 1);
224         } else
225             ; // nothing to do (case of CPU use pointer)
226 
227     } else if (impl->down_) {
228         // nothing to do if array is on GPU, except if down_=1(case CPU use pointer)
229         magma_getvector(vec->length, sizeof(*array[0]),
230                         impl->darray, 1, impl->array, 1);
231     }
232 
233     *array = NULL;
234     return 0;
235 }
236 
237 // *****************************************************************************
238 // * There is no mtype here for array so it is not clear if we restore from HOST
239 // * memory or from DEVICE memory. We assume that it is CPU memory because if
240 // * it was GPU memory we would not call this routine at all.
241 // * Restore vector vec with values from array, where array received its values
242 // * from vec to only read them; in this case vec may have been modified meanwhile
243 // * and needs to be restored here.
244 // *****************************************************************************
245 static int CeedVectorRestoreArrayRead_Magma(CeedVector vec,
246                                             const CeedScalar **array) {
247     CeedVector_Magma *impl = vec->data;
248 
249     // Check if the array is a CPU pointer
250     if (*array == impl->array) {
251         // Update device, if the device pointer is not NULL
252         if (impl->darray != NULL)
253             magma_setvector(vec->length, sizeof(*array[0]),
254                             *array, 1, impl->darray, 1);
255         else
256             ; // nothing to do (case of CPU use pointer)
257 
258     } else if (impl->down_) {
259         // nothing to do if array is on GPU, except if down_=1(case CPU use pointer)
260         magma_getvector(vec->length, sizeof(*array[0]),
261                         impl->darray, 1, impl->array, 1);
262     }
263 
264     *array = NULL;
265     return 0;
266 }
267 
268 static int CeedVectorDestroy_Magma(CeedVector vec) {
269     CeedVector_Magma *impl = vec->data;
270     int ierr;
271 
272     // Free if we own the data
273     if (impl->own_){
274         ierr = magma_free_pinned(impl->array); CeedChk(ierr);
275         ierr = magma_free(impl->darray);       CeedChk(ierr);
276     } else if (impl->down_){
277         ierr = magma_free(impl->darray);       CeedChk(ierr);
278     }
279     ierr = CeedFree(&vec->data); CeedChk(ierr);
280     return 0;
281 }
282 
283 // *****************************************************************************
284 // * Create vector vec of size n
285 // *****************************************************************************
286 static int CeedVectorCreate_Magma(Ceed ceed, CeedInt n, CeedVector vec) {
287     CeedVector_Magma *impl;
288     int ierr;
289 
290     vec->SetArray = CeedVectorSetArray_Magma;
291     vec->GetArray = CeedVectorGetArray_Magma;
292     vec->GetArrayRead = CeedVectorGetArrayRead_Magma;
293     vec->RestoreArray = CeedVectorRestoreArray_Magma;
294     vec->RestoreArrayRead = CeedVectorRestoreArrayRead_Magma;
295     vec->Destroy = CeedVectorDestroy_Magma;
296     ierr = CeedCalloc(1,&impl); CeedChk(ierr);
297     impl->darray = NULL;
298     impl->array  = NULL;
299     impl->own_ = 0;
300     impl->down_= 0;
301     vec->data = impl;
302     return 0;
303 }
304 
305 // *****************************************************************************
306 // * Apply restriction operator r to u: v = r(rmode) u
307 // *****************************************************************************
308 static int CeedElemRestrictionApply_Magma(CeedElemRestriction r,
309                                           CeedTransposeMode tmode, CeedInt ncomp,
310                                           CeedTransposeMode lmode, CeedVector u,
311                                           CeedVector v, CeedRequest *request) {
312     CeedElemRestriction_Magma *impl = r->data;
313     int ierr;
314     const CeedScalar *uu;
315     CeedScalar *vv;
316     CeedInt nelem = r->nelem, elemsize = r->elemsize, ndof = r->ndof;
317     CeedInt esize = nelem * elemsize;
318     // CeedInt *indices = impl->indices;
319     CeedInt *dindices = impl->dindices;
320 
321     //ierr = CeedVectorGetArrayRead(u, CEED_MEM_HOST, &uu); CeedChk(ierr);
322     //ierr = CeedVectorGetArray(v, CEED_MEM_HOST, &vv); CeedChk(ierr);
323 
324     // Get pointers on the device
325     ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &uu); CeedChk(ierr);
326     ierr = CeedVectorGetArray(v, CEED_MEM_DEVICE, &vv); CeedChk(ierr);
327 
328     if (tmode == CEED_NOTRANSPOSE) {
329         // Perform: v = r * u
330         if (ncomp == 1) {
331             //for (CeedInt i=0; i<esize; i++) vv[i] = uu[indices[i]];
332 
333             magma_template<<i=0:esize>>
334                 (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices){
335                 vv[i] = uu[dindices[i]];
336             }
337         } else {
338             // vv is (elemsize x ncomp x nelem), column-major
339             if (lmode == CEED_NOTRANSPOSE) { // u is (ndof x ncomp), column-major
340                 /*
341                 for (CeedInt e = 0; e < nelem; e++)
342                     for (CeedInt d = 0; d < ncomp; d++)
343                         for (CeedInt i=0; i < elemsize; i++) {
344                             vv[i + elemsize*(d+ncomp*e)] =
345                                 uu[indices[i+elemsize*e]+ndof*d];
346                         }
347                 */
348                 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
349                     (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices, int ndof) {
350                     vv[i + iend*(d+dend*e)] = uu[dindices[i+iend*e]+ndof*d];
351                 }
352             } else { // u is (ncomp x ndof), column-major
353                 /*
354                 for (CeedInt e = 0; e < nelem; e++)
355                     for (CeedInt d = 0; d < ncomp; d++)
356                         for (CeedInt i=0; i< elemsize; i++) {
357                             vv[i + elemsize*(d+ncomp*e)] =
358                                 uu[d+ncomp*indices[i+elemsize*e]];
359                         }
360                 */
361                 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
362                     (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) {
363                     vv[i + iend*(d+dend*e)] = uu[d+dend*dindices[i + iend*e]];
364                 }
365             }
366         }
367     } else {
368         // Note: in transpose mode, we perform: v += r^t * u
369         if (ncomp == 1) {
370             // fprintf(stderr,"3 ---------\n");
371             // for (CeedInt i=0; i<esize; i++) vv[indices[i]] += uu[i];
372             magma_template<<i=0:esize>>
373                 (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) {
374                 magmablas_datomic_add( &vv[dindices[i]], uu[i]);
375             }
376         } else { // u is (elemsize x ncomp x nelem)
377             fprintf(stderr,"2 ---------\n");
378 
379             if (lmode == CEED_NOTRANSPOSE) { // vv is (ndof x ncomp), column-major
380                 /*
381                 for (CeedInt e = 0; e < nelem; e++)
382                     for (CeedInt d = 0; d < ncomp; d++)
383                         for (CeedInt i=0; i < elemsize; i++) {
384                             vv[indices[i + elemsize*e]+ndof*d] +=
385                                 uu[i + elemsize*(d+e*ncomp)];
386                         }
387                 */
388                 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
389                     (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices, CeedInt ndof) {
390                     magmablas_datomic_add( &vv[dindices[i+iend*e]+ndof*d],
391                                            uu[i+iend*(d+e*dend)]);
392                 }
393             } else { // vv is (ncomp x ndof), column-major
394                 /*
395                 for (CeedInt e = 0; e < nelem; e++)
396                     for (CeedInt d = 0; d < ncomp; d++)
397                         for (CeedInt i=0; i < elemsize; i++) {
398                             vv[d+ncomp*indices[i + elemsize*e]] +=
399                                 uu[i + elemsize*(d+e*ncomp)];
400                         }
401                 */
402                 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
403                     (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) {
404                     magmablas_datomic_add( &vv[d+dend*dindices[i + iend*e]],
405                                            uu[i+iend*(d+e*dend)]);
406                 }
407             }
408         }
409     }
410     ierr = CeedVectorRestoreArrayRead(u, &uu); CeedChk(ierr);
411     ierr = CeedVectorRestoreArray(v, &vv); CeedChk(ierr);
412 
413     if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED)
414         *request = NULL;
415     return 0;
416 }
417 
418 static int CeedElemRestrictionDestroy_Magma(CeedElemRestriction r) {
419   CeedElemRestriction_Magma *impl = r->data;
420   int ierr;
421 
422   // Free if we own the data
423   if (impl->own_){
424       ierr = magma_free_pinned(impl->indices); CeedChk(ierr);
425       ierr = magma_free(impl->dindices);       CeedChk(ierr);
426   }
427   else if (impl->down_){
428       ierr = magma_free(impl->dindices);       CeedChk(ierr);
429   }
430   ierr = CeedFree(&r->data); CeedChk(ierr);
431   return 0;
432 }
433 
434 static int CeedElemRestrictionCreate_Magma(CeedElemRestriction r,
435                                            CeedMemType mtype,
436                                            CeedCopyMode cmode,
437                                            const CeedInt *indices) {
438     int ierr, size = r->nelem*r->elemsize;
439     CeedElemRestriction_Magma *impl;
440 
441     // Allocate memory for the MAGMA Restricton and initializa pointers to NULL
442     ierr = CeedCalloc(1,&impl); CeedChk(ierr);
443     impl->dindices = NULL;
444     impl->indices  = NULL;
445     impl->own_ = 0;
446     impl->down_= 0;
447 
448     if (mtype == CEED_MEM_HOST) {
449         // memory is on the host; own_ = 0
450         switch (cmode) {
451         case CEED_COPY_VALUES:
452             ierr = magma_malloc( (void**)&impl->dindices,
453                                   size * sizeof(CeedInt)); CeedChk(ierr);
454             ierr = magma_malloc_pinned( (void**)&impl->indices,
455                                         size * sizeof(CeedInt)); CeedChk(ierr);
456             impl->own_ = 1;
457 
458             if (indices != NULL)
459                 magma_setvector(size, sizeof(CeedInt),
460                                 indices, 1, impl->dindices, 1);
461             break;
462         case CEED_OWN_POINTER:
463             ierr = magma_malloc( (void**)&impl->dindices,
464                                  size * sizeof(CeedInt)); CeedChk(ierr);
465             // TODO: possible problem here is if we are passed non-pinned memory;
466             //       (as we own it, lter in destroy, we use free for pinned memory).
467             impl->indices = (CeedInt *)indices;
468             impl->own_ = 1;
469 
470             if (indices != NULL)
471                 magma_setvector(size, sizeof(CeedInt),
472                                 indices, 1, impl->dindices, 1);
473             break;
474         case CEED_USE_POINTER:
475             ierr = magma_malloc( (void**)&impl->dindices,
476                                  size * sizeof(CeedInt)); CeedChk(ierr);
477             magma_setvector(size, sizeof(CeedInt),
478                             indices, 1, impl->dindices, 1);
479             impl->down_ = 1;
480             impl->indices  = (CeedInt *)indices;
481         }
482     } else if (mtype == CEED_MEM_DEVICE) {
483         // memory is on the device; own = 0
484         switch (cmode) {
485         case CEED_COPY_VALUES:
486             ierr = magma_malloc( (void**)&impl->dindices,
487                                  size * sizeof(CeedInt)); CeedChk(ierr);
488             ierr = magma_malloc_pinned( (void**)&impl->indices,
489                                         size * sizeof(CeedInt)); CeedChk(ierr);
490             impl->own_ = 1;
491 
492             if (indices)
493                 magma_copyvector(size, sizeof(CeedInt),
494                                  indices, 1, impl->dindices, 1);
495             break;
496         case CEED_OWN_POINTER:
497             impl->dindices = (CeedInt *)indices;
498             ierr = magma_malloc_pinned( (void**)&impl->indices,
499                                         size * sizeof(CeedInt)); CeedChk(ierr);
500             impl->own_ = 1;
501 
502             break;
503         case CEED_USE_POINTER:
504             impl->dindices = (CeedInt *)indices;
505             impl->indices  = NULL;
506         }
507 
508     } else
509         return CeedError(r->ceed, 1, "Only MemType = HOST or DEVICE supported");
510 
511     r->data    = impl;
512     r->Apply   = CeedElemRestrictionApply_Magma;
513     r->Destroy = CeedElemRestrictionDestroy_Magma;
514 
515     return 0;
516 }
517 
518 // Contracts on the middle index
519 // NOTRANSPOSE: V_ajc = T_jb U_abc
520 // TRANSPOSE:   V_ajc = T_bj U_abc
521 // If Add != 0, "=" is replaced by "+="
522 static int CeedTensorContract_Magma(Ceed ceed,
523                                     CeedInt A, CeedInt B, CeedInt C, CeedInt J,
524                                     const CeedScalar *t, CeedTransposeMode tmode,
525                                     const CeedInt Add,
526                                     const CeedScalar *u, CeedScalar *v) {
527     CeedInt tstride0 = B, tstride1 = 1;
528     if (tmode == CEED_TRANSPOSE) {
529         tstride0 = 1; tstride1 = J;
530     }
531 
532     for (CeedInt a=0; a<A; a++) {
533         for (CeedInt j=0; j<J; j++) {
534             if (!Add) {
535                 for (CeedInt c=0; c<C; c++)
536                     v[(a*J+j)*C+c] = 0;
537             }
538             for (CeedInt b=0; b<B; b++) {
539                 for (CeedInt c=0; c<C; c++) {
540                     v[(a*J+j)*C+c] += t[j*tstride0 + b*tstride1] * u[(a*B+b)*C+c];
541                 }
542             }
543         }
544     }
545     return 0;
546 }
547 
548 static int CeedBasisApply_Magma(CeedBasis basis, CeedTransposeMode tmode,
549                                 CeedEvalMode emode,
550                                 const CeedScalar *u, CeedScalar *v) {
551     int ierr;
552     const CeedInt dim = basis->dim;
553     const CeedInt ndof = basis->ndof;
554     const CeedInt nqpt = ndof*CeedPowInt(basis->Q1d, dim);
555     const CeedInt add = (tmode == CEED_TRANSPOSE);
556 
557     if (tmode == CEED_TRANSPOSE) {
558         const CeedInt vsize = ndof*CeedPowInt(basis->P1d, dim);
559         for (CeedInt i = 0; i < vsize; i++)
560             v[i] = (CeedScalar) 0;
561     }
562     if (emode & CEED_EVAL_INTERP) {
563         CeedInt P = basis->P1d, Q = basis->Q1d;
564         if (tmode == CEED_TRANSPOSE) {
565             P = basis->Q1d; Q = basis->P1d;
566         }
567         CeedInt pre = ndof*CeedPowInt(P, dim-1), post = 1;
568         CeedScalar tmp[2][ndof*Q*CeedPowInt(P>Q?P:Q, dim-1)];
569         for (CeedInt d=0; d<dim; d++) {
570             ierr = CeedTensorContract_Magma(basis->ceed, pre, P, post, Q, basis->interp1d,
571                                             tmode, add&&(d==dim-1),
572                                             d==0?u:tmp[d%2], d==dim-1?v:tmp[(d+1)%2]);
573             CeedChk(ierr);
574             pre /= P;
575             post *= Q;
576         }
577         if (tmode == CEED_NOTRANSPOSE) {
578             v += nqpt;
579         } else {
580             u += nqpt;
581         }
582     }
583     if (emode & CEED_EVAL_GRAD) {
584         CeedInt P = basis->P1d, Q = basis->Q1d;
585         // In CEED_NOTRANSPOSE mode:
586         // u is (P^dim x nc), column-major layout (nc = ndof)
587         // v is (Q^dim x nc x dim), column-major layout (nc = ndof)
588         // In CEED_TRANSPOSE mode, the sizes of u and v are switched.
589         if (tmode == CEED_TRANSPOSE) {
590             P = basis->Q1d, Q = basis->P1d;
591         }
592         CeedScalar tmp[2][ndof*Q*CeedPowInt(P>Q?P:Q, dim-1)];
593         for (CeedInt p = 0; p < dim; p++) {
594             CeedInt pre = ndof*CeedPowInt(P, dim-1), post = 1;
595             for (CeedInt d=0; d<dim; d++) {
596                 ierr = CeedTensorContract_Magma(basis->ceed, pre, P, post, Q,
597                                                 (p==d)?basis->grad1d:basis->interp1d,
598                                                 tmode, add&&(d==dim-1),
599                                                 d==0?u:tmp[d%2], d==dim-1?v:tmp[(d+1)%2]);
600                 CeedChk(ierr);
601                 pre /= P;
602                 post *= Q;
603             }
604             if (tmode == CEED_NOTRANSPOSE) {
605                 v += nqpt;
606             } else {
607                 u += nqpt;
608             }
609         }
610     }
611     if (emode & CEED_EVAL_WEIGHT) {
612         if (tmode == CEED_TRANSPOSE)
613             return CeedError(basis->ceed, 1,
614                              "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
615         CeedInt Q = basis->Q1d;
616         for (CeedInt d=0; d<dim; d++) {
617             CeedInt pre = CeedPowInt(Q, dim-d-1), post = CeedPowInt(Q, d);
618             for (CeedInt i=0; i<pre; i++) {
619                 for (CeedInt j=0; j<Q; j++) {
620                     for (CeedInt k=0; k<post; k++) {
621                         v[(i*Q + j)*post + k] = basis->qweight1d[j]
622                             * (d == 0 ? 1 : v[(i*Q + j)*post + k]);
623                     }
624                 }
625             }
626         }
627     }
628     return 0;
629 }
630 
631 static int CeedBasisDestroy_Magma(CeedBasis basis) {
632     return 0;
633 }
634 
635 static int CeedBasisCreateTensorH1_Magma(Ceed ceed, CeedInt dim, CeedInt P1d,
636                                          CeedInt Q1d, const CeedScalar *interp1d,
637                                          const CeedScalar *grad1d,
638                                          const CeedScalar *qref1d,
639                                          const CeedScalar *qweight1d,
640                                          CeedBasis basis) {
641     basis->Apply = CeedBasisApply_Magma;
642     basis->Destroy = CeedBasisDestroy_Magma;
643     return 0;
644 }
645 
646 static int CeedQFunctionApply_Magma(CeedQFunction qf, void *qdata, CeedInt Q,
647                                     const CeedScalar *const *u,
648                                     CeedScalar *const *v) {
649     int ierr;
650     ierr = qf->function(qf->ctx, qdata, Q, u, v); CeedChk(ierr);
651     return 0;
652 }
653 
654 static int CeedQFunctionDestroy_Magma(CeedQFunction qf) {
655     return 0;
656 }
657 
658 static int CeedQFunctionCreate_Magma(CeedQFunction qf) {
659     qf->Apply = CeedQFunctionApply_Magma;
660     qf->Destroy = CeedQFunctionDestroy_Magma;
661     return 0;
662 }
663 
664 static int CeedOperatorDestroy_Magma(CeedOperator op) {
665     CeedOperator_Magma *impl = op->data;
666     int ierr;
667 
668     ierr = CeedVectorDestroy(&impl->etmp); CeedChk(ierr);
669     ierr = CeedVectorDestroy(&impl->qdata); CeedChk(ierr);
670     ierr = CeedFree(&op->data); CeedChk(ierr);
671     return 0;
672 }
673 
674 static int CeedOperatorApply_Magma(CeedOperator op, CeedVector qdata,
675                                    CeedVector ustate,
676                                    CeedVector residual, CeedRequest *request) {
677     CeedOperator_Magma *impl = op->data;
678     CeedVector etmp;
679     CeedInt Q;
680     const CeedInt nc = op->basis->ndof, dim = op->basis->dim;
681     CeedScalar *Eu;
682     char *qd;
683     int ierr;
684     CeedTransposeMode lmode = CEED_NOTRANSPOSE;
685 
686     if (!impl->etmp) {
687         ierr = CeedVectorCreate(op->ceed,
688                                 nc * op->Erestrict->nelem * op->Erestrict->elemsize,
689                                 &impl->etmp); CeedChk(ierr);
690         // etmp is allocated when CeedVectorGetArray is called below
691     }
692     etmp = impl->etmp;
693     if (op->qf->inmode & ~CEED_EVAL_WEIGHT) {
694         ierr = CeedElemRestrictionApply(op->Erestrict, CEED_NOTRANSPOSE,
695                                         nc, lmode, ustate, etmp,
696                                         CEED_REQUEST_IMMEDIATE); CeedChk(ierr);
697     }
698     ierr = CeedBasisGetNumQuadraturePoints(op->basis, &Q); CeedChk(ierr);
699     ierr = CeedVectorGetArray(etmp, CEED_MEM_HOST, &Eu); CeedChk(ierr);
700     ierr = CeedVectorGetArray(qdata, CEED_MEM_HOST, (CeedScalar**)&qd);
701     CeedChk(ierr);
702     for (CeedInt e=0; e<op->Erestrict->nelem; e++) {
703         CeedScalar BEu[Q*nc*(dim+2)], BEv[Q*nc*(dim+2)], *out[5] = {0,0,0,0,0};
704         const CeedScalar *in[5] = {0,0,0,0,0};
705         // TODO: quadrature weights can be computed just once
706         ierr = CeedBasisApply(op->basis, CEED_NOTRANSPOSE, op->qf->inmode,
707                               &Eu[e*op->Erestrict->elemsize*nc], BEu);
708         CeedChk(ierr);
709         CeedScalar *u_ptr = BEu, *v_ptr = BEv;
710         if (op->qf->inmode & CEED_EVAL_INTERP) { in[0] = u_ptr; u_ptr += Q*nc; }
711         if (op->qf->inmode & CEED_EVAL_GRAD) { in[1] = u_ptr; u_ptr += Q*nc*dim; }
712         if (op->qf->inmode & CEED_EVAL_WEIGHT) { in[4] = u_ptr; u_ptr += Q; }
713         if (op->qf->outmode & CEED_EVAL_INTERP) { out[0] = v_ptr; v_ptr += Q*nc; }
714         if (op->qf->outmode & CEED_EVAL_GRAD) { out[1] = v_ptr; v_ptr += Q*nc*dim; }
715         ierr = CeedQFunctionApply(op->qf, &qd[e*Q*op->qf->qdatasize], Q, in, out);
716         CeedChk(ierr);
717         ierr = CeedBasisApply(op->basis, CEED_TRANSPOSE, op->qf->outmode, BEv,
718                               &Eu[e*op->Erestrict->elemsize*nc]);
719         CeedChk(ierr);
720     }
721     ierr = CeedVectorRestoreArray(etmp, &Eu); CeedChk(ierr);
722     if (residual) {
723         CeedScalar *res;
724         CeedVectorGetArray(residual, CEED_MEM_HOST, &res);
725         for (int i = 0; i < residual->length; i++)
726             res[i] = (CeedScalar)0;
727         ierr = CeedElemRestrictionApply(op->Erestrict, CEED_TRANSPOSE,
728                                         nc, lmode, etmp, residual,
729                                         CEED_REQUEST_IMMEDIATE); CeedChk(ierr);
730     }
731     if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED)
732         *request = NULL;
733     return 0;
734 }
735 
736 static int CeedOperatorGetQData_Magma(CeedOperator op, CeedVector *qdata) {
737     CeedOperator_Magma *impl = op->data;
738     int ierr;
739 
740     if (!impl->qdata) {
741         CeedInt Q;
742         ierr = CeedBasisGetNumQuadraturePoints(op->basis, &Q); CeedChk(ierr);
743         ierr = CeedVectorCreate(op->ceed,
744                                 op->Erestrict->nelem * Q
745                                 * op->qf->qdatasize / sizeof(CeedScalar),
746                                 &impl->qdata); CeedChk(ierr);
747     }
748     *qdata = impl->qdata;
749     return 0;
750 }
751 
752 static int CeedOperatorCreate_Magma(CeedOperator op) {
753     CeedOperator_Magma *impl;
754     int ierr;
755 
756     ierr = CeedCalloc(1, &impl); CeedChk(ierr);
757     op->data = impl;
758     op->Destroy  = CeedOperatorDestroy_Magma;
759     op->Apply    = CeedOperatorApply_Magma;
760     op->GetQData = CeedOperatorGetQData_Magma;
761     return 0;
762 }
763 
764 // *****************************************************************************
765 // * INIT
766 // *****************************************************************************
767 static int CeedInit_Magma(const char *resource, Ceed ceed) {
768     if (strcmp(resource, "/cpu/magma")
769         && strcmp(resource, "/gpu/magma"))
770         return CeedError(ceed, 1, "MAGMA backend cannot use resource: %s", resource);
771 
772     magma_init();
773     //magma_print_environment();
774 
775     ceed->VecCreate = CeedVectorCreate_Magma;
776     ceed->BasisCreateTensorH1 = CeedBasisCreateTensorH1_Magma;
777     ceed->ElemRestrictionCreate = CeedElemRestrictionCreate_Magma;
778     ceed->QFunctionCreate = CeedQFunctionCreate_Magma;
779     ceed->OperatorCreate = CeedOperatorCreate_Magma;
780     return 0;
781 }
782 
783 // *****************************************************************************
784 // * REGISTER
785 // *****************************************************************************
786 __attribute__((constructor))
787 static void Register(void) {
788     CeedRegister("/cpu/magma", CeedInit_Magma);
789     CeedRegister("/gpu/magma", CeedInit_Magma);
790 }
791