xref: /libCEED/rust/libceed-sys/c-src/backends/magma/ceed-magma.c (revision 38612b0832b79d51c892509d4785901c2cb6baa5)
1 // Copyright (c) 2017, Lawrence Livermore National Security, LLC. Produced at
2 // the Lawrence Livermore National Laboratory. LLNL-CODE-734707. All Rights
3 // reserved. See files LICENSE and NOTICE for details.
4 //
5 // This file is part of CEED, a collection of benchmarks, miniapps, software
6 // libraries and APIs for efficient high-order finite element and spectral
7 // element discretizations for exascale applications. For more information and
8 // source code availability see http://github.com/ceed.
9 //
10 // The CEED research is supported by the Exascale Computing Project 17-SC-20-SC,
11 // a collaborative effort of two U.S. Department of Energy organizations (Office
12 // of Science and the National Nuclear Security Administration) responsible for
13 // the planning and preparation of a capable exascale ecosystem, including
14 // software, applications, hardware, advanced system engineering and early
15 // testbed platforms, in support of the nation's exascale computing imperative.
16 
17 #include <ceed-impl.h>
18 #include <string.h>
19 #include "magma.h"
20 #include "ceed-magma.h"
21 
22 typedef struct {
23     CeedScalar *array;
24     CeedScalar *darray;
25     int  own_;
26     int down_;
27 } CeedVector_Magma;
28 
29 typedef struct {
30     CeedInt *indices;
31     CeedInt *dindices;
32     int  own_;
33     int down_;            // cover a case where we own Device memory
34 } CeedElemRestriction_Magma;
35 
36 typedef struct {
37     CeedVector etmp;
38     CeedVector qdata;
39 } CeedOperator_Magma;
40 
41 // *****************************************************************************
42 // * Initialize vector vec (after free mem) with values from array based on cmode
43 // *   CEED_COPY_VALUES: memory is allocated in vec->array_allocated, made equal
44 // *                     to array, and data is copied (not store passed pointer)
45 // *   CEED_OWN_POINTER: vec->data->array_allocated and vec->data->array = array
46 // *   CEED_USE_POINTER: vec->data->array = array (can modify; no ownership)
47 // * mtype: CEED_MEM_HOST or CEED_MEM_DEVICE
48 // *****************************************************************************
49 static int CeedVectorSetArray_Magma(CeedVector vec, CeedMemType mtype,
50                                     CeedCopyMode cmode, CeedScalar *array) {
51     CeedVector_Magma *impl = vec->data;
52     int ierr;
53 
54     // If own data, free the "old" data, e.g., as it may be of different size
55     if (impl->own_){
56         magma_free( impl->darray );
57         magma_free_pinned( impl->array );
58         impl->darray = NULL;
59         impl->array  = NULL;
60         impl->own_ = 0;
61         impl->down_= 0;
62     }
63 
64     if (mtype == CEED_MEM_HOST) {
65         // memory is on the host; own_ = 0
66         switch (cmode) {
67         case CEED_COPY_VALUES:
68             ierr = magma_malloc( (void**)&impl->darray,
69                                  vec->length * sizeof(CeedScalar)); CeedChk(ierr);
70             ierr = magma_malloc_pinned( (void**)&impl->array,
71                                         vec->length * sizeof(CeedScalar)); CeedChk(ierr);
72             impl->own_ = 1;
73 
74             if (array != NULL)
75                 magma_setvector(vec->length, sizeof(array[0]),
76                                 array, 1, impl->darray, 1);
77             break;
78         case CEED_OWN_POINTER:
79             ierr = magma_malloc( (void**)&impl->darray,
80                                  vec->length * sizeof(CeedScalar)); CeedChk(ierr);
81             // TODO: possible problem here is if we are passed non-pinned memory;
82             //       (as we own it, lter in destroy, we use free for pinned memory).
83             impl->array = array;
84             impl->own_ = 1;
85 
86             if (array != NULL)
87                 magma_setvector(vec->length, sizeof(array[0]),
88                                 array, 1, impl->darray, 1);
89             break;
90         case CEED_USE_POINTER:
91             ierr = magma_malloc( (void**)&impl->darray,
92                                  vec->length * sizeof(CeedScalar)); CeedChk(ierr);
93             magma_setvector(vec->length, sizeof(array[0]),
94                             array, 1, impl->darray, 1);
95             impl->array  = array;
96         }
97     } else if (mtype == CEED_MEM_DEVICE) {
98         // memory is on the device; own = 0
99         switch (cmode) {
100         case CEED_COPY_VALUES:
101             ierr = magma_malloc( (void**)&impl->darray,
102                                 vec->length * sizeof(CeedScalar)); CeedChk(ierr);
103             ierr = magma_malloc_pinned( (void**)&impl->array,
104                                         vec->length * sizeof(CeedScalar)); CeedChk(ierr);
105             impl->own_ = 1;
106 
107             if (array)
108                 magma_copyvector(vec->length, sizeof(array[0]),
109                                  array, 1, impl->darray, 1);
110             break;
111         case CEED_OWN_POINTER:
112             impl->darray = array;
113             ierr = magma_malloc_pinned( (void**)&impl->array,
114                                         vec->length * sizeof(CeedScalar)); CeedChk(ierr);
115             impl->own_ = 1;
116 
117             break;
118         case CEED_USE_POINTER:
119             impl->darray = array;
120             impl->array  = NULL;
121         }
122 
123     } else
124         return CeedError(vec->ceed, 1, "Only MemType = HOST or DEVICE supported");
125 
126     return 0;
127 }
128 
129 // *****************************************************************************
130 // * Give data pointer from vector vec to array (on HOST or DEVICE)
131 // *****************************************************************************
132 static int CeedVectorGetArray_Magma(CeedVector vec, CeedMemType mtype,
133                                     CeedScalar **array) {
134     CeedVector_Magma *impl = vec->data;
135     int ierr;
136 
137     if (mtype == CEED_MEM_HOST) {
138         if (impl->own_) {
139             // data is owned so GPU had the most up-to-date version; copy it
140             magma_getvector(vec->length, sizeof(*array[0]),
141                             impl->darray, 1, impl->array, 1);
142         } else if (impl->array == NULL) {
143             // Vector doesn't own the data and was set on GPU
144             if (impl->darray == NULL) {
145                 // call was made just to allocate memory
146                 ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
147                 CeedChk(ierr);
148             } else
149                 return CeedError(vec->ceed, 1, "Can not access DEVICE vector on HOST");
150         }
151         *array = impl->array;
152     } else if (mtype == CEED_MEM_DEVICE) {
153         if (impl->darray == NULL){
154             // Vector doesn't own the data and was set on the CPU
155             if (impl->array == NULL) {
156                 // call was made just to allocate memory
157                 ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
158                 CeedChk(ierr);
159             } else
160                 return CeedError(vec->ceed, 1, "Can not access HOST vector on DEVICE");
161         }
162         *array = impl->darray;
163     } else
164         return CeedError(vec->ceed, 1, "Can only provide to HOST or DEVICE memory");
165 
166     return 0;
167 }
168 
169 // *****************************************************************************
170 // * Give data pointer from vector vec to array (on HOST or DEVICE) to read it
171 // *****************************************************************************
172 static int CeedVectorGetArrayRead_Magma(CeedVector vec, CeedMemType mtype,
173                                         const CeedScalar **array) {
174     CeedVector_Magma *impl = vec->data;
175     int ierr;
176 
177     if (mtype == CEED_MEM_HOST) {
178         if (impl->own_) {
179             // data is owned so GPU had the most up-to-date version; copy it
180             magma_getvector(vec->length, sizeof(*array[0]),
181                             impl->darray, 1, impl->array, 1);
182         } else if (impl->array == NULL) {
183             // Vector doesn't own the data and was set on GPU
184             if (impl->darray == NULL) {
185                 // call was made just to allocate memory
186                 ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
187                 CeedChk(ierr);
188             } else
189                 return CeedError(vec->ceed, 1, "Can not access DEVICE vector on HOST");
190         }
191         *array = impl->array;
192     } else if (mtype == CEED_MEM_DEVICE) {
193         if (impl->darray == NULL){
194             // Vector doesn't own the data and was set on the CPU
195             if (impl->array == NULL) {
196                 // call was made just to allocate memory
197                 ierr = CeedVectorSetArray(vec, mtype, CEED_COPY_VALUES, NULL);
198                 CeedChk(ierr);
199             } else
200                 return CeedError(vec->ceed, 1, "Can not access HOST vector on DEVICE");
201         }
202         *array = impl->darray;
203     } else
204         return CeedError(vec->ceed, 1, "Can only provide to HOST or DEVICE memory");
205 
206     return 0;
207 }
208 
209 // *****************************************************************************
210 // * There is no mtype here for array so it is not clear if we restore from HOST
211 // * memory or from DEVICE memory. We assume that it is CPU memory because if
212 // * it was GPU memory we would not call this routine at all.
213 // * Restore vector vec with values from array, where array received its values
214 // * from vec and possibly modified them.
215 // *****************************************************************************
216 static int CeedVectorRestoreArray_Magma(CeedVector vec, CeedScalar **array) {
217     CeedVector_Magma *impl = vec->data;
218 
219     // Check if the array is a CPU pointer
220     if (*array == impl->array) {
221         // Update device, if the device pointer is not NULL
222         if (impl->darray != NULL) {
223             magma_setvector(vec->length, sizeof(*array[0]),
224                             *array, 1, impl->darray, 1);
225         } else
226             ; // nothing to do (case of CPU use pointer)
227 
228     } else if (impl->down_) {
229         // nothing to do if array is on GPU, except if down_=1(case CPU use pointer)
230         magma_getvector(vec->length, sizeof(*array[0]),
231                         impl->darray, 1, impl->array, 1);
232     }
233 
234     *array = NULL;
235     return 0;
236 }
237 
238 // *****************************************************************************
239 // * There is no mtype here for array so it is not clear if we restore from HOST
240 // * memory or from DEVICE memory. We assume that it is CPU memory because if
241 // * it was GPU memory we would not call this routine at all.
242 // * Restore vector vec with values from array, where array received its values
243 // * from vec to only read them; in this case vec may have been modified meanwhile
244 // * and needs to be restored here.
245 // *****************************************************************************
246 static int CeedVectorRestoreArrayRead_Magma(CeedVector vec,
247                                             const CeedScalar **array) {
248     CeedVector_Magma *impl = vec->data;
249 
250     // Check if the array is a CPU pointer
251     if (*array == impl->array) {
252         // Update device, if the device pointer is not NULL
253         if (impl->darray != NULL)
254             magma_setvector(vec->length, sizeof(*array[0]),
255                             *array, 1, impl->darray, 1);
256         else
257             ; // nothing to do (case of CPU use pointer)
258 
259     } else if (impl->down_) {
260         // nothing to do if array is on GPU, except if down_=1(case CPU use pointer)
261         magma_getvector(vec->length, sizeof(*array[0]),
262                         impl->darray, 1, impl->array, 1);
263     }
264 
265     *array = NULL;
266     return 0;
267 }
268 
269 static int CeedVectorDestroy_Magma(CeedVector vec) {
270     CeedVector_Magma *impl = vec->data;
271     int ierr;
272 
273     // Free if we own the data
274     if (impl->own_){
275         ierr = magma_free_pinned(impl->array); CeedChk(ierr);
276         ierr = magma_free(impl->darray);       CeedChk(ierr);
277     } else if (impl->down_){
278         ierr = magma_free(impl->darray);       CeedChk(ierr);
279     }
280     ierr = CeedFree(&vec->data); CeedChk(ierr);
281     return 0;
282 }
283 
284 // *****************************************************************************
285 // * Create vector vec of size n
286 // *****************************************************************************
287 static int CeedVectorCreate_Magma(Ceed ceed, CeedInt n, CeedVector vec) {
288     CeedVector_Magma *impl;
289     int ierr;
290 
291     vec->SetArray = CeedVectorSetArray_Magma;
292     vec->GetArray = CeedVectorGetArray_Magma;
293     vec->GetArrayRead = CeedVectorGetArrayRead_Magma;
294     vec->RestoreArray = CeedVectorRestoreArray_Magma;
295     vec->RestoreArrayRead = CeedVectorRestoreArrayRead_Magma;
296     vec->Destroy = CeedVectorDestroy_Magma;
297     ierr = CeedCalloc(1,&impl); CeedChk(ierr);
298     impl->darray = NULL;
299     impl->array  = NULL;
300     impl->own_ = 0;
301     impl->down_= 0;
302     vec->data = impl;
303     return 0;
304 }
305 
306 // *****************************************************************************
307 // * Apply restriction operator r to u: v = r(rmode) u
308 // *****************************************************************************
309 static int CeedElemRestrictionApply_Magma(CeedElemRestriction r,
310                                           CeedTransposeMode tmode, CeedInt ncomp,
311                                           CeedTransposeMode lmode, CeedVector u,
312                                           CeedVector v, CeedRequest *request) {
313     CeedElemRestriction_Magma *impl = r->data;
314     int ierr;
315     const CeedScalar *uu;
316     CeedScalar *vv;
317     CeedInt nelem = r->nelem, elemsize = r->elemsize, ndof = r->ndof;
318     CeedInt esize = nelem * elemsize;
319     // CeedInt *indices = impl->indices;
320     CeedInt *dindices = impl->dindices;
321 
322     //ierr = CeedVectorGetArrayRead(u, CEED_MEM_HOST, &uu); CeedChk(ierr);
323     //ierr = CeedVectorGetArray(v, CEED_MEM_HOST, &vv); CeedChk(ierr);
324 
325     // Get pointers on the device
326     ierr = CeedVectorGetArrayRead(u, CEED_MEM_DEVICE, &uu); CeedChk(ierr);
327     ierr = CeedVectorGetArray(v, CEED_MEM_DEVICE, &vv); CeedChk(ierr);
328 
329     if (tmode == CEED_NOTRANSPOSE) {
330         // Perform: v = r * u
331         if (ncomp == 1) {
332             //for (CeedInt i=0; i<esize; i++) vv[i] = uu[indices[i]];
333 
334             magma_template<<i=0:esize>>
335                 (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices){
336                 vv[i] = uu[dindices[i]];
337             }
338         } else {
339             // vv is (elemsize x ncomp x nelem), column-major
340             if (lmode == CEED_NOTRANSPOSE) { // u is (ndof x ncomp), column-major
341                 /*
342                 for (CeedInt e = 0; e < nelem; e++)
343                     for (CeedInt d = 0; d < ncomp; d++)
344                         for (CeedInt i=0; i < elemsize; i++) {
345                             vv[i + elemsize*(d+ncomp*e)] =
346                                 uu[indices[i+elemsize*e]+ndof*d];
347                         }
348                 */
349                 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
350                     (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices, int ndof) {
351                     vv[i + iend*(d+dend*e)] = uu[dindices[i+iend*e]+ndof*d];
352                 }
353             } else { // u is (ncomp x ndof), column-major
354                 /*
355                 for (CeedInt e = 0; e < nelem; e++)
356                     for (CeedInt d = 0; d < ncomp; d++)
357                         for (CeedInt i=0; i< elemsize; i++) {
358                             vv[i + elemsize*(d+ncomp*e)] =
359                                 uu[d+ncomp*indices[i+elemsize*e]];
360                         }
361                 */
362                 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
363                     (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) {
364                     vv[i + iend*(d+dend*e)] = uu[d+dend*dindices[i + iend*e]];
365                 }
366             }
367         }
368     } else {
369         // Note: in transpose mode, we perform: v += r^t * u
370         if (ncomp == 1) {
371             // fprintf(stderr,"3 ---------\n");
372             // for (CeedInt i=0; i<esize; i++) vv[indices[i]] += uu[i];
373             magma_template<<i=0:esize>>
374                 (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) {
375                 magmablas_datomic_add( &vv[dindices[i]], uu[i]);
376             }
377         } else { // u is (elemsize x ncomp x nelem)
378             fprintf(stderr,"2 ---------\n");
379 
380             if (lmode == CEED_NOTRANSPOSE) { // vv is (ndof x ncomp), column-major
381                 /*
382                 for (CeedInt e = 0; e < nelem; e++)
383                     for (CeedInt d = 0; d < ncomp; d++)
384                         for (CeedInt i=0; i < elemsize; i++) {
385                             vv[indices[i + elemsize*e]+ndof*d] +=
386                                 uu[i + elemsize*(d+e*ncomp)];
387                         }
388                 */
389                 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
390                     (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices, CeedInt ndof) {
391                     magmablas_datomic_add( &vv[dindices[i+iend*e]+ndof*d],
392                                            uu[i+iend*(d+e*dend)]);
393                 }
394             } else { // vv is (ncomp x ndof), column-major
395                 /*
396                 for (CeedInt e = 0; e < nelem; e++)
397                     for (CeedInt d = 0; d < ncomp; d++)
398                         for (CeedInt i=0; i < elemsize; i++) {
399                             vv[d+ncomp*indices[i + elemsize*e]] +=
400                                 uu[i + elemsize*(d+e*ncomp)];
401                         }
402                 */
403                 magma_template<<e=0:nelem, d=0:ncomp, i=0:elemsize>>
404                     (const CeedScalar *uu, CeedScalar *vv, CeedInt *dindices) {
405                     magmablas_datomic_add( &vv[d+dend*dindices[i + iend*e]],
406                                            uu[i+iend*(d+e*dend)]);
407                 }
408             }
409         }
410     }
411     ierr = CeedVectorRestoreArrayRead(u, &uu); CeedChk(ierr);
412     ierr = CeedVectorRestoreArray(v, &vv); CeedChk(ierr);
413 
414     if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED)
415         *request = NULL;
416     return 0;
417 }
418 
419 static int CeedElemRestrictionDestroy_Magma(CeedElemRestriction r) {
420   CeedElemRestriction_Magma *impl = r->data;
421   int ierr;
422 
423   // Free if we own the data
424   if (impl->own_){
425       ierr = magma_free_pinned(impl->indices); CeedChk(ierr);
426       ierr = magma_free(impl->dindices);       CeedChk(ierr);
427   }
428   else if (impl->down_){
429       ierr = magma_free(impl->dindices);       CeedChk(ierr);
430   }
431   ierr = CeedFree(&r->data); CeedChk(ierr);
432   return 0;
433 }
434 
435 static int CeedElemRestrictionCreate_Magma(CeedElemRestriction r,
436                                            CeedMemType mtype,
437                                            CeedCopyMode cmode,
438                                            const CeedInt *indices) {
439     int ierr, size = r->nelem*r->elemsize;
440     CeedElemRestriction_Magma *impl;
441 
442     // Allocate memory for the MAGMA Restricton and initializa pointers to NULL
443     ierr = CeedCalloc(1,&impl); CeedChk(ierr);
444     impl->dindices = NULL;
445     impl->indices  = NULL;
446     impl->own_ = 0;
447     impl->down_= 0;
448 
449     if (mtype == CEED_MEM_HOST) {
450         // memory is on the host; own_ = 0
451         switch (cmode) {
452         case CEED_COPY_VALUES:
453             ierr = magma_malloc( (void**)&impl->dindices,
454                                   size * sizeof(CeedInt)); CeedChk(ierr);
455             ierr = magma_malloc_pinned( (void**)&impl->indices,
456                                         size * sizeof(CeedInt)); CeedChk(ierr);
457             impl->own_ = 1;
458 
459             if (indices != NULL)
460                 magma_setvector(size, sizeof(CeedInt),
461                                 indices, 1, impl->dindices, 1);
462             break;
463         case CEED_OWN_POINTER:
464             ierr = magma_malloc( (void**)&impl->dindices,
465                                  size * sizeof(CeedInt)); CeedChk(ierr);
466             // TODO: possible problem here is if we are passed non-pinned memory;
467             //       (as we own it, lter in destroy, we use free for pinned memory).
468             impl->indices = (CeedInt *)indices;
469             impl->own_ = 1;
470 
471             if (indices != NULL)
472                 magma_setvector(size, sizeof(CeedInt),
473                                 indices, 1, impl->dindices, 1);
474             break;
475         case CEED_USE_POINTER:
476             ierr = magma_malloc( (void**)&impl->dindices,
477                                  size * sizeof(CeedInt)); CeedChk(ierr);
478             magma_setvector(size, sizeof(CeedInt),
479                             indices, 1, impl->dindices, 1);
480             impl->down_ = 1;
481             impl->indices  = (CeedInt *)indices;
482         }
483     } else if (mtype == CEED_MEM_DEVICE) {
484         // memory is on the device; own = 0
485         switch (cmode) {
486         case CEED_COPY_VALUES:
487             ierr = magma_malloc( (void**)&impl->dindices,
488                                  size * sizeof(CeedInt)); CeedChk(ierr);
489             ierr = magma_malloc_pinned( (void**)&impl->indices,
490                                         size * sizeof(CeedInt)); CeedChk(ierr);
491             impl->own_ = 1;
492 
493             if (indices)
494                 magma_copyvector(size, sizeof(CeedInt),
495                                  indices, 1, impl->dindices, 1);
496             break;
497         case CEED_OWN_POINTER:
498             impl->dindices = (CeedInt *)indices;
499             ierr = magma_malloc_pinned( (void**)&impl->indices,
500                                         size * sizeof(CeedInt)); CeedChk(ierr);
501             impl->own_ = 1;
502 
503             break;
504         case CEED_USE_POINTER:
505             impl->dindices = (CeedInt *)indices;
506             impl->indices  = NULL;
507         }
508 
509     } else
510         return CeedError(r->ceed, 1, "Only MemType = HOST or DEVICE supported");
511 
512     r->data    = impl;
513     r->Apply   = CeedElemRestrictionApply_Magma;
514     r->Destroy = CeedElemRestrictionDestroy_Magma;
515 
516     return 0;
517 }
518 
519 // Contracts on the middle index
520 // NOTRANSPOSE: V_ajc = T_jb U_abc
521 // TRANSPOSE:   V_ajc = T_bj U_abc
522 // If Add != 0, "=" is replaced by "+="
523 static int CeedTensorContract_Magma(Ceed ceed,
524                                     CeedInt A, CeedInt B, CeedInt C, CeedInt J,
525                                     const CeedScalar *t, CeedTransposeMode tmode,
526                                     const CeedInt Add,
527                                     const CeedScalar *u, CeedScalar *v) {
528     #ifdef USE_MAGMA_BATCH
529     magma_dtensor_contract(ceed, A, B, C, J, t, tmode, Add, u, v);
530     #else
531     CeedInt tstride0 = B, tstride1 = 1;
532     if (tmode == CEED_TRANSPOSE) {
533         tstride0 = 1; tstride1 = J;
534     }
535     CeedDebug("\033[31m[CeedTensorContract] A=%d, J=%d, C=%d, B=%d: %d %d %d",
536               A,J,C,B,A*J*B*C, C*J*A, C*B*A);
537     for (CeedInt a=0; a<A; a++) {
538         for (CeedInt j=0; j<J; j++) {
539             if (!Add) {
540                 for (CeedInt c=0; c<C; c++)
541                     v[(a*J+j)*C+c] = 0;
542             }
543             for (CeedInt b=0; b<B; b++) {
544                 for (CeedInt c=0; c<C; c++) {
545                     v[(a*J+j)*C+c] += t[j*tstride0 + b*tstride1] * u[(a*B+b)*C+c];
546                 }
547             }
548         }
549     }
550     #endif
551     return 0;
552 }
553 
554 static int CeedBasisApply_Magma(CeedBasis basis, CeedTransposeMode tmode,
555                                 CeedEvalMode emode,
556                                 const CeedScalar *u, CeedScalar *v) {
557     int ierr;
558     const CeedInt dim = basis->dim;
559     const CeedInt ndof = basis->ndof;
560     const CeedInt nqpt = ndof*CeedPowInt(basis->Q1d, dim);
561     const CeedInt add = (tmode == CEED_TRANSPOSE);
562 
563     CeedDebug("\033[01m[CeedBasisApply_Magma] vsize=%d",
564               ndof*CeedPowInt(basis->P1d, dim));
565 
566     if (tmode == CEED_TRANSPOSE) {
567         const CeedInt vsize = ndof*CeedPowInt(basis->P1d, dim);
568         for (CeedInt i = 0; i < vsize; i++)
569             v[i] = (CeedScalar) 0;
570     }
571     if (emode & CEED_EVAL_INTERP) {
572         CeedInt P = basis->P1d, Q = basis->Q1d;
573         if (tmode == CEED_TRANSPOSE) {
574             P = basis->Q1d; Q = basis->P1d;
575         }
576         CeedInt pre = ndof*CeedPowInt(P, dim-1), post = 1;
577         CeedScalar tmp[2][ndof*Q*CeedPowInt(P>Q?P:Q, dim-1)];
578         CeedDebug("\033[01m[CeedBasisApply_Magma] tmpsize = %d",
579                   ndof*Q*CeedPowInt(P>Q?P:Q, dim-1));
580         for (CeedInt d=0; d<dim; d++) {
581             ierr = CeedTensorContract_Magma(basis->ceed, pre, P, post, Q, basis->interp1d,
582                                             tmode, add&&(d==dim-1),
583                                             d==0?u:tmp[d%2], d==dim-1?v:tmp[(d+1)%2]);
584             CeedChk(ierr);
585             pre /= P;
586             post *= Q;
587         }
588         if (tmode == CEED_NOTRANSPOSE) {
589             v += nqpt;
590         } else {
591             u += nqpt;
592         }
593     }
594     if (emode & CEED_EVAL_GRAD) {
595         CeedInt P = basis->P1d, Q = basis->Q1d;
596         // In CEED_NOTRANSPOSE mode:
597         // u is (P^dim x nc), column-major layout (nc = ndof)
598         // v is (Q^dim x nc x dim), column-major layout (nc = ndof)
599         // In CEED_TRANSPOSE mode, the sizes of u and v are switched.
600         if (tmode == CEED_TRANSPOSE) {
601             P = basis->Q1d, Q = basis->P1d;
602         }
603         CeedScalar tmp[2][ndof*Q*CeedPowInt(P>Q?P:Q, dim-1)];
604         CeedDebug("\033[01m[CeedBasisApply_Magma] tmpsize = %d",
605                   ndof*Q*CeedPowInt(P>Q?P:Q, dim-1));
606         for (CeedInt p = 0; p < dim; p++) {
607             CeedInt pre = ndof*CeedPowInt(P, dim-1), post = 1;
608             for (CeedInt d=0; d<dim; d++) {
609                 ierr = CeedTensorContract_Magma(basis->ceed, pre, P, post, Q,
610                                                 (p==d)?basis->grad1d:basis->interp1d,
611                                                 tmode, add&&(d==dim-1),
612                                                 d==0?u:tmp[d%2], d==dim-1?v:tmp[(d+1)%2]);
613                 CeedChk(ierr);
614                 pre /= P;
615                 post *= Q;
616             }
617             if (tmode == CEED_NOTRANSPOSE) {
618                 v += nqpt;
619             } else {
620                 u += nqpt;
621             }
622         }
623     }
624     if (emode & CEED_EVAL_WEIGHT) {
625         if (tmode == CEED_TRANSPOSE)
626             return CeedError(basis->ceed, 1,
627                              "CEED_EVAL_WEIGHT incompatible with CEED_TRANSPOSE");
628         CeedInt Q = basis->Q1d;
629         for (CeedInt d=0; d<dim; d++) {
630             CeedInt pre = CeedPowInt(Q, dim-d-1), post = CeedPowInt(Q, d);
631             for (CeedInt i=0; i<pre; i++) {
632                 for (CeedInt j=0; j<Q; j++) {
633                     for (CeedInt k=0; k<post; k++) {
634                         v[(i*Q + j)*post + k] = basis->qweight1d[j]
635                             * (d == 0 ? 1 : v[(i*Q + j)*post + k]);
636                     }
637                 }
638             }
639         }
640     }
641     return 0;
642 }
643 
644 static int CeedBasisDestroy_Magma(CeedBasis basis) {
645     return 0;
646 }
647 
648 static int CeedBasisCreateTensorH1_Magma(Ceed ceed, CeedInt dim, CeedInt P1d,
649                                          CeedInt Q1d, const CeedScalar *interp1d,
650                                          const CeedScalar *grad1d,
651                                          const CeedScalar *qref1d,
652                                          const CeedScalar *qweight1d,
653                                          CeedBasis basis) {
654     basis->Apply = CeedBasisApply_Magma;
655     basis->Destroy = CeedBasisDestroy_Magma;
656     return 0;
657 }
658 
659 static int CeedQFunctionApply_Magma(CeedQFunction qf, void *qdata, CeedInt Q,
660                                     const CeedScalar *const *u,
661                                     CeedScalar *const *v) {
662     int ierr;
663     ierr = qf->function(qf->ctx, qdata, Q, u, v); CeedChk(ierr);
664     return 0;
665 }
666 
667 static int CeedQFunctionDestroy_Magma(CeedQFunction qf) {
668     return 0;
669 }
670 
671 static int CeedQFunctionCreate_Magma(CeedQFunction qf) {
672     qf->Apply = CeedQFunctionApply_Magma;
673     qf->Destroy = CeedQFunctionDestroy_Magma;
674     return 0;
675 }
676 
677 static int CeedOperatorDestroy_Magma(CeedOperator op) {
678     CeedOperator_Magma *impl = op->data;
679     int ierr;
680 
681     ierr = CeedVectorDestroy(&impl->etmp); CeedChk(ierr);
682     ierr = CeedVectorDestroy(&impl->qdata); CeedChk(ierr);
683     ierr = CeedFree(&op->data); CeedChk(ierr);
684     return 0;
685 }
686 
687 static int CeedOperatorApply_Magma(CeedOperator op, CeedVector qdata,
688                                    CeedVector ustate,
689                                    CeedVector residual, CeedRequest *request) {
690     CeedOperator_Magma *impl = op->data;
691     CeedVector etmp;
692     CeedInt Q;
693     const CeedInt nc = op->basis->ndof, dim = op->basis->dim;
694     CeedScalar *Eu;
695     char *qd;
696     int ierr;
697     CeedTransposeMode lmode = CEED_NOTRANSPOSE;
698 
699     if (!impl->etmp) {
700         ierr = CeedVectorCreate(op->ceed,
701                                 nc * op->Erestrict->nelem * op->Erestrict->elemsize,
702                                 &impl->etmp); CeedChk(ierr);
703         // etmp is allocated when CeedVectorGetArray is called below
704     }
705     etmp = impl->etmp;
706     if (op->qf->inmode & ~CEED_EVAL_WEIGHT) {
707         ierr = CeedElemRestrictionApply(op->Erestrict, CEED_NOTRANSPOSE,
708                                         nc, lmode, ustate, etmp,
709                                         CEED_REQUEST_IMMEDIATE); CeedChk(ierr);
710     }
711     ierr = CeedBasisGetNumQuadraturePoints(op->basis, &Q); CeedChk(ierr);
712     ierr = CeedVectorGetArray(etmp, CEED_MEM_HOST, &Eu); CeedChk(ierr);
713     ierr = CeedVectorGetArray(qdata, CEED_MEM_HOST, (CeedScalar**)&qd);
714     CeedChk(ierr);
715 
716     for (CeedInt e=0; e<op->Erestrict->nelem; e++) {
717         CeedScalar BEu[Q*nc*(dim+2)], BEv[Q*nc*(dim+2)], *out[5] = {0,0,0,0,0};
718         const CeedScalar *in[5] = {0,0,0,0,0};
719         // TODO: quadrature weights can be computed just once
720         CeedDebug("\033[11m[CeedOperatorApply_Magma] e=%d: Eu+%d, %d",
721                   e, e*op->Erestrict->elemsize*nc, Q*nc*(dim+2));
722         ierr = CeedBasisApply(op->basis, CEED_NOTRANSPOSE, op->qf->inmode,
723                               &Eu[e*op->Erestrict->elemsize*nc], BEu);
724         CeedChk(ierr);
725         CeedScalar *u_ptr = BEu, *v_ptr = BEv;
726         if (op->qf->inmode & CEED_EVAL_INTERP) { in[0] = u_ptr; u_ptr += Q*nc; }
727         if (op->qf->inmode & CEED_EVAL_GRAD) { in[1] = u_ptr; u_ptr += Q*nc*dim; }
728         if (op->qf->inmode & CEED_EVAL_WEIGHT) { in[4] = u_ptr; u_ptr += Q; }
729         if (op->qf->outmode & CEED_EVAL_INTERP) { out[0] = v_ptr; v_ptr += Q*nc; }
730         if (op->qf->outmode & CEED_EVAL_GRAD) { out[1] = v_ptr; v_ptr += Q*nc*dim; }
731         ierr = CeedQFunctionApply(op->qf, &qd[e*Q*op->qf->qdatasize], Q, in, out);
732         CeedChk(ierr);
733         CeedDebug("\033[31m[CeedOperatorApply_Magma] e=%d: ",e);
734         ierr = CeedBasisApply(op->basis, CEED_TRANSPOSE, op->qf->outmode, BEv,
735                               &Eu[e*op->Erestrict->elemsize*nc]);
736         CeedChk(ierr);
737     }
738     ierr = CeedVectorRestoreArray(etmp, &Eu); CeedChk(ierr);
739     if (residual) {
740         CeedScalar *res;
741         CeedVectorGetArray(residual, CEED_MEM_HOST, &res);
742         for (int i = 0; i < residual->length; i++)
743             res[i] = (CeedScalar)0;
744         ierr = CeedElemRestrictionApply(op->Erestrict, CEED_TRANSPOSE,
745                                         nc, lmode, etmp, residual,
746                                         CEED_REQUEST_IMMEDIATE); CeedChk(ierr);
747     }
748     if (request != CEED_REQUEST_IMMEDIATE && request != CEED_REQUEST_ORDERED)
749         *request = NULL;
750     return 0;
751 }
752 
753 static int CeedOperatorGetQData_Magma(CeedOperator op, CeedVector *qdata) {
754     CeedOperator_Magma *impl = op->data;
755     int ierr;
756 
757     if (!impl->qdata) {
758         CeedInt Q;
759         ierr = CeedBasisGetNumQuadraturePoints(op->basis, &Q); CeedChk(ierr);
760         ierr = CeedVectorCreate(op->ceed,
761                                 op->Erestrict->nelem * Q
762                                 * op->qf->qdatasize / sizeof(CeedScalar),
763                                 &impl->qdata); CeedChk(ierr);
764     }
765     *qdata = impl->qdata;
766     return 0;
767 }
768 
769 static int CeedOperatorCreate_Magma(CeedOperator op) {
770     CeedOperator_Magma *impl;
771     int ierr;
772 
773     ierr = CeedCalloc(1, &impl); CeedChk(ierr);
774     op->data = impl;
775     op->Destroy  = CeedOperatorDestroy_Magma;
776     op->Apply    = CeedOperatorApply_Magma;
777     op->GetQData = CeedOperatorGetQData_Magma;
778     return 0;
779 }
780 
781 // *****************************************************************************
782 // * INIT
783 // *****************************************************************************
784 static int CeedInit_Magma(const char *resource, Ceed ceed) {
785     if (strcmp(resource, "/cpu/magma")
786         && strcmp(resource, "/gpu/magma"))
787         return CeedError(ceed, 1, "MAGMA backend cannot use resource: %s", resource);
788 
789     magma_init();
790     //magma_print_environment();
791 
792     ceed->VecCreate = CeedVectorCreate_Magma;
793     ceed->BasisCreateTensorH1 = CeedBasisCreateTensorH1_Magma;
794     ceed->ElemRestrictionCreate = CeedElemRestrictionCreate_Magma;
795     ceed->QFunctionCreate = CeedQFunctionCreate_Magma;
796     ceed->OperatorCreate = CeedOperatorCreate_Magma;
797     return 0;
798 }
799 
800 // *****************************************************************************
801 // * REGISTER
802 // *****************************************************************************
803 __attribute__((constructor))
804 static void Register(void) {
805     CeedRegister("/cpu/magma", CeedInit_Magma);
806     CeedRegister("/gpu/magma", CeedInit_Magma);
807 }
808