xref: /honee/src/smartsim/smartsim.c (revision 797f7eedcc02b679789aff2c819b1ea5d422fc92)
// SPDX-FileCopyrightText: Copyright (c) 2017-2024, HONEE contributors.
// SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause
// Based on the instructions from https://www.craylabs.org/docs/sr_integration.html and the PHASTA implementation
47cd70835SJames Wright 
5149fb536SJames Wright #include <smartsim.h>
67cd70835SJames Wright 
7149fb536SJames Wright #include <navierstokes.h>
87cd70835SJames Wright 
9*797f7eedSJames Wright PetscErrorCode SmartSimDataDestroy(SmartSimData smartsim) {
107cd70835SJames Wright   PetscFunctionBeginUser;
11*797f7eedSJames Wright   if (!smartsim) PetscFunctionReturn(PETSC_SUCCESS);
12*797f7eedSJames Wright 
13*797f7eedSJames Wright   PetscCallSmartRedis(DeleteCClient(&smartsim->client));
14*797f7eedSJames Wright   PetscCall(PetscFree(smartsim));
157cd70835SJames Wright   PetscFunctionReturn(PETSC_SUCCESS);
167cd70835SJames Wright }
177cd70835SJames Wright 
18*797f7eedSJames Wright static PetscErrorCode SmartSimTrainingSetup(Honee honee) {
190c373b74SJames Wright   SmartSimData smartsim = honee->smartsim;
207cd70835SJames Wright   PetscMPIInt  rank;
217cd70835SJames Wright   PetscReal    checkrun[2] = {1};
227cd70835SJames Wright   size_t       dim_2[1]    = {2};
237cd70835SJames Wright 
247cd70835SJames Wright   PetscFunctionBeginUser;
250c373b74SJames Wright   PetscCallMPI(MPI_Comm_rank(honee->comm, &rank));
267cd70835SJames Wright 
277cd70835SJames Wright   if (rank % smartsim->collocated_database_num_ranks == 0) {
287cd70835SJames Wright     // -- Send array that communicates when ML is done training
29ea615d4cSJames Wright     PetscCall(PetscLogEventBegin(HONEE_SmartRedis_Meta, 0, 0, 0, 0));
3043e9749fSJames Wright     PetscCallSmartRedis(put_tensor(smartsim->client, "check-run", 9, checkrun, dim_2, 1, SRTensorTypeDouble, SRMemLayoutContiguous));
317cd70835SJames Wright     PetscCall(SmartRedisVerifyPutTensor(smartsim->client, "check-run", 9));
32ea615d4cSJames Wright     PetscCall(PetscLogEventEnd(HONEE_SmartRedis_Meta, 0, 0, 0, 0));
337cd70835SJames Wright   }
34aa0b7f76SJames Wright   PetscFunctionReturn(PETSC_SUCCESS);
357cd70835SJames Wright }
367cd70835SJames Wright 
370c373b74SJames Wright PetscErrorCode SmartSimSetup(Honee honee) {
387cd70835SJames Wright   PetscMPIInt rank;
397cd70835SJames Wright   PetscInt    num_orchestrator_nodes = 1;
407cd70835SJames Wright 
417cd70835SJames Wright   PetscFunctionBeginUser;
420c373b74SJames Wright   PetscCall(PetscNew(&honee->smartsim));
430c373b74SJames Wright   SmartSimData smartsim = honee->smartsim;
447cd70835SJames Wright 
457cd70835SJames Wright   smartsim->collocated_database_num_ranks = 1;
460c373b74SJames Wright   PetscOptionsBegin(honee->comm, NULL, "Options for SmartSim integration", NULL);
477cd70835SJames Wright   PetscCall(PetscOptionsInt("-smartsim_collocated_database_num_ranks", "Number of ranks per collocated database instance", NULL,
487cd70835SJames Wright                             smartsim->collocated_database_num_ranks, &smartsim->collocated_database_num_ranks, NULL));
497cd70835SJames Wright   PetscOptionsEnd();
507cd70835SJames Wright 
517cd70835SJames Wright   // Create prefix to be put on tensor names
520c373b74SJames Wright   PetscCallMPI(MPI_Comm_rank(honee->comm, &rank));
534fa1625aSJames Wright   PetscCall(PetscSNPrintf(smartsim->rank_id_name, sizeof(smartsim->rank_id_name), "y.%d", rank));
547cd70835SJames Wright 
55ea615d4cSJames Wright   PetscCall(PetscLogEventBegin(HONEE_SmartRedis_Init, 0, 0, 0, 0));
5643e9749fSJames Wright   PetscCallSmartRedis(SmartRedisCClient(num_orchestrator_nodes != 1, smartsim->rank_id_name, strlen(smartsim->rank_id_name), &smartsim->client));
57ea615d4cSJames Wright   PetscCall(PetscLogEventEnd(HONEE_SmartRedis_Init, 0, 0, 0, 0));
587cd70835SJames Wright 
590c373b74SJames Wright   PetscCall(SmartSimTrainingSetup(honee));
607cd70835SJames Wright   PetscFunctionReturn(PETSC_SUCCESS);
617cd70835SJames Wright }
62ec6e4151SJames Wright 
63*797f7eedSJames Wright PetscErrorCode SmartRedisVerifyPutTensor(void *c_client, const char *name, const size_t name_length) {
64*797f7eedSJames Wright   bool does_exist = true;
65ec6e4151SJames Wright 
66*797f7eedSJames Wright   PetscFunctionBeginUser;
67*797f7eedSJames Wright   PetscCall(PetscLogEventBegin(HONEE_SmartRedis_Meta, 0, 0, 0, 0));
68*797f7eedSJames Wright   PetscCallSmartRedis(tensor_exists(c_client, name, name_length, &does_exist));
69*797f7eedSJames Wright   PetscCheck(does_exist, PETSC_COMM_SELF, -1, "Tensor of name '%s' was not written to the database successfully", name);
70*797f7eedSJames Wright   PetscCall(PetscLogEventEnd(HONEE_SmartRedis_Meta, 0, 0, 0, 0));
71ec6e4151SJames Wright   PetscFunctionReturn(PETSC_SUCCESS);
72ec6e4151SJames Wright }
73