1ae2b091fSJames Wright // SPDX-FileCopyrightText: Copyright (c) 2017-2024, HONEE contributors. 2ae2b091fSJames Wright // SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause 37cd70835SJames Wright // Based on the instructions from https://www.craylabs.org/docs/sr_integration.html and PHASTA implementation 47cd70835SJames Wright 59ae013d6SJames Wright #include <smartsim-impl.h> 67cd70835SJames Wright 7149fb536SJames Wright #include <navierstokes.h> 87cd70835SJames Wright 97ebeccb9SJames Wright #define SMARTSIM_KEY "SmartSimData" 10797f7eedSJames Wright 117ebeccb9SJames Wright static PetscErrorCode SmartSimDataDestroy(SmartSimData *smartsim) { 127ebeccb9SJames Wright SmartSimData smartsim_ = *smartsim; 13*14bd2a07SJames Wright 147ebeccb9SJames Wright PetscFunctionBeginUser; 157ebeccb9SJames Wright if (!smartsim_) PetscFunctionReturn(PETSC_SUCCESS); 167ebeccb9SJames Wright 177ebeccb9SJames Wright PetscCallSmartRedis(DeleteCClient(&smartsim_->client)); 187ebeccb9SJames Wright PetscCall(PetscFree(smartsim_)); 197ebeccb9SJames Wright *smartsim = NULL; 207cd70835SJames Wright PetscFunctionReturn(PETSC_SUCCESS); 217cd70835SJames Wright } 227cd70835SJames Wright 23797f7eedSJames Wright static PetscErrorCode SmartSimTrainingSetup(Honee honee) { 247ebeccb9SJames Wright SmartSimData smartsim; 257cd70835SJames Wright PetscMPIInt rank; 267cd70835SJames Wright PetscReal checkrun[2] = {1}; 277cd70835SJames Wright size_t dim_2[1] = {2}; 287cd70835SJames Wright 297cd70835SJames Wright PetscFunctionBeginUser; 307ebeccb9SJames Wright PetscCall(HoneeGetSmartSimData(honee, &smartsim)); 310c373b74SJames Wright PetscCallMPI(MPI_Comm_rank(honee->comm, &rank)); 327cd70835SJames Wright 337cd70835SJames Wright if (rank % smartsim->collocated_database_num_ranks == 0) { 347cd70835SJames Wright // -- Send array that communicates when ML is done training 35ea615d4cSJames Wright PetscCall(PetscLogEventBegin(HONEE_SmartRedis_Meta, 0, 0, 0, 0)); 3643e9749fSJames Wright PetscCallSmartRedis(put_tensor(smartsim->client, "check-run", 9, checkrun, dim_2, 1, SRTensorTypeDouble, SRMemLayoutContiguous)); 377cd70835SJames Wright PetscCall(SmartRedisVerifyPutTensor(smartsim->client, "check-run", 9)); 38ea615d4cSJames Wright PetscCall(PetscLogEventEnd(HONEE_SmartRedis_Meta, 0, 0, 0, 0)); 397cd70835SJames Wright } 40aa0b7f76SJames Wright PetscFunctionReturn(PETSC_SUCCESS); 417cd70835SJames Wright } 427cd70835SJames Wright 437ebeccb9SJames Wright static PetscErrorCode SmartSimSetup(Honee honee) { 447cd70835SJames Wright PetscMPIInt rank; 457cd70835SJames Wright PetscInt num_orchestrator_nodes = 1; 467ebeccb9SJames Wright SmartSimData smartsim; 477cd70835SJames Wright 487cd70835SJames Wright PetscFunctionBeginUser; 497ebeccb9SJames Wright PetscCall(PetscNew(&smartsim)); 507cd70835SJames Wright 517cd70835SJames Wright smartsim->collocated_database_num_ranks = 1; 520c373b74SJames Wright PetscOptionsBegin(honee->comm, NULL, "Options for SmartSim integration", NULL); 537cd70835SJames Wright PetscCall(PetscOptionsInt("-smartsim_collocated_database_num_ranks", "Number of ranks per collocated database instance", NULL, 547cd70835SJames Wright smartsim->collocated_database_num_ranks, &smartsim->collocated_database_num_ranks, NULL)); 557cd70835SJames Wright PetscOptionsEnd(); 567cd70835SJames Wright 577cd70835SJames Wright // Create prefix to be put on tensor names 580c373b74SJames Wright PetscCallMPI(MPI_Comm_rank(honee->comm, &rank)); 594fa1625aSJames Wright PetscCall(PetscSNPrintf(smartsim->rank_id_name, sizeof(smartsim->rank_id_name), "y.%d", rank)); 607cd70835SJames Wright 61ea615d4cSJames Wright PetscCall(PetscLogEventBegin(HONEE_SmartRedis_Init, 0, 0, 0, 0)); 6243e9749fSJames Wright PetscCallSmartRedis(SmartRedisCClient(num_orchestrator_nodes != 1, smartsim->rank_id_name, strlen(smartsim->rank_id_name), &smartsim->client)); 63ea615d4cSJames Wright PetscCall(PetscLogEventEnd(HONEE_SmartRedis_Init, 0, 0, 0, 0)); 647cd70835SJames Wright 650c70a8bcSJames Wright PetscCall(HoneeSetContainer(honee, SMARTSIM_KEY, smartsim, (PetscCtxDestroyFn *)SmartSimDataDestroy)); 667ebeccb9SJames Wright 670c373b74SJames Wright PetscCall(SmartSimTrainingSetup(honee)); 687cd70835SJames Wright PetscFunctionReturn(PETSC_SUCCESS); 697cd70835SJames Wright } 70ec6e4151SJames Wright 718fc6ab98SJames Wright /** 728fc6ab98SJames Wright @brief Obtains the `SmartSimData` from the `Honee` object 738fc6ab98SJames Wright 748fc6ab98SJames Wright If `SmartSimData` has not already been initialized, this will initialize and create the struct. 758fc6ab98SJames Wright 768fc6ab98SJames Wright @param[in] honee `Honee` object containing the SmartSim data 778fc6ab98SJames Wright @param[out] smartsim `SmartSimData` containing the data 788fc6ab98SJames Wright **/ 797ebeccb9SJames Wright PetscErrorCode HoneeGetSmartSimData(Honee honee, SmartSimData *smartsim) { 800c70a8bcSJames Wright PetscBool has_smartsim; 810c70a8bcSJames Wright 827ebeccb9SJames Wright PetscFunctionBeginUser; 830c70a8bcSJames Wright PetscCall(HoneeHasContainer(honee, SMARTSIM_KEY, &has_smartsim)); 840c70a8bcSJames Wright if (!has_smartsim) PetscCall(SmartSimSetup(honee)); 850c70a8bcSJames Wright PetscCall(HoneeGetContainer(honee, SMARTSIM_KEY, smartsim)); 867ebeccb9SJames Wright PetscFunctionReturn(PETSC_SUCCESS); 877ebeccb9SJames Wright } 887ebeccb9SJames Wright 898fc6ab98SJames Wright /** 908fc6ab98SJames Wright @brief Checks if a tensor with `name` is in the SmartRedis database 918fc6ab98SJames Wright 928fc6ab98SJames Wright Function will error out if tensor does not exist. 938fc6ab98SJames Wright 948fc6ab98SJames Wright @param[in] c_client SmartRedis client object 958fc6ab98SJames Wright @param[in] name Name of the tensor 968fc6ab98SJames Wright @param[in] name_length Length of the tensor name 978fc6ab98SJames Wright @return An error code: 0 - success, otherwise - failure 988fc6ab98SJames Wright **/ 99797f7eedSJames Wright PetscErrorCode SmartRedisVerifyPutTensor(void *c_client, const char *name, const size_t name_length) { 100797f7eedSJames Wright bool does_exist = true; 101ec6e4151SJames Wright 102797f7eedSJames Wright PetscFunctionBeginUser; 103797f7eedSJames Wright PetscCall(PetscLogEventBegin(HONEE_SmartRedis_Meta, 0, 0, 0, 0)); 104797f7eedSJames Wright PetscCallSmartRedis(tensor_exists(c_client, name, name_length, &does_exist)); 105797f7eedSJames Wright PetscCheck(does_exist, PETSC_COMM_SELF, -1, "Tensor of name '%s' was not written to the database successfully", name); 106797f7eedSJames Wright PetscCall(PetscLogEventEnd(HONEE_SmartRedis_Meta, 0, 0, 0, 0)); 107ec6e4151SJames Wright PetscFunctionReturn(PETSC_SUCCESS); 108ec6e4151SJames Wright } 109