Source code for nmdc_ms_metadata_gen.biosample_generator

import os
from pathlib import Path

import nmdc_schema.nmdc as nmdc
import pandas as pd
import toml

import nmdc_ms_metadata_gen
from nmdc_ms_metadata_gen.data_classes import NmdcTypes
from nmdc_ms_metadata_gen.id_pool import IDPool
from nmdc_ms_metadata_gen.metadata_generator import NMDCMetadataGenerator
from nmdc_ms_metadata_gen.metadata_parser import BiosampleMetadataParser



[docs]
class BiosampleGenerator(NMDCMetadataGenerator):
    """
    Generator of biosample records for the NMDC database.

    Parameters
    ----------
    metadata_file : str
        Location of the metadata CSV file that holds the biosample information.
    database_dump_json_path : str
        Location where the generated NMDC database JSON file is written.
    minting_config_creds : str, optional
        Location of the configuration file holding the credentials used to mint biosample IDs.
    id_pool_size : int, optional
        Number of IDs kept in the pool used for minting biosample IDs. Default is 50.
    id_refill_threshold : int, optional
        Pool level at which the ID pool is refilled. Default is 10.
    test : bool, optional
        Whether to run in test mode. When True, local IDs are used (API minting is skipped). Default is False.
    """

    def __init__(
        self,
        metadata_file: str,
        database_dump_json_path: str,
        minting_config_creds: str = None,
        id_pool_size: int = 50,
        id_refill_threshold: int = 10,
        test: bool = False,
    ):
        """Set up the parent generator, then record the biosample-specific paths."""
        # ID pool sizing and the test flag are forwarded to the parent class
        parent_options = {
            "id_pool_size": id_pool_size,
            "id_refill_threshold": id_refill_threshold,
            "test": test,
        }
        super().__init__(**parent_options)

        # Inputs and outputs that only this generator needs
        self.metadata_file = metadata_file
        self.database_dump_json_path = database_dump_json_path
        self.minting_config_creds = minting_config_creds


[docs]
    def run(self) -> dict:
        """
        Main method to run the biosample generation process.

        Reads the metadata CSV, starts an NMDC database, loads the minting and
        bio ontology credentials, generates biosamples for the rows that need
        one, dumps the database using ``self.database_dump_json_path`` and
        returns the database as a dictionary.

        Returns
        -------
        dict
            The generated NMDC database instance containing all generated biosample records as a dictionary.

        Raises
        ------
        FileNotFoundError
            If ``self.metadata_file`` cannot be found.
        """
        # Load the metadata file; keep the underlying error attached as the cause
        try:
            metadata_df = pd.read_csv(self.metadata_file)
        except FileNotFoundError as err:
            raise FileNotFoundError(
                f"Metadata file not found: {self.metadata_file}"
            ) from err

        # Start the NMDC database that will collect the biosample records
        nmdc_database_inst = self.start_nmdc_database()

        # Load the minting credentials and the bio ontology API key
        client_id, client_secret = self.load_credentials(
            config_file=self.minting_config_creds
        )
        bio_api_key = self.load_bio_credentials(config_file=self.minting_config_creds)

        # Generate a biosample for every sample that does not have an id yet
        self.check_biosample_rows(
            metadata_df=metadata_df,
            nmdc_database_inst=nmdc_database_inst,
            BIO_API_KEY=bio_api_key,
            CLIENT_ID=client_id,
            CLIENT_SECRET=client_secret,
        )
        self.dump_nmdc_database(
            nmdc_database=nmdc_database_inst, json_path=self.database_dump_json_path
        )

        # Hand the database back to the caller as a plain dict
        return self.nmdc_db_to_dict(nmdc_database_inst)



[docs]
    def check_biosample_rows(
        self,
        metadata_df: pd.DataFrame,
        nmdc_database_inst: nmdc.Database,
        CLIENT_ID: str,
        CLIENT_SECRET: str,
        BIO_API_KEY: str,
    ) -> None:
        """
        Generate biosamples for every sample in the metadata that lacks a 'biosample_id'.

        Rows are grouped by 'biosample.name' and the first row of each group is inspected, so some
        samples may already have an id while others still need one. For a group without a
        'biosample_id', the required columns are checked, the row is parsed with the
        dynam_parse_biosample_metadata method of BiosampleMetadataParser, and a biosample is created
        with generate_biosample. The new id is written to the 'biosample_id' column of every row
        sharing that 'biosample.name', and the biosample is appended to the NMDC database instance.

        Parameters
        ----------
        metadata_df : pd.DataFrame
            The dataframe containing the metadata information. It is updated in place.
        nmdc_database_inst : nmdc.Database
            The NMDC Database instance to add the biosample to.
        CLIENT_ID : str
            The client ID for the NMDC API. Used to mint a biosample id.
        CLIENT_SECRET : str
            The client secret for the NMDC API. Used to mint a biosample id.
        BIO_API_KEY : str
            The bio ontology API key passed to the metadata parser.

        Returns
        -------
        None

        Raises
        ------
        ValueError
            If the 'biosample.name' column is missing.
            If a biosample has to be generated and any required column is missing.

        """
        if "biosample.name" not in metadata_df.columns:
            raise ValueError(
                "The 'biosample.name' column is required to create biosamples."
            )

        # The column check does not depend on the row, so compute it once.
        # It only becomes an error when a biosample actually has to be generated.
        required_columns = (
            "biosample.name",
            "biosample.associated_studies",
            "biosample.env_broad_scale",
            "biosample.env_local_scale",
            "biosample.env_medium",
        )
        missing_columns = [
            col for col in required_columns if col not in metadata_df.columns
        ]

        parser = BiosampleMetadataParser()

        for _, group in metadata_df.groupby("biosample.name"):
            # The first row of the group stands in for the whole sample
            row = group.iloc[0]
            if not pd.isnull(row.get("biosample_id")):
                # This sample already has an id; nothing to generate
                continue

            if missing_columns:
                raise ValueError(
                    f"The following required columns are missing from the DataFrame: {', '.join(missing_columns)}"
                )

            # Generate the biosample
            biosample_metadata = parser.dynam_parse_biosample_metadata(
                row=row, bio_api_key=BIO_API_KEY
            )
            biosample = self.generate_biosample(
                biosamp_metadata=biosample_metadata,
                CLIENT_ID=CLIENT_ID,
                CLIENT_SECRET=CLIENT_SECRET,
            )

            # An empty 'biosample_id' column is read as a numeric (NaN) column.
            # Cast it to object first so writing the id does not rely on pandas
            # silently upcasting the column, which newer pandas versions deprecate.
            if "biosample_id" in metadata_df.columns and pd.api.types.is_numeric_dtype(
                metadata_df["biosample_id"]
            ):
                metadata_df["biosample_id"] = metadata_df["biosample_id"].astype(
                    object
                )

            # Record the new id on every row that belongs to this sample
            metadata_df.loc[
                metadata_df["biosample.name"] == row["biosample.name"],
                "biosample_id",
            ] = biosample.id
            nmdc_database_inst.biosample_set.append(biosample)



[docs]
    def generate_biosample(
        self, biosamp_metadata: dict, CLIENT_ID: str, CLIENT_SECRET: str
    ) -> nmdc.Biosample:
        """
        Mint a biosample id from the given metadata and create a biosample record.

        An id is only minted when the metadata has no id (the "id" key is absent or None).
        In that case the minted id is also stored in ``biosamp_metadata["id"]``, so the
        dictionary passed in is modified.

        Parameters
        ----------
        biosamp_metadata : dict
            The metadata object containing biosample information.
        CLIENT_ID : str
            The client ID for the NMDC API.
        CLIENT_SECRET : str
            The client secret for the NMDC API.

        Returns
        -------
        nmdc.Biosample
            The generated biosample instance.

        """

        # If no biosample id in spreadsheet, mint one. A missing "id" key is
        # treated like an explicit None instead of raising a KeyError.
        if biosamp_metadata.get("id") is None:
            biosamp_metadata["id"] = self.id_pool.get_id(
                nmdc_type=NmdcTypes.get("Biosample"),
                client_id=CLIENT_ID,
                client_secret=CLIENT_SECRET,
            )

        # Filter dictionary to remove any key/value pairs with None as the value
        biosamp_dict = self.clean_dict(biosamp_metadata)

        # Add provenance metadata
        biosamp_dict["provenance_metadata"] = self.provenance_metadata

        return nmdc.Biosample(**biosamp_dict)



[docs]
    def load_bio_credentials(self, config_file: str = None) -> str:
        """
        Load bio ontology API key from the environment or a configuration file.

        The ``BIO_API_KEY`` environment variable takes precedence. The configuration
        file is only read when that variable is unset or empty.

        Parameters
        ----------
        config_file: str
            The path to the TOML configuration file.

        Returns
        -------
        str
            The bio ontology API key.

        Raises
        ------
        FileNotFoundError
            If the configuration file is not found, and the API key is not set in the environment.
        ValueError
            If the configuration file is not valid TOML, or no API key is found in
            either the environment or the configuration file.

        """
        api_key = os.getenv("BIO_API_KEY")

        if not api_key and config_file:
            config_path = Path(config_file)
            # Only the file load can fail here; keep the try body minimal
            try:
                config = toml.load(config_path)
            except FileNotFoundError as err:
                raise FileNotFoundError(
                    f"Config file {config_path} not found."
                ) from err
            except toml.TomlDecodeError as err:
                raise ValueError("Error decoding TOML from the config file.") from err
            # .get never raises; a missing key is reported by the check below
            api_key = config.get("BIO_API_KEY")

        if not api_key:
            raise ValueError(
                "BIO_API_KEY must be set either in environment variable or passed in the config file. It must be named BIO_API_KEY."
            )

        return api_key