# Source code for nmdc_ms_metadata_gen.data_classes

from __future__ import annotations

from dataclasses import dataclass
from typing import ClassVar, Dict

from nmdc_ms_metadata_gen.schema_bridge import (
    get_curie_for_class,
    get_material_processing_class,
    get_typecode_for_curie,
    list_material_processing_types,
)


@dataclass
class GCMSMetabWorkflowMetadata:
    """
    Data class for holding GCMS metabolomic workflow metadata information.

    Every field after ``calibration_ids`` defaults to ``None``.

    Attributes
    ----------
    sample_id : str
        Identifier for the sample.
    nmdc_study : str
        Identifier for the NMDC study.
    processed_data_file : str
        Path or name of the processed data file.
    raw_data_file : str
        Path or name of the raw data file.
    mass_spec_configuration_id : str
        Identifier for the mass spectrometry configuration used.
    lc_config_id : str
        Identifier for the liquid chromatography configuration used.
    instrument_id : str
        Identifier for the instrument used for analysis.
    calibration_ids : list[str]
        Identifiers for the calibration information used.
    instrument_analysis_start_date : str, optional
        Start date of the instrument analysis.
    instrument_analysis_end_date : str, optional
        End date of the instrument analysis.
    processing_institution : str, optional
        Name of the processing institution. Must be a value from
        ProcessingInstitutionEnum.
        OPTIONAL IF processing_institution_generation AND
        processing_institution_workflow ARE PROVIDED
    processing_institution_generation : str, optional
        Name of the processing institution where the data was generated.
        Must be a value from ProcessingInstitutionEnum.
        OPTIONAL IF processing_institution IS PROVIDED
    processing_institution_workflow : str, optional
        Name of the processing institution where the workflow was executed.
        Must be a value from ProcessingInstitutionEnum.
        OPTIONAL IF processing_institution IS PROVIDED
    execution_resource : str, optional
        Name of the execution resource. Must be a value from
        ExecutionResourceEnum.
    raw_data_url : str, optional
        Complete URL for the raw data file. If provided, this takes
        precedence over constructing the URL from base_url + filename.
    manifest_id : str, optional
        Identifier for the manifest associated with this workflow metadata.
    instrument_instance_specifier : str, optional
        Specifier for the instrument instance used in the analysis.
    """

    # Required fields (no default).
    sample_id: str
    nmdc_study: str
    processed_data_file: str
    raw_data_file: str
    mass_spec_configuration_id: str
    lc_config_id: str
    instrument_id: str
    calibration_ids: list[str]

    # Optional fields; ``None`` means "not supplied".
    instrument_analysis_start_date: str | None = None
    instrument_analysis_end_date: str | None = None
    processing_institution: str | None = None
    processing_institution_generation: str | None = None
    processing_institution_workflow: str | None = None
    execution_resource: str | None = None
    raw_data_url: str | None = None
    manifest_id: str | None = None
    instrument_instance_specifier: str | None = None
@dataclass
class LCMSLipidWorkflowMetadata:
    """
    Data class for holding LC-MS lipidomics workflow metadata information.

    Also used for LC-MS Metabolomics workflows. Every field after
    ``instrument_id`` defaults to ``None``.

    Attributes
    ----------
    processed_data_dir : str
        Directory containing processed data files.
    raw_data_file : str
        Path or name of the raw data file.
    mass_spec_configuration_id : str
        Identifier for the mass spectrometry configuration used.
    lc_config_id : str
        Identifier for the liquid chromatography configuration used.
    instrument_id : str
        Identifier for the instrument used for analysis.
    processing_institution : str, optional
        Name of the processing institution. Must be a value from
        ProcessingInstitutionEnum.
        OPTIONAL IF processing_institution_generation AND
        processing_institution_workflow ARE PROVIDED
    processing_institution_generation : str, optional
        Name of the processing institution where the data was generated.
        Must be a value from ProcessingInstitutionEnum.
        OPTIONAL IF processing_institution IS PROVIDED
    processing_institution_workflow : str, optional
        Name of the processing institution where the workflow was executed.
        Must be a value from ProcessingInstitutionEnum.
        OPTIONAL IF processing_institution IS PROVIDED
    execution_resource : str, optional
        Name of the execution resource. Must be a value from
        ExecutionResourceEnum.
    instrument_analysis_start_date : str, optional
        Start date of the instrument analysis.
    instrument_analysis_end_date : str, optional
        End date of the instrument analysis.
    raw_data_url : str, optional
        Complete URL for the raw data file. If provided, this takes
        precedence over constructing the URL from base_url + filename.
    manifest_id : str, optional
        Identifier for the manifest associated with this workflow metadata.
    instrument_instance_specifier : str, optional
        Specifier for the instrument instance used in the analysis.
    """

    # Required fields (no default).
    processed_data_dir: str
    raw_data_file: str
    mass_spec_configuration_id: str
    lc_config_id: str
    instrument_id: str

    # Optional fields; ``None`` means "not supplied".
    processing_institution: str | None = None
    processing_institution_generation: str | None = None
    processing_institution_workflow: str | None = None
    execution_resource: str | None = None
    instrument_analysis_start_date: str | None = None
    instrument_analysis_end_date: str | None = None
    raw_data_url: str | None = None
    manifest_id: str | None = None
    instrument_instance_specifier: str | None = None
@dataclass
class NOMMetadata:
    """
    Data class for holding NOM workflow metadata information.

    Every field after ``manifest_id`` defaults to ``None``.

    Attributes
    ----------
    raw_data_file : str
        Path or name of the raw data file.
    processed_data_directory : str
        Directory containing processed data files.
    associated_studies : list
        List of associated study identifiers.
    sample_id : str
        Identifier for the sample.
    instrument_id : str
        Identifier for the instrument used for analysis.
    mass_spec_configuration_id : str
        Identifier for the mass spectrometry configuration used.
    lc_config_id : str
        Identifier for the liquid chromatography configuration used.
    manifest_id : str
        Identifier for the manifest associated with this workflow metadata.
    raw_data_url : str, optional
        URL of the raw data file, for reruns.
        OPTIONAL IF raw_data_file IS PROVIDED
    processing_institution : str, optional
        Name of the processing institution. Must be a value from
        ProcessingInstitutionEnum.
        OPTIONAL IF processing_institution_generation AND
        processing_institution_workflow ARE PROVIDED
    processing_institution_generation : str, optional
        Name of the processing institution where the data was generated.
        Must be a value from ProcessingInstitutionEnum.
        OPTIONAL IF processing_institution IS PROVIDED
    processing_institution_workflow : str, optional
        Name of the processing institution where the workflow was executed.
        Must be a value from ProcessingInstitutionEnum.
        OPTIONAL IF processing_institution IS PROVIDED
    execution_resource : str, optional
        Name of the execution resource. Must be a value from
        ExecutionResourceEnum.
    instrument_instance_specifier : str, optional
        Specifier for the instrument instance used in the analysis.
    reference_calibration_id : str, optional
        Identifier for the reference mass list used for calibration.
    srfa_calibration_id : str, optional
        Identifier for the SRFA standard raw data used for recalibration.
        Only used if the workflow was run with batch recalibration.
    """

    # Required fields (no default).
    raw_data_file: str
    processed_data_directory: str
    associated_studies: list
    sample_id: str
    instrument_id: str
    mass_spec_configuration_id: str
    lc_config_id: str
    manifest_id: str

    # Optional fields; ``None`` means "not supplied".
    raw_data_url: str | None = None
    processing_institution: str | None = None
    processing_institution_generation: str | None = None
    processing_institution_workflow: str | None = None
    execution_resource: str | None = None
    instrument_instance_specifier: str | None = None
    reference_calibration_id: str | None = None
    srfa_calibration_id: str | None = None


class ProcessGeneratorMap:
    """Thin shim around dynamic MaterialProcessing class lookups.

    Both methods delegate directly to the ``schema_bridge`` helpers imported
    at the top of this module; no state is kept on the class.
    """

    @staticmethod
    def get(process_type: str):
        """Return the runtime NMDC class for the given material processing name.

        Parameters
        ----------
        process_type : str
            Material processing class name, passed unchanged to
            ``get_material_processing_class``.

        Returns
        -------
        Whatever ``get_material_processing_class`` returns for
        ``process_type``.
        """
        return get_material_processing_class(process_type)

    @staticmethod
    def available_types():
        """Return all material processing class names known to the schema.

        Returns
        -------
        The result of ``list_material_processing_types()``, unmodified.
        """
        return list_material_processing_types()
class NmdcTypes:
    """
    Resolve CURIEs and ID typecodes on demand.

    All functionality is exposed through class methods, and results are
    memoised in class-level dictionaries shared by every caller. The class
    itself cannot be instantiated.
    """

    # identifier (class name or CURIE) -> resolved CURIE
    _curie_cache: ClassVar[dict[str, str]] = {}
    # identifier (class name or CURIE) -> preferred ID typecode
    _typecode_cache: ClassVar[dict[str, str]] = {}
    # Alternate spellings rewritten before the schema lookup.
    _ALIASES: ClassVar[dict[str, str]] = {"TimeStampValue": "TimestampValue"}

    def __init__(self):
        raise NotImplementedError(
            "NmdcTypes is a static class and cannot be instantiated."
        )

    @classmethod
    def get(cls, identifier: str) -> str:
        """
        Return the CURIE for ``identifier``, resolving it on first use.

        Parameters
        ----------
        identifier : str
            A class name, or a string that already contains ``":"`` (which
            is returned unchanged).

        Returns
        -------
        str
            The cached or newly resolved CURIE.
        """
        if identifier not in cls._curie_cache:
            cls._curie_cache[identifier] = cls._resolve_curie(identifier)
        return cls._curie_cache[identifier]

    @classmethod
    def typecode(cls, identifier: str) -> str:
        """
        Return the preferred ID typecode for ``identifier``.

        Parameters
        ----------
        identifier : str
            A class name or a CURIE. Strings containing ``":"`` are used as
            the CURIE directly; anything else is resolved through ``get``.

        Returns
        -------
        str
            The ``preferred_typecode`` of the value returned by
            ``get_typecode_for_curie``.

        Raises
        ------
        KeyError
            If ``get_typecode_for_curie`` returns a falsy value for the CURIE.
        """
        if identifier not in cls._typecode_cache:
            curie = identifier if ":" in identifier else cls.get(identifier)
            typecode = get_typecode_for_curie(curie)
            if not typecode:
                raise KeyError(
                    f"NMDC class '{identifier}' does not define an ID typecode"
                )
            # Cache the resolved string rather than the lookup object so the
            # cache contents match the declared ``dict[str, str]`` type.
            # NOTE(review): assumes the lookup result exposes
            # ``preferred_typecode`` -- confirm against schema_bridge.
            cls._typecode_cache[identifier] = typecode.preferred_typecode
        return cls._typecode_cache[identifier]

    @classmethod
    def _resolve_curie(cls, identifier: str) -> str:
        """
        Resolve ``identifier`` to a CURIE without consulting the cache.

        Strings containing ``":"`` are returned unchanged. Other names are
        first mapped through ``_ALIASES`` and then passed to
        ``get_curie_for_class``.
        """
        if ":" in identifier:
            return identifier
        canonical = cls._ALIASES.get(identifier, identifier)
        return get_curie_for_class(canonical)