import requests
from io import StringIO
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import sys
from matplotlib.patches import Patch
import nmdc_api_utilities

# Define a function to split ids into chunks
def split_list(input_list, chunk_size=100):
    result = []
    
    for i in range(0, len(input_list), chunk_size):
        result.append(input_list[i:i + chunk_size])
        
    return result

def get_id_list(result_list: list, id_name: str):
    id_list = []
    for item in result_list:
        if type(item[id_name]) == str:
            id_list.append(item[id_name])
        elif type(item[id_name]) == list:
            for another_item in item[id_name]:
                id_list.append(another_item)

    return id_list

from nmdc_api_utilities.data_object_search import DataObjectSearch
dos_client = DataObjectSearch(env=ENV)
from nmdc_api_utilities.data_processing import DataProcessing
dp_client = DataProcessing()
processed_nom = dos_client.get_record_by_attribute(attribute_name='data_object_type', attribute_value='Direct Infusion FT-ICR MS Analysis Results', max_page_size=100, fields='id,md5_checksum,url', all_pages=True)
# clarify names
for dataobject in processed_nom:
    dataobject["processed_nom_id"] = dataobject.pop("id")
    dataobject["processed_nom_md5_checksum"] = dataobject.pop("md5_checksum")
    dataobject["processed_nom_url"] = dataobject.pop("url")

# convert to df
processed_nom_df = dp_client.convert_to_df(processed_nom)
processed_nom_df

from nmdc_api_utilities.workflow_execution_search import WorkflowExecutionSearch
# since we are querying the WorkflowExecution collection, we need to create an instance of it
we_client = WorkflowExecutionSearch(env=ENV)
# use utility function to get a list of the ids from processed_nom
result_ids = get_id_list(processed_nom, "processed_nom_id")
# create a list of lists of ids to query the WorkflowExecutionSearch object
chunked_list = split_list(result_ids)
analysis_dataobj = []
# loop through the chunked list of ids
for chunk in chunked_list:
    # create the filter - query the WorkflowExecutionSearch object looking for data objects where the has_output field is in the chunk of ids from processed_nom
    filter_list = dp_client._string_mongo_list(chunk)
    filter = f'{{"has_output": {{"$in": {filter_list}}}}}'
    # get the results
    analysis_dataobj += we_client.get_record_by_filter(filter=filter, fields="id,has_input,has_output", max_page_size=100, all_pages=True)

# clarify names
for dataobject in analysis_dataobj:
    dataobject["analysis_id"] = dataobject.pop("id")
    dataobject["analysis_has_input"] = dataobject.pop("has_input")
    dataobject["analysis_has_output"] = dataobject.pop("has_output")

# convert to data frame
analysis_dataobj_df = dp_client.convert_to_df(analysis_dataobj)
analysis_dataobj_df

# use utility function to get a list of the ids from raw_dataobj
result_ids = get_id_list(analysis_dataobj, "analysis_has_input")
# create a list of lists of ids to query the DataObjectSearch object
chunked_list = split_list(result_ids)
raw_dataobj = []
# loop through the chunked list of ids
for chunk in chunked_list:
    # create the filter - query the DataObjectSearch object looking for data objects where the id field is in the chunk of ids from analysis_dataobj
    filter_list = dp_client._string_mongo_list(chunk)
    filter = f'{{"id": {{"$in": {filter_list}}}}}'
    # get the results
    raw_dataobj += dos_client.get_record_by_filter(filter=filter, fields="id,name,data_object_type", max_page_size=100, all_pages=True)

# clarify names
for dataobject in raw_dataobj:
    dataobject["raw_id"] = dataobject.pop("id")
    dataobject["raw_name"] = dataobject.pop("name")

# Filter out parameter files (leave NAs in case raw data files are missing a label)
param_dataobj = [file for file in raw_dataobj if 'data_object_type' in file and file['data_object_type'] == 'Analysis Tool Parameter File']
raw_dataobj = [file for file in raw_dataobj if file not in param_dataobj]

raw_df = dp_client.convert_to_df(raw_dataobj)

raw_df

from nmdc_api_utilities.data_generation_search import DataGenerationSearch
dgs_client = DataGenerationSearch(env=ENV)
# use utility function to get a list of the ids from raw_dataobj
result_ids = get_id_list(raw_dataobj, "raw_id")
# create a list of lists of ids to query the DataGenerationSearch object
chunked_list = split_list(result_ids)
datageneration_dataobj = []
# loop through the chunked list of ids
for chunk in chunked_list:
    # create the filter - query the DataGenerationSearch object looking for data objects that have the raw_id in the has_output field
    filter_list = dp_client._string_mongo_list(chunk)
    filter = f'{{"has_output": {{"$in": {filter_list}}}}}'
    # get the results
    datageneration_dataobj += dgs_client.get_record_by_filter(filter=filter, fields="id,has_input,has_output", max_page_size=100, all_pages=True)

# clarify names
for dataobject in datageneration_dataobj:
    dataobject["datageneration_id"] = dataobject.pop("id")
    dataobject["datageneration_has_output"] = dataobject.pop("has_output")
    dataobject["datageneration_has_input"] = dataobject.pop("has_input")
# convert to data frame
datageneration_dataobj_df = dp_client.convert_to_df(datageneration_dataobj)
datageneration_dataobj_df

datageneration_dataobj_df['datageneration_has_input'].explode().str.extract('nmdc:(procsm|bsm)')[0].value_counts(dropna=False)

0
procsm    2583
Name: count, dtype: int64

from nmdc_api_utilities.material_processing_search import MaterialProcessingSearch
mp_client = MaterialProcessingSearch()

# use utility function to get a unique list of the ids from datageneration_dataobj
result_ids = list(set(get_id_list(datageneration_dataobj, "datageneration_has_input")))

# separate ids into processedsample_ids and biosample_ids
processedsamples = [item for item in result_ids if 'nmdc:procsm' in item]
biosamples = [item for item in result_ids if 'nmdc:bsm' in item]

# dataframe of intermediate processed samples to track
tracking_ps = pd.DataFrame({'intermediate_ps':processedsamples,'dg_input':processedsamples})

# dataframe of tracked biosample inputs mapped to the input given to data generation
tracked = [pd.DataFrame({'biosample_id':biosamples,'dg_input':biosamples})]


while tracking_ps.shape[0]>0:

    # get api calls for these processed samples
    chunked_list = split_list(tracking_ps['intermediate_ps'].tolist())
    mp_dataobj = []
    for chunk in chunked_list:
        filter_list = dp_client._string_mongo_list(chunk)
        filter = f'{{"has_output": {{"$in": {filter_list}}}}}'
        mp_dataobj += mp_client.get_record_by_filter(filter=filter, fields='has_input,has_output', max_page_size=100, all_pages=True)

    mp_dataobj = dp_client.convert_to_df(mp_dataobj).explode('has_input').explode('has_output').merge(tracking_ps,right_on='intermediate_ps',left_on='has_output').drop(['has_output','id'],axis=1)

    # separate steps with biosample vs processed sample inputs
    biosample_input = mp_dataobj[mp_dataobj['has_input'].str.contains('nmdc:bsm',na=False)].drop('intermediate_ps',axis=1).rename(columns={'has_input':'biosample_id'})
    processedsample_input = mp_dataobj[mp_dataobj['has_input'].str.contains('nmdc:procsm',na=False)].drop('intermediate_ps',axis=1).rename(columns={'has_input':'intermediate_ps'}) # new ps input is now intermediate_ps

    # update tracking_ps
    tracking_ps = processedsample_input

    # update tracked to those steps with a biosample input
    if not biosample_input.empty:
        tracked.append(biosample_input)

datageneration_inputupdate_df = pd.concat(tracked, ignore_index=True).merge(datageneration_dataobj_df.explode('datageneration_has_input').explode('datageneration_has_output'),left_on='dg_input',right_on='datageneration_has_input').\
    drop(['dg_input','datageneration_has_input'],axis=1).drop_duplicates()
datageneration_inputupdate_df

from nmdc_api_utilities.biosample_search import BiosampleSearch
bs_client = BiosampleSearch(env=ENV)

# get a list of the biosample ids from datageneration_inputupdate_df
result_ids = datageneration_inputupdate_df["biosample_id"].unique().tolist()
# create a list of lists of ids to query the BiosampleSearch object
chunked_list = split_list(result_ids)
biosample_dataobj = []
# loop through the chunked list of ids
for chunk in chunked_list:
    # create the filter - query the BiosampleSearch object looking for data objects where the id is in the chunk of ids from datageneration_dataobj
    filter_list = dp_client._string_mongo_list(chunk)
    filter = f'{{"id": {{"$in": {filter_list}}}}}'
    # get the results
    biosample_dataobj += bs_client.get_record_by_filter(filter=filter, max_page_size=100, all_pages=True)

# clarify names
for dataobject in biosample_dataobj:
    dataobject["biosample_id"] = dataobject.pop("id")

# convert to data frame
biosample_dataobj_df = dp_client.convert_to_df(biosample_dataobj)

biosample_dataobj_df

biosample_dataobj_flat=pd.json_normalize(biosample_dataobj)
biosample_dataobj_flat_df=dp_client.convert_to_df(biosample_dataobj_flat)

biosample_dataobj_flat_df['sample_type']=""

biosample_dataobj_flat_df['env_medium.term.id'].drop_duplicates()

biosample_dataobj_flat_df.loc[biosample_dataobj_flat_df['env_medium.term.name'].str.contains('soil', na = False),'sample_type'] = 'soil'

biosample_dataobj_flat_df.loc[biosample_dataobj_flat_df['env_medium.term.id'].isin(["ENVO:00002261","ENVO:00001998","ENVO:00002259",
                                                                                    "ENVO:01001616","ENVO:00005750","ENVO:00005761",
                                                                                    "ENVO:00005760","ENVO:00005773","ENVO:00005802",
                                                                                    "ENVO:00005774"]),'sample_type'] = 'soil'
biosample_dataobj_flat_df.loc[biosample_dataobj_flat_df['env_medium.term.id'].isin(["ENVO:00002042"]),'sample_type'] = 'water'
biosample_dataobj_flat_df.loc[biosample_dataobj_flat_df['env_medium.term.id'].isin(["ENVO:00002007"]),'sample_type'] = 'sediment'
biosample_dataobj_flat_df.loc[biosample_dataobj_flat_df['env_medium.term.id'].isin(["ENVO:01000017"]),'sample_type'] = 'sand'

#filter to desired metadata columns
biosample_dataobj_flat_df=biosample_dataobj_flat_df[['biosample_id','geo_loc_name.has_raw_value','env_medium.term.name','env_medium.term.id','sample_type']]

biosample_dataobj_flat_df

#match all processed nom objects (via processed_nom_id) to analysis objects (via analysis_has_output) and expand lists has_input and has_output
processed_obj_to_analysis_df=dp_client.merge_df(processed_nom_df,analysis_dataobj_df,"processed_nom_id","analysis_has_output")

#match raw data objects (via raw_id) to all_analysis_df (via analysis_has_input)
processed_obj_to_raw_df=dp_client.merge_df(raw_df,processed_obj_to_analysis_df,"raw_id","analysis_has_input")

#match processed_obj_to_raw_df (via raw_id) to data generation objects (via datageneration_has_output) and expand lists has_input and has_output
processed_obj_to_datagen_df=dp_client.merge_df(processed_obj_to_raw_df,datageneration_inputupdate_df,"raw_id","datageneration_has_output")

#match biosample objects (via biosample_id) to processed_obj_to_datagen_df (via datageneration_has_input)
merged_df=dp_client.merge_df(biosample_dataobj_flat_df,processed_obj_to_datagen_df,"biosample_id","biosample_id")

merged_df

#are there any md5_checksum values that occur more than once (i.e. are associated with more than one processed nom id)
len(merged_df[merged_df.duplicated('processed_nom_md5_checksum')]['processed_nom_md5_checksum'].unique())==0

True

column_list = merged_df.columns.tolist()
columns_to_keep = ["biosample_id","processed_nom_id","sample_type","processed_nom_url"]
columns_to_remove = list(set(column_list).difference(columns_to_keep))

# Drop unnecessary columns
merged_df_cleaned = merged_df.drop(columns=columns_to_remove)

# remove duplicate rows when keeping only necessary columns (should remove none)
final_df=merged_df_cleaned.drop_duplicates(keep="first").sort_values(by='processed_nom_id').reset_index(drop=True)

final_df

dataset_counts = final_df[['processed_nom_id','sample_type']].drop_duplicates().value_counts('sample_type')
min_n = min(pd.DataFrame(dataset_counts)['count'])

#set sampling size to 50 or the smallest sample count, whichever value is smaller
if 50 < min_n:
    n = 50
else:
    n = min_n

#list the different types
list_type=dataset_counts.index.tolist()

#for each type, randomly sample n data sets and save them into list
random_subset=[]
for type in list_type:
    #each processed ID and sample type
    sample_type=final_df[['processed_nom_id','sample_type']].drop_duplicates()
    #filter to current sample type
    sample_type=sample_type[sample_type['sample_type']==type]
    #randomly sample n processed IDs in current sample type
    sample_type=sample_type.sample(n=n, random_state=2)
    #save
    random_subset.append(sample_type)

#resave list as dataframe
random_subset=pd.concat(random_subset).reset_index(drop=True)

#remerge rest of the data for the sampled data sets
final_df=random_subset.merge(final_df,on=['processed_nom_id','sample_type'],how="left")

final_df

#example file
url=final_df.iloc[0]["processed_nom_url"]

#pull data as csv using url
response = requests.get(url)
csv_data = StringIO(response.text)
csv_df = pd.read_csv(csv_data)
csv_data.close()
csv_df

errors = {}
iteration_counter = 0
mol_dict=[]
multi_peakmatch_file_counter=0
multi_peakmatch_peak_counter=0

for index, row in final_df.iterrows():

    iteration_counter += 1

    # print an update for every 50 iterations
    if iteration_counter % 50 == 0:
        print(f"Processed {iteration_counter} rows")
    
    #save data set level information
    url = row["processed_nom_url"]
    processed=row['processed_nom_id']
    sample_type=row['sample_type']

    try:
        
        # get CSV data using URL
        response = requests.get(url)
        csv_data = StringIO(response.text)
        csv_df = pd.read_csv(csv_data)
        csv_data.close()

        #calculate assigned peak percent
        unassigned=csv_df['Molecular Formula'].isnull().sum()
        assigned=csv_df['Molecular Formula'].count()
        assigned_perc=assigned/(unassigned+assigned)

        # If there are no rows, don't include this csv, skip to the next one
        if assigned + unassigned == 0:
            print(f"No peaks found in {processed}, skipping file")
            continue
        
        # If there are some rows, and no assigned peaks, the data might fail the shape assertions and break the notebook
        # which is not informative since it would get quality filtered out anyway. Skip to the next one
        elif unassigned > 0 and assigned == 0:
            print(f"No assigned peaks found in {processed}, skipping file")
            continue

        # If there are some rows, and at least one assigned peak, assert that the important columns are correct
        # This we DO want to break if it's wrong
        elif assigned > 0:

            # Workflow Output Tests
            # assert that all four columns are found in the file
            assert set(['Molecular Formula','H/C','O/C','Confidence Score']).issubset(csv_df.columns), "Workflow output does not contain the required columns. Check for workflow changes"

            # assert that H/C and O/C and Confidence Score are numeric
            assert pd.api.types.is_numeric_dtype(csv_df['H/C']), "Column 'H/C' is not numeric. Check for workflow changes"
            assert pd.api.types.is_numeric_dtype(csv_df['O/C']), "Column 'O/C' is not numeric. Check for workflow changes"
            assert pd.api.types.is_numeric_dtype(csv_df['Confidence Score']), "Column 'Confidence Score' is not numeric. Check for workflow changes"
            assert pd.api.types.is_object_dtype(csv_df['Molecular Formula']), "Column 'Molecular Formula' is not a string. Check for workflow changes"
        


        #check if any peaks (m/z) have multiple matches. if so take highest scoring match

        #list of peaks
        peak_matches=csv_df["m/z"]
        #list of duplicated peaks in csv_df
        dup_peaks=peak_matches[peak_matches.duplicated()]
        multi_peakmatch_peak_counter+=len(dup_peaks)
        #if there are duplicate peaks:
        if len(dup_peaks) > 0:
            multi_peakmatch_file_counter+=1
            #group cv_df by m/z and filter to max confidence score
            idx = csv_df.groupby('m/z')['Confidence Score'].max()
            #merge back with original df (that has all columns) to filter duplicate peak entries
            csv_df=csv_df.merge(idx,on=['m/z','Confidence Score'])


        #make dictionary
        #columns to be made into lists
        mol_df=csv_df[['Molecular Formula','H/C','O/C','Confidence Score']]

        #drop unassigned peaks
        mol_df=mol_df.dropna(subset='Molecular Formula')

        #append dictionary
        mol_dict.append({'processed':processed,
                         'sample_type':sample_type,
                         'assigned_peak_count':assigned,
                         'assigned_perc':assigned_perc,
                         'mol_form':mol_df['Molecular Formula'].to_list(),
                         'H/C':mol_df['H/C'].to_list(),
                         'O/C':mol_df['O/C'].to_list(),
                         'Confidence Score':mol_df['Confidence Score'].to_list()
                        })
    
    #if error print info
    except AssertionError as ae:
        print(f"Assertion error occurred: {ae}")
        errors["processed_id"] = processed
        errors["url"] = url
        break

    #if error print info
    except Exception as e:
        print(f"An error occurred: {e}")
        errors["processed_id"] = processed
        errors["url"] = url
        continue

#turn dictionaries into dataframes
nom_summary_df=pd.DataFrame(mol_dict)

nom_summary_df

Processed 50 rows

Processed 100 rows

Processed 150 rows

Processed 200 rows

peak_count_plot=sns.FacetGrid(nom_summary_df,col="sample_type",hue="sample_type",sharex=False,sharey=False)
peak_count_plot.map(sns.histplot,'assigned_peak_count')
peak_count_plot.set_xlabels("number of identified peaks")
peak_count_plot.set_ylabels("data set count")
peak_count_plot.set_titles(col_template="{col_name}")

peak_perc_plot=sns.FacetGrid(nom_summary_df,col="sample_type",hue="sample_type",sharex=False,sharey=False)
peak_perc_plot.map(sns.histplot,'assigned_perc')
peak_perc_plot.set_xlabels("percent of peaks identified")
peak_perc_plot.set_ylabels("data set count")
peak_perc_plot.set_titles(col_template="{col_name}")

<seaborn.axisgrid.FacetGrid at 0x7f3ce9204190>

#filter data sets according to stats on peak assignment
nom_filt=nom_summary_df[nom_summary_df['assigned_peak_count']>=250]
nom_filt=nom_filt[nom_filt['assigned_perc']>=0.25]

#expand listed columns in molecular formula df
nom_filt_expanded=nom_filt.explode(['mol_form','H/C','O/C','Confidence Score'])

#resave expanded columns as numeric
nom_filt_expanded['O/C']=pd.to_numeric(nom_filt_expanded['O/C'])
nom_filt_expanded['H/C']=pd.to_numeric(nom_filt_expanded['H/C'])
nom_filt_expanded['Confidence Score']=pd.to_numeric(nom_filt_expanded['Confidence Score'])

#metadata group column
grouping_column='sample_type'

#count number of datasets in each type
count_type=nom_filt_expanded[['processed',grouping_column]].drop_duplicates().value_counts(grouping_column)

count_type

sample_type
soil        33
water       15
sediment     8
sand         6
Name: count, dtype: int64

#determine sampling size based on counts above
n=min(pd.DataFrame(count_type)['count'])

#list the different types
list_type=count_type.index.tolist()

#for each type, randomly sample n data sets and save them into list
nom_sampled=[]
for type in list_type:
    #each processed ID and sample type
    nom_type=nom_filt_expanded[['processed',grouping_column]].drop_duplicates()
    #filter to current sample type
    nom_type=nom_type[nom_type[grouping_column]==type]
    #randomly sample n processed IDs in current sample type
    nom_type=nom_type.sample(n=n, random_state=2)
    #save
    nom_sampled.append(nom_type)

#resave list as dataframe
nom_sampled=pd.concat(nom_sampled)

#remerge rest of the data for the sampled data sets
nom_sampled=nom_sampled.merge(nom_filt_expanded,on=['processed',grouping_column],how="left")

nom_sampled

#histogram of confidence scores for randomly selected samples
nom_sampled.hist("Confidence Score",bins=np.arange(0,1,0.1))
plt.title('Confidence Scores from Sampled Data Sets')
plt.xlabel("Confidence Score")
plt.xticks(np.arange(0,1,0.1))
plt.ylabel("Frequency")
plt.show()

#peak filtering by confidence score
conf_filt=nom_sampled[nom_sampled['Confidence Score']>=0.3]

#count for each molecular formula
mol_counts=conf_filt.value_counts('mol_form').to_frame().reset_index()

#histogram of molecular formula counts
mol_counts.hist("count",bins=np.arange(0,50,5))
plt.locator_params(axis='x')
plt.title('Molecular Formula Counts Across Sampled Data Sets')
plt.xlabel("Number of Occurences Across Data Sets")
plt.xticks(np.arange(0,50,5))
plt.ylabel("Number of Molecular Formula")
plt.show()

#based on this histogram, filter to formulas in more than 5 data sets
mol_counts=mol_counts[mol_counts['count']>=5]
mol_filter=mol_counts.merge(conf_filt,on=['mol_form'],how="left")

#sample type colors

#set colors for each sample type
type_col=pd.DataFrame({grouping_column:mol_filter[grouping_column].unique(),'color':['#8B4513','olivedrab','blue','#B8860B']})

#setup color legend for sample type
type_col_dict = dict(zip(type_col[grouping_column].unique(), ['#8B4513','olivedrab','blue','#B8860B']))
handles = [Patch(facecolor=type_col_dict[name]) for name in type_col_dict]

#map graph colors based on sample type to processed IDs
sample_type=mol_filter[[grouping_column,'processed']].drop_duplicates()
sample_type_col=sample_type.merge(type_col,how='left',on=grouping_column).set_index('processed').drop(grouping_column,axis=1).rename(columns={'color':grouping_column})

#presence/absence

#set colors for presence/absence as cmap
colors=['white','black']
custom_palette = sns.color_palette(colors)
custom_cmap = sns.color_palette(custom_palette, as_cmap=True)

#add column indicating presence in that processed nom id (1 is present)
mol_filter['presence']=1

#create presence/absence matrix. replace NA with 0 (0 is absent)
formula_matrix=mol_filter[['mol_form','processed','presence']].pivot_table('presence', index='mol_form', columns='processed').fillna(0).astype(int)

#heatmap (1 is present, zero is absent)
g=sns.clustermap(data=formula_matrix,col_colors=sample_type_col,tree_kws={"linewidths": 0.},xticklabels=False,yticklabels=False,cmap=custom_cmap)
g.figure.suptitle("Presence of Molecular Formulas Across NOM Data Sets")
g.ax_heatmap.set_xlabel("Processed NOM Data Sets")
g.ax_heatmap.set_ylabel("Molecular Formulas")

#adjust plot and legend locations
g.figure.subplots_adjust(top=1.15,right=0.8)
g.ax_cbar.set_position((0.85, 0.45, .05, .05)) #x axis,y axis, width, height

#adjust cbar legend to indicate presence/absence
g.ax_cbar.set_yticks([0.25,0.75])
g.ax_cbar.set_yticklabels(["absent","present"])

#add sample type legend
plt.legend(handles, type_col_dict, title=None,
           bbox_to_anchor=(0.95, 1.05), bbox_transform=plt.gcf().transFigure)

/opt/hostedtoolcache/Python/3.11.14/x64/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)

/opt/hostedtoolcache/Python/3.11.14/x64/lib/python3.11/site-packages/seaborn/matrix.py:560: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)

<matplotlib.legend.Legend at 0x7f3ce8b380d0>

# the same molecular formula will have the same H/C and O/C value in every data set, so dots are only unique per sample type
vankrev_data=mol_filter[[grouping_column,'mol_form','H/C','O/C']].drop_duplicates()

#make van krevlen plot
g=sns.FacetGrid(vankrev_data,col=grouping_column,hue=grouping_column,palette=type_col_dict)
g.map(sns.scatterplot,'O/C','H/C',s=10)
g.set_titles(col_template="{col_name}")
g.figure.suptitle("Van Krevelen Plots By General Sample Type")
g.figure.subplots_adjust(top=0.8)

#count the number of sample types in which each molecular formula is found
density_counts=vankrev_data['mol_form'].value_counts()

#map counts back to H/C and O/C data
vankrev_data['common_to'] = vankrev_data['mol_form'].map(density_counts)

#filter molecular formual present in either all four sample types or in only one
vankrev_data=vankrev_data[vankrev_data['common_to'].isin([1,4])]

#reformat so sample type is 'all' when molecular formula was in all
vankrev_data.loc[vankrev_data['common_to']==4,'sample_type']='all types'

#add 'all' to sample type color dictionary
type_col_dict.update({'all types':'black'})

#make marginal density plot
sns.jointplot(data=vankrev_data, x="O/C", y="H/C", kind="scatter", hue=grouping_column,palette=type_col_dict,s=12,alpha=0.5)
plt.legend(markerscale=3,title="unique to")
plt.suptitle("Marginal Density Plot of Molecular Formulas by Sample Type",y=1.02)
plt.show()

	processed_nom_id	processed_nom_md5_checksum	processed_nom_url
0	nmdc:dobj-11-003x7710	0391d9ff37a1926d5cf0eaec126ad4ff	https://nmdcdemo.emsl.pnnl.gov/nom/blanchard/r...
1	nmdc:dobj-11-00dewm52	2a532dca15798e470103ebd752a0937f	https://nmdcdemo.emsl.pnnl.gov/nom/1000soils/r...
2	nmdc:dobj-11-00wm3313	3ce562ac512457ea54bdda05a4f01ede	https://nmdcdemo.emsl.pnnl.gov/nom/1000soils/r...
3	nmdc:dobj-11-01kye625	38930c28eae561bc807bd01823f04167	https://nmdcdemo.emsl.pnnl.gov/nom/1000soils/r...
4	nmdc:dobj-11-02trja88	e6bafa5fabbebfb0061aa2587e223979	https://nmdcdemo.emsl.pnnl.gov/nom/grow/result...
...	...	...	...
2578	nmdc:dobj-13-zrp1qw41	98b97b78fff542b66e72f4b3f792d80f	https://nmdcdemo.emsl.pnnl.gov/nom/results/SBR...
2579	nmdc:dobj-13-zsqpnm92	3e9e19910edb209d211d9f915e36b8cb	https://nmdcdemo.emsl.pnnl.gov/nom/results/Bro...
2580	nmdc:dobj-13-zvnmsp76	aec0521d6a36a440e41052f8eadc0d1d	https://nmdcdemo.emsl.pnnl.gov/nom/results/Ung...
2581	nmdc:dobj-13-zvzx2462	9f0d52cc46d247b8d2ba12d5842b9fb6	https://nmdcdemo.emsl.pnnl.gov/nom/results/Bro...
2582	nmdc:dobj-13-zye5fe51	8ddaeeffe93db9c4258a03e881d329cf	https://nmdcdemo.emsl.pnnl.gov/nom/results/Bro...

	analysis_id	analysis_has_input	analysis_has_output
0	nmdc:wfnom-11-dv3wck24.1	[nmdc:dobj-11-2ysghy98, nmdc:dobj-11-bmeg5b74]	[nmdc:dobj-11-003x7710]
1	nmdc:wfnom-11-0mqv1c63.1	[nmdc:dobj-11-3hfzr472]	[nmdc:dobj-11-00dewm52]
2	nmdc:wfnom-11-twkd5a03.1	[nmdc:dobj-11-2mk4qf09]	[nmdc:dobj-11-00wm3313]
3	nmdc:wfnom-11-ftaq2319.1	[nmdc:dobj-11-7wynt887]	[nmdc:dobj-11-01kye625]
4	nmdc:wfnom-11-2dcp9q04.1	[nmdc:dobj-11-ge6pre79]	[nmdc:dobj-11-02trja88]
...	...	...	...
2578	nmdc:wfnom-13-08q56295.1	[nmdc:dobj-13-vg0tgv60]	[nmdc:dobj-13-zrp1qw41]
2579	nmdc:wfnom-13-yy5yqx31.1	[nmdc:dobj-13-w15zk074]	[nmdc:dobj-13-zsqpnm92]
2580	nmdc:wfnom-13-4xrsd836.1	[nmdc:dobj-13-3rmgcm64]	[nmdc:dobj-13-zvnmsp76]
2581	nmdc:wfnom-13-h0r53g59.1	[nmdc:dobj-13-m3zkxg02]	[nmdc:dobj-13-zvzx2462]
2582	nmdc:wfnom-13-4vmrb525.1	[nmdc:dobj-13-60z8rt70]	[nmdc:dobj-13-zye5fe51]

	data_object_type	raw_id	raw_name
0	Direct Infusion FT ICR-MS Raw Data	nmdc:dobj-11-04embv91	Lybrand_FT_62_W_23Aug19_Alder_Infuse_p3_1_01_4...
1	Direct Infusion FT ICR-MS Raw Data	nmdc:dobj-11-04ny1n21	Lybrand_FT_36_C_30Aug19_Alder_Infuse_p05_1_01_...
2	Direct Infusion FT ICR-MS Raw Data	nmdc:dobj-11-09p17z03	Lybrand_Permafrost_BOG_14_CHCl3_13Dec19_Alder_...
3	Direct Infusion FT ICR-MS Raw Data	nmdc:dobj-11-0cmhqk17	WHONDRS_S19S_0059_ICR_1_43_Alder_Inf_13Sept19_...
4	Direct Infusion FT ICR-MS Raw Data	nmdc:dobj-11-10madc26	WHONDRS_S19S_015_17Mar20_Alder_Infuse_P2_1_01_...
...	...	...	...
2578	Direct Infusion FT ICR-MS Raw Data	nmdc:dobj-13-vcsgr863	output: Brodie_181_MeOH_R2_21Mar19_HESI_Neg
2579	Direct Infusion FT ICR-MS Raw Data	nmdc:dobj-13-vg0tgv60	output: SBR_FC_S3_30-40_H2Oext_13Oct15_Leopard...
2580	Direct Infusion FT ICR-MS Raw Data	nmdc:dobj-13-vya84d93	output: Brodie_152_w_r2_29Jan19_HESI_neg
2581	Direct Infusion FT ICR-MS Raw Data	nmdc:dobj-13-w15zk074	output: Brodie_135A_H2O_15Oct18_IAT_p1_1_01_35894
2582	Direct Infusion FT ICR-MS Raw Data	nmdc:dobj-13-zazrqk87	output: Brodie_185_w_r1_01Feb19_HESI_neg

	datageneration_id	datageneration_has_output	datageneration_has_input
0	nmdc:omprc-11-vfj8z279	[nmdc:dobj-11-01q67939]	[nmdc:procsm-11-n1b2qg63]
1	nmdc:omprc-11-adjx8k29	[nmdc:dobj-11-04embv91]	[nmdc:procsm-11-xsfbpz40]
2	nmdc:omprc-11-w1kvtj73	[nmdc:dobj-11-04ny1n21]	[nmdc:procsm-11-f7kfky42]
3	nmdc:omprc-11-rj3bqn04	[nmdc:dobj-11-09p17z03]	[nmdc:procsm-11-ckacrz04]
4	nmdc:omprc-11-a1szxs11	[nmdc:dobj-11-0cmhqk17]	[nmdc:procsm-11-zq6vzm59]
...	...	...	...
2578	nmdc:omprc-11-077nww93	[nmdc:dobj-13-zjrg8w43]	[nmdc:procsm-11-s1ypy935]
2579	nmdc:omprc-11-s9xtq276	[nmdc:dobj-13-zpzv3z08]	[nmdc:procsm-11-2nexmt97]
2580	nmdc:omprc-13-8hsd4d19	[nmdc:dobj-13-zymvcs73]	[nmdc:procsm-11-4bpnn544]
2581	nmdc:omprc-13-ef1v6r85	[nmdc:dobj-13-zz1a8884]	[nmdc:procsm-11-8hrn7z73]
2582	nmdc:omprc-13-xemx6b61	[nmdc:dobj-13-zzzyae97]	[nmdc:procsm-11-t7qjg604]

	biosample_id	datageneration_id	datageneration_has_output
0	nmdc:bsm-13-gxb7yg84	nmdc:omprc-13-zr3chw40	nmdc:dobj-13-ar897938
1	nmdc:bsm-13-gxb7yg84	nmdc:omprc-13-5dx8hd45	nmdc:dobj-13-tnr8r676
2	nmdc:bsm-13-pwrd7087	nmdc:omprc-13-jsn1dv78	nmdc:dobj-13-9w8tgj98
3	nmdc:bsm-13-pwrd7087	nmdc:omprc-13-11nhsd04	nmdc:dobj-13-jgx9v635
4	nmdc:bsm-13-pwrd7087	nmdc:omprc-13-x5ht3965	nmdc:dobj-13-v5mwxs63
...	...	...	...
3742	nmdc:bsm-11-8gmz9s21	nmdc:omprc-11-zxwaqh14	nmdc:dobj-11-3gc9j491
3743	nmdc:bsm-11-8gmz9s21	nmdc:omprc-11-vmf9t442	nmdc:dobj-11-3gdemg77
3744	nmdc:bsm-11-2pympb23	nmdc:omprc-11-x7se8f64	nmdc:dobj-11-vwvjp551
3745	nmdc:bsm-11-2pympb23	nmdc:omprc-11-g4jv2x30	nmdc:dobj-11-hnk8w453
3746	nmdc:bsm-11-2pympb23	nmdc:omprc-11-yn7s0787	nmdc:dobj-11-pd3xmr64

Data Exploration and Visualization of NOM Data in NMDC (FT ICR-MS)¶

Define a function to split a list into chunks¶

Define a function to get a list of ids from initial results¶

Gather the IDs for processed NOM results in NMDC by filtering for data objects of type "Direct Infusion FT ICR-MS"¶

Continue traversing the NMDC schema by using the list of identifiers from the previous API call to query the next collection¶

Create final data frame of relevant metadata and NMDC schema information for each NOM processed data object¶

Calculate quality control statistics on the sampled processed nom results (this can take a while to run)¶

Perform quality control filtering of data sets using the chosen metadata information¶

Assess patterns in the molecular formulas from different sample types¶

	type	name	associated_studies	env_broad_scale	env_local_scale	env_medium	samp_name	collection_date	depth	ecosystem	...	samp_collec_method	sieving	tot_phosp	collection_time	carb_nitro_ratio	water_content	soil_horizon	tot_org_carb	samp_taxon_id	sample_link
0	nmdc:Biosample	BW-C-7-M	[nmdc:sty-11-8ws97026]	{'type': 'nmdc:ControlledIdentifiedTermValue',...	{'type': 'nmdc:ControlledIdentifiedTermValue',...	{'type': 'nmdc:ControlledIdentifiedTermValue',...	BW-C-7-M	{'type': 'nmdc:TimestampValue', 'has_raw_value...	{'type': 'nmdc:QuantityValue', 'has_raw_value'...	Environmental	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	nmdc:Biosample	Bulk soil microbial communities from the East ...	[nmdc:sty-11-dcqce727]	{'has_raw_value': 'ENVO:01000177', 'term': {'i...	{'has_raw_value': 'ENVO:00000292', 'term': {'i...	{'has_raw_value': 'ENVO:00005802', 'term': {'i...	ER_369	{'has_raw_value': '2017-09-15', 'type': 'nmdc:...	{'has_raw_value': '0.15', 'has_numeric_value':...	Environmental	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	nmdc:Biosample	Bulk soil microbial communities from the East ...	[nmdc:sty-11-dcqce727]	{'has_raw_value': 'ENVO:01000177', 'term': {'i...	{'has_raw_value': 'ENVO:00000292', 'term': {'i...	{'has_raw_value': 'ENVO:00005802', 'term': {'i...	ER_181	{'has_raw_value': '2017-06-08', 'type': 'nmdc:...	{'has_raw_value': '0.0', 'has_numeric_value': ...	Environmental	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	nmdc:Biosample	Bulk soil microbial communities from the East ...	[nmdc:sty-11-dcqce727]	{'has_raw_value': 'ENVO:01000177', 'term': {'i...	{'has_raw_value': 'ENVO:00000292', 'term': {'i...	{'has_raw_value': 'ENVO:00005802', 'term': {'i...	ER_379	{'has_raw_value': '2017-09-15', 'type': 'nmdc:...	{'has_raw_value': '0.0', 'has_numeric_value': ...	Environmental	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	nmdc:Biosample	Bulk soil microbial communities from the East ...	[nmdc:sty-11-dcqce727]	{'has_raw_value': 'ENVO:01000177', 'term': {'i...	{'has_raw_value': 'ENVO:00000292', 'term': {'i...	{'has_raw_value': 'ENVO:00005802', 'term': {'i...	ER_356	{'has_raw_value': '2017-09-15', 'type': 'nmdc:...	{'has_raw_value': '0.05', 'has_numeric_value':...	Environmental	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1031	nmdc:Biosample	WHONDRS_S19S_0044_SW	[nmdc:sty-11-5tgfr349]	{'type': 'nmdc:ControlledIdentifiedTermValue',...	{'type': 'nmdc:ControlledIdentifiedTermValue',...	{'type': 'nmdc:ControlledIdentifiedTermValue',...	NaN	{'has_raw_value': '2019-08-06', 'type': 'nmdc:...	NaN	Environmental	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	{'has_raw_value': 'freshwater metagenome [NCBI...	NaN
1032	nmdc:Biosample	WHONDRS_S19S_0041_SW	[nmdc:sty-11-5tgfr349]	{'type': 'nmdc:ControlledIdentifiedTermValue',...	{'type': 'nmdc:ControlledIdentifiedTermValue',...	{'type': 'nmdc:ControlledIdentifiedTermValue',...	NaN	{'has_raw_value': '2019-08-05', 'type': 'nmdc:...	NaN	Environmental	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	{'has_raw_value': 'freshwater metagenome [NCBI...	NaN
1033	nmdc:Biosample	WHONDRS_S19S_0069_SW	[nmdc:sty-11-5tgfr349]	{'type': 'nmdc:ControlledIdentifiedTermValue',...	{'type': 'nmdc:ControlledIdentifiedTermValue',...	{'type': 'nmdc:ControlledIdentifiedTermValue',...	NaN	{'has_raw_value': '2019-08-12', 'type': 'nmdc:...	NaN	Environmental	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	{'has_raw_value': 'freshwater metagenome [NCBI...	NaN
1034	nmdc:Biosample	WHONDRS_S19S_0078_SW	[nmdc:sty-11-5tgfr349]	{'type': 'nmdc:ControlledIdentifiedTermValue',...	{'type': 'nmdc:ControlledIdentifiedTermValue',...	{'type': 'nmdc:ControlledIdentifiedTermValue',...	NaN	{'has_raw_value': '2019-09-06', 'type': 'nmdc:...	NaN	Environmental	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	{'has_raw_value': 'freshwater metagenome [NCBI...	NaN
1035	nmdc:Biosample	WHONDRS_S19S_0021_SW	[nmdc:sty-11-5tgfr349]	{'type': 'nmdc:ControlledIdentifiedTermValue',...	{'type': 'nmdc:ControlledIdentifiedTermValue',...	{'type': 'nmdc:ControlledIdentifiedTermValue',...	NaN	{'has_raw_value': '2019-08-20', 'type': 'nmdc:...	NaN	Environmental	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	{'has_raw_value': 'freshwater metagenome [NCBI...	NaN

	biosample_id	geo_loc_name.has_raw_value	env_medium.term.name	env_medium.term.id	sample_type
0	nmdc:bsm-11-042nd237	USA: Massachusetts, Petersham	forest soil	ENVO:00002261	soil
1	nmdc:bsm-11-07spy989	USA: Colorado	NaN	ENVO:00005802	soil
2	nmdc:bsm-11-0pq46m48	USA: Colorado	NaN	ENVO:00005802	soil
3	nmdc:bsm-11-29d9a657	USA: Colorado	NaN	ENVO:00005802	soil
4	nmdc:bsm-11-2h2j6s31	USA: Colorado	NaN	ENVO:00005802	soil
...	...	...	...	...	...
1031	nmdc:bsm-11-x1bg5007	USA: South Carolina, Cordesville	surface water	ENVO:00002042	water
1032	nmdc:bsm-11-xnhqfj54	USA: Oregon, Prineville	surface water	ENVO:00002042	water
1033	nmdc:bsm-11-xy9ahf25	USA: California, Ft. Bragg	surface water	ENVO:00002042	water
1034	nmdc:bsm-11-yx0xgn55	USA: Oregon, Tyee	surface water	ENVO:00002042	water
1035	nmdc:bsm-11-zjscaw68	USA: Alabama, Demopolis	surface water	ENVO:00002042	water

	biosample_id	sample_type	processed_nom_id	processed_nom_url
0	nmdc:bsm-11-cwey4y35	soil	nmdc:dobj-11-003x7710	https://nmdcdemo.emsl.pnnl.gov/nom/blanchard/r...
1	nmdc:bsm-11-sk8vv550	soil	nmdc:dobj-11-00dewm52	https://nmdcdemo.emsl.pnnl.gov/nom/1000soils/r...
2	nmdc:bsm-11-yzgc1096	soil	nmdc:dobj-11-00wm3313	https://nmdcdemo.emsl.pnnl.gov/nom/1000soils/r...
3	nmdc:bsm-11-e3m5pt51	soil	nmdc:dobj-11-01kye625	https://nmdcdemo.emsl.pnnl.gov/nom/1000soils/r...
4	nmdc:bsm-11-80jr8p82	soil	nmdc:dobj-11-02trja88	https://nmdcdemo.emsl.pnnl.gov/nom/grow/result...
...	...	...	...	...
2578	nmdc:bsm-13-tr7n0581	sand	nmdc:dobj-13-zrp1qw41	https://nmdcdemo.emsl.pnnl.gov/nom/results/SBR...
2579	nmdc:bsm-11-hf7a3g98	soil	nmdc:dobj-13-zsqpnm92	https://nmdcdemo.emsl.pnnl.gov/nom/results/Bro...
2580	nmdc:bsm-13-4pbbvj63	sand	nmdc:dobj-13-zvnmsp76	https://nmdcdemo.emsl.pnnl.gov/nom/results/Ung...
2581	nmdc:bsm-11-mbnqn650	soil	nmdc:dobj-13-zvzx2462	https://nmdcdemo.emsl.pnnl.gov/nom/results/Bro...
2582	nmdc:bsm-11-k05a2416	soil	nmdc:dobj-13-zye5fe51	https://nmdcdemo.emsl.pnnl.gov/nom/results/Bro...

	processed_nom_id	sample_type	biosample_id	processed_nom_url
0	nmdc:dobj-11-dn0zxg92	soil	nmdc:bsm-11-xvnjap82	https://nmdcdemo.emsl.pnnl.gov/nom/grow/result...
1	nmdc:dobj-11-bzg32p17	soil	nmdc:bsm-11-rrmwch98	https://nmdcdemo.emsl.pnnl.gov/nom/neon/lybran...
2	nmdc:dobj-11-nvb86885	soil	nmdc:bsm-11-pdtj6h88	https://nmdcdemo.emsl.pnnl.gov/nom/1000soils/r...
3	nmdc:dobj-11-wzyfp957	soil	nmdc:bsm-11-6p6mhh86	https://nmdcdemo.emsl.pnnl.gov/nom/1000soils/r...
4	nmdc:dobj-13-9gwc6a87	soil	nmdc:bsm-11-q71pns38	https://nmdcdemo.emsl.pnnl.gov/nom/results/Bro...
...	...	...	...	...
195	nmdc:dobj-13-cw804j48	sand	nmdc:bsm-13-ap544f22	https://nmdcdemo.emsl.pnnl.gov/nom/results/SBR...
196	nmdc:dobj-13-gatmnn76	sand	nmdc:bsm-13-xvyhr616	https://nmdcdemo.emsl.pnnl.gov/nom/results/SBR...
197	nmdc:dobj-13-nsfrf494	sand	nmdc:bsm-13-sy2a9t67	https://nmdcdemo.emsl.pnnl.gov/nom/results/SBR...
198	nmdc:dobj-13-qdpv9v42	sand	nmdc:bsm-13-mk8xn190	https://nmdcdemo.emsl.pnnl.gov/nom/results/Ung...
199	nmdc:dobj-13-bdyz9y36	sand	nmdc:bsm-13-sy2a9t67	https://nmdcdemo.emsl.pnnl.gov/nom/results/Ung...

	Index	m/z	Calibrated m/z	Calculated m/z	Peak Height	Peak Area	Resolving Power	S/N	Ion Charge	m/z Error (ppm)	...	Ion Type	Is Isotopologue	Mono Isotopic Index	Molecular Formula	C	H	O	S	N	13C
0	0	900.125820	900.126036	900.126495	1.345499e+06	16585.425742	7.632381e+04	8.758887	-1	-0.510143	...	de-protonated	0.0	NaN	C42 H31 O22 N1	42.0	31.0	22.0	NaN	1.0	NaN
1	4	899.649633	899.649848	899.649888	1.014133e+06	10281.746451	7.636398e+04	6.601772	-1	-0.043821	...	de-protonated	0.0	NaN	C47 H96 O13 S1	47.0	96.0	13.0	1.0	NaN	NaN
2	5	899.634336	899.634551	899.634770	1.213550e+06	11139.424542	9.163838e+04	7.899927	-1	-0.242915	...	de-protonated	0.0	NaN	C64 H84 O3	64.0	84.0	3.0	NaN	NaN	NaN
3	6	899.322820	899.323034	899.322557	9.292093e+05	13168.509602	6.547870e+04	6.048938	-1	0.531086	...	de-protonated	0.0	NaN	C59 H48 O9	59.0	48.0	9.0	NaN	NaN	NaN
4	7	899.145691	899.145905	899.146502	1.297759e+06	10598.710188	1.146105e+05	8.448112	-1	-0.663726	...	de-protonated	0.0	NaN	C47 H32 O19	47.0	32.0	19.0	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
10332	10332	105.150552	105.150491	NaN	9.256716e+05	129.799973	7.840292e+05	6.025908	-1	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
10333	10333	105.047562	105.047501	NaN	9.549125e+05	131.008418	6.539981e+05	6.216260	-1	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
10334	10334	104.965050	104.964989	NaN	1.008435e+06	182.325337	6.545125e+05	6.564676	-1	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
10335	10335	104.014316	104.014256	NaN	9.708627e+05	100.081030	9.907425e+05	6.320091	-1	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
10336	10336	102.197131	102.197074	NaN	1.071442e+06	107.653102	1.008359e+06	6.974842	-1	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	processed	sample_type	assigned_peak_count	assigned_perc	mol_form	H/C	O/C	Confidence Score
0	nmdc:dobj-11-dn0zxg92	soil	4150	0.401470	[C42 H31 O22 N1, C47 H96 O13 S1, C64 H84 O3, C...	[0.7380952380952381, 2.042553191489361, 1.3125...	[0.5238095238095238, 0.2765957446808511, 0.046...	[0.572325778425202, 0.5997909766162336, 0.5936...
1	nmdc:dobj-11-bzg32p17	soil	2320	0.420442	[C8 H14 O2, C8 H16 O2, C6 H10 O4, C7 H14 O3, C...	[1.75, 2.0, 1.6666666666666667, 2.0, 1.8, 1.6,...	[0.25, 0.25, 0.6666666666666666, 0.42857142857...	[0.5813567487211841, 0.5996789440442316, 0.599...
2	nmdc:dobj-11-nvb86885	soil	2466	0.695628	[C6 H10 O3, C6 H12 O3, C7 H6 O3, C7 H10 O3, C7...	[1.6666666666666667, 2.0, 0.8571428571428571, ...	[0.5, 0.5, 0.4285714285714285, 0.4285714285714...	[0.5998304006015176, 0.5999928489476878, 0.599...
3	nmdc:dobj-11-wzyfp957	soil	2969	0.757205	[C7 H6 O2, C5 H7 O3 N1, C6 H10 O3, C6 H12 O3, ...	[0.8571428571428571, 1.4, 1.6666666666666667, ...	[0.2857142857142857, 0.6, 0.5, 0.5, 0.42857142...	[0.5999382633456966, 0.5999987827286737, 0.599...
4	nmdc:dobj-13-9gwc6a87	soil	104	0.832000	[C6 H13 O6 S1 N1, C12 H20 O4, C13 H18 O4, C13 ...	[2.1666666666666665, 1.6666666666666667, 1.384...	[1.0, 0.3333333333333333, 0.3076923076923077, ...	[0.5992879007128323, 0.5999074381538587, 0.599...
...	...	...	...	...	...	...	...	...
195	nmdc:dobj-13-cw804j48	sand	11	0.113402	[C33 H29 O5 S1 N1, C18 H32 O16, C35 H20 O4, C2...	[0.8787878787878788, 1.7777777777777777, 0.571...	[0.1515151515151515, 0.8888888888888888, 0.114...	[0.4776921049377935, 0.4483278164180113, 0.299...
196	nmdc:dobj-13-gatmnn76	sand	37	0.389474	[C50 H36 O7 S1, C33 H50 O19, C43 H36 O10, C52 ...	[0.72, 1.5151515151515151, 0.8372093023255814,...	[0.14, 0.5757575757575758, 0.2325581395348837,...	[0.4773613078718937, 0.521084749258177, 0.2915...
197	nmdc:dobj-13-nsfrf494	sand	13	0.168831	[C36 H40 O13 S1, C41 H33 O8 N1, C31 H38 O16, C...	[1.1111111111111112, 0.8048780487804879, 1.225...	[0.3611111111111111, 0.1951219512195122, 0.516...	[0.4426584900149493, 0.2778580181725424, 0.291...
198	nmdc:dobj-13-qdpv9v42	sand	8	0.163265	[C35 H38 O13, C38 H16, C17 H36 O10 S1, C19 H18...	[1.0857142857142856, 0.4210526315789473, 2.117...	[0.3714285714285714, 0.0, 0.5882352941176471, ...	[0.4727958776734131, 0.0189109588604013, 0.031...
199	nmdc:dobj-13-bdyz9y36	sand	318	0.850267	[C68 H127 O6 S1 N1, C58 H99 O17 N1, C64 H90 O1...	[1.8676470588235292, 1.706896551724138, 1.4062...	[0.088235294117647, 0.293103448275862, 0.17187...	[0.4662624446551346, 0.2880888834088559, 0.455...

	processed	sample_type	assigned_peak_count	assigned_perc	mol_form	H/C	O/C	Confidence Score
0	nmdc:dobj-11-ppa60081	soil	1147	0.648023	C16 H32 O3	2.000000	0.187500	0.599829
1	nmdc:dobj-11-ppa60081	soil	1147	0.648023	C18 H36 O2	2.000000	0.111111	0.597165
2	nmdc:dobj-11-ppa60081	soil	1147	0.648023	C16 H30 O4	1.875000	0.250000	0.594543
3	nmdc:dobj-11-ppa60081	soil	1147	0.648023	C17 H26 O4	1.529412	0.235294	0.599996
4	nmdc:dobj-11-ppa60081	soil	1147	0.648023	C18 H34 O3	1.888889	0.166667	0.599997
...	...	...	...	...	...	...	...	...
46456	nmdc:dobj-13-4z27e492	sand	299	0.810298	C15 H30 O2	2.000000	0.133333	0.594916
46457	nmdc:dobj-13-4z27e492	sand	299	0.810298	C15 H28 O2	1.866667	0.133333	0.592847
46458	nmdc:dobj-13-4z27e492	sand	299	0.810298	C8 H16 O6 S1	2.000000	0.750000	0.046416
46459	nmdc:dobj-13-4z27e492	sand	299	0.810298	C14 H28 O2	2.000000	0.142857	0.596208
46460	nmdc:dobj-13-4z27e492	sand	299	0.810298	C7 H12 O6 S1	1.714286	0.857143	0.006649