import requests
import pandas as pd
from io import StringIO
import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)
import nmdc_api_utilities

from nmdc_api_utilities.biosample_search import BiosampleSearch
from nmdc_api_utilities.data_processing import DataProcessing
# Create a BiosampleSearch object
bs_client = BiosampleSearch(env=ENV)
# create a DataProcessing object
dp_client = DataProcessing()
# define the filter
filter = '{"soil_horizon":{"$exists": true}, "geo_loc_name.has_raw_value": {"$regex": "Colorado"}}'
# get the results
bs_results = bs_client.get_record_by_filter(filter=filter, fields="id,soil_horizon,geo_loc_name", max_page_size=100, all_pages=True)
# clarify names
for biosample in bs_results:
    biosample["biosample_id"] = biosample.pop("id")

# convert to df
biosample_df = dp_client.convert_to_df(bs_results)

# Adjust geo_loc_name to not be a dictionary
biosample_df["geo_loc_name"] = biosample_df["geo_loc_name"].apply(lambda x: x.get("has_raw_value"))
biosample_df

# Now get all DataObjects linked to these biosamples
biosample_dataobject_dictionary = bs_client.get_linked_instances_and_associate_ids(
    ids=biosample_df["biosample_id"].tolist(),          # list of biosample ids
    types =["nmdc:DataObject"]                          # specify we want DataObjects only
    )

# The result is a dictionary with biosample ids as keys and list of linked DataObjects as values
print(f"Retrieved DataObjects for {len(biosample_dataobject_dictionary)} biosamples.")

Retrieved DataObjects for 506 biosamples.

# Gather all DataObject ids into a single list (from the dictionary of lists), then get their records
dojs = [item for sublist in biosample_dataobject_dictionary.values() for item in sublist]

# Now get records for all DataObjects records and convert to dataframe
from nmdc_api_utilities.data_object_search import DataObjectSearch
do_client = DataObjectSearch()
do_results = do_client.get_batch_records(
    id_list=dojs,
    search_field="id",
    fields="id,data_object_type,url"
)
data_object_df = dp_client.convert_to_df(do_results)
# Rename id column to data_object_id for clarity
data_object_df = data_object_df.rename(columns={"id": "data_object_id"})

# Finally, filter these DataObjects to only include those with `data_object_type` of "Scaffold Lineage tsv"
filtered_data_object_df = data_object_df[data_object_df["data_object_type"] == "Scaffold Lineage tsv"]
filtered_data_object_df.head(10)

# First define a function to get biosample_id from data_object_id
def get_biosample_id(data_object_id, biosample_dataobject_dict):
    for biosample_id, data_object_list in biosample_dataobject_dict.items():
        if data_object_id in data_object_list:
            return biosample_id
    return None

# Apply the function to add biosample_id column
filtered_data_object_df["biosample_id"] = filtered_data_object_df["data_object_id"].apply(lambda x: get_biosample_id(x, biosample_dataobject_dictionary))

# Merge with biosample_df to get soil_horizon and geo_loc_name
merged_df = pd.merge(filtered_data_object_df, biosample_df, on="biosample_id", how="left")
merged_df.head(10)

# Prepare a final DataFrame that contains unique entries for each DataObject, ensuring that there are no duplicate records. This step is crucial for accurate analysis and reporting, as it prevents any potential bias or errors that could arise from counting the same DataObject multiple times which can result if several biosamples are pooled together in a single metagenome assembly and share the same DataObject.
final_df = merged_df[["data_object_id", "data_object_type", "url","soil_horizon", "geo_loc_name"]].drop_duplicates()
final_df.head(10)

# Show unique soil horizons:
soil_horizons = final_df['soil_horizon'].value_counts()
print(soil_horizons)

soil_horizon
M horizon    177
O horizon     47
Name: count, dtype: int64

# randomly select 15 data sets in each horizon
n = 15

#list the different types
list_type=soil_horizons.index.tolist()

#for each type, randomly horizon n data sets and save them into list
random_subset=[]
for type in list_type:
    #each data object ID and horizon type
    sample_type=final_df[['data_object_id','soil_horizon']].drop_duplicates()
    #filter to current horizon type
    sample_type=sample_type[sample_type['soil_horizon']==type]
    #randomly horizon n data object IDs in current horizon type
    sample_type=sample_type.sample(n=n, random_state=2)
    #save
    random_subset.append(sample_type)

#resave list as dataframe
random_subset=pd.concat(random_subset).reset_index(drop=True)

#remerge rest of the data for the sampled data sets
final_df=random_subset.merge(final_df,on=['data_object_id','soil_horizon'],how="left")

final_df

tsv_ex_url = final_df.iloc[0]["url"]

response = requests.get(tsv_ex_url)
tsv_data = StringIO(response.text)

tsv_ex_df = pd.read_csv(tsv_data, delimiter="\t")
tsv_data.close()

# Give columns names
tsv_ex_df.columns = ["contig_id", "taxa", "initial_count"]

# sort by taxa
tsv_sorted = tsv_ex_df.sort_values(by="taxa")

# print first 10 rows
tsv_sorted[:10]

o_horizon = []
m_horizon = []
errors = []

iteration_counter = 0


for index, row in final_df.iterrows():
    
    iteration_counter += 1

    # print an update for every 50 iterations
    if iteration_counter % 50 == 0:
        print(f"Processed {iteration_counter} rows")

    url = row["url"]
    horizon = row["soil_horizon"]
    dataobj = row["data_object_id"]
    geo_loc = row["geo_loc_name"]
    data_object_id = row["data_object_id"]

    try:
        response = requests.get(url)
        tsv_data = StringIO(response.text)
    
        tsv_df = pd.read_csv(tsv_data, delimiter="\t")
        tsv_data.close()
    
        # Give columns names
        tsv_df.columns = ["contig_id", "taxa", "initial_count"]
    
        # split taxa column into a list where a semicolon (;) is the delimeter
        tsv_df["taxa"] = tsv_df["taxa"].str.split(";")

        # Get only the third element of the list of taxa (the phylum), add "Unknown" it it does not include phylum level, and add
        # "Unkown" if the taxa value is empty.
        tsv_df["taxa"] = tsv_df["taxa"].apply(lambda x: str(x[2]) if isinstance(x, list) and len(x) >= 3 
                                              else str(" ".join(x) + " Unknown") if isinstance(x, list) else "Unknown")


        # Get relative abundance for the tsv_df
        tsv_df = tsv_df.groupby("taxa").size().reset_index(name="count")
        total_count = tsv_df["count"].sum()
        tsv_df["relative_abundance"] = (tsv_df["count"] / total_count) * 100

        # Add geo location to data frame
        tsv_df["geo_loc_name"] = geo_loc

        # Add biosample id to data frame
        tsv_df["data_object_id"] = dataobj
        tsv_df["tsv_url"] = url

        # append tsv_df to list depending on the soil horizon type
        if horizon == "O horizon":
            o_horizon.append(tsv_df)
        else:
            m_horizon.append(tsv_df)

    except Exception as e:
        print(f"An error occurred: {e}")
        errors.append({
            "data_ob_id": dataobj,
            "url": url,
            "horizon": horizon,
            "geo_loc_name": geo_loc
                                    })
        continue

# concatenate list of dfs
o_df = pd.concat(o_horizon)
m_df = pd.concat(m_horizon)

m_df

print(errors)

[]

def taxa_abundance(df):

    df = df.drop_duplicates(subset=['data_object_id', 'taxa'])

    # pivot the table to find all combos of biosample and taxa - set NAs to 0 for relative abundance
    wide_df = df.pivot(index = "data_object_id", columns = "taxa", values = "relative_abundance")
    wide_df = wide_df.fillna(0)
    wide_df.reset_index(inplace=True)
    
    # convert wide_df back with relative_abundances set to 0 for samples that were missing taxa
    melted_df = pd.melt(wide_df, id_vars = "data_object_id", var_name = "taxa", value_name = "relative_abundance")

    # calculate abundance and add column to data frame
    final_df = melted_df.groupby("taxa")["relative_abundance"].mean().reset_index(name="avg_relative_abundance")

    return final_df

# caculate abundance for each soil horizon type and get top 25 taxa, grouping the rest
m_final = taxa_abundance(m_df)
o_final = taxa_abundance(o_df)

# combine data frames
o_final["soil_horizon"] = "O"
m_final["soil_horizon"] = "M"
abundance_df = pd.concat([o_final, m_final])

abundance_df

# Plot the taxa abundance of each soil type
fig = px.bar(abundance_df, x="soil_horizon", y="avg_relative_abundance", color="taxa", 
             title = "% Abundance of phylum-level taxa in M and O horizon soil samples in Colorado", 
             labels = {"soil_horizon": "Soil Horizon", "avg_relative_abundance": "% Abundance"})
    
fig.update_layout(height=600)
fig.show()

def loc_abund(df):

    df = df.drop_duplicates(subset=['data_object_id', 'taxa'])

    # pivot the table to find all combos of biosample and taxa - set NAs to 0 for relative abundance
    wide_df = df.pivot(index = "data_object_id", columns = "taxa", values = "relative_abundance")
    wide_df = wide_df.fillna(0)
    wide_df.reset_index(inplace=True)

    # Add geo_loc_name column to wide_df
    wide_df = pd.merge(wide_df, df[['data_object_id', 'geo_loc_name']], on='data_object_id', how='left')
    
    # convert wide_df back with relative_abundances set to 0 for samples that were missing taxa
    melted_df = pd.melt(wide_df, id_vars=["data_object_id", "geo_loc_name"], var_name="taxa", value_name="relative_abundance")

    final_df = melted_df.groupby(["geo_loc_name", "taxa"])["relative_abundance"].mean().reset_index(name="avg_relative_abundance")

    return final_df

# caculate abundance for each soil horizon type and get top 5 taxa, grouping the rest
m_loc = loc_abund(m_df)
o_loc = loc_abund(o_df)

# combine data frames
o_loc["soil_horizon"] = "O"
m_loc["soil_horizon"] = "M"
loc_abund_df = pd.concat([o_loc, m_loc])

# Extract only region names from geo_loc_name
loc_abund_df["location"] = loc_abund_df["geo_loc_name"].str.extract(r'Colorado, (.*)')

loc_abund_df

geo_fig = px.bar(loc_abund_df, x = "soil_horizon", y="avg_relative_abundance", color = "taxa", 
                 facet_col = "location",
                 facet_col_spacing = 0.1,
                 title = "% Abundance of phylum-level taxa in M and O horizon samples for each Colorado location", 
                 labels = {"geo_loc_name": "Location", "avg_relative_abundance": "% Abundance"},
                 height = 600)
# update figure to remove "location=" from facet column labels
geo_fig.for_each_annotation(lambda a: a.update(text=a.text.replace("location=", "")))

# show figure
geo_fig.show()

	soil_horizon	geo_loc_name	biosample_id
0	M horizon	USA: Colorado, Central Plains Experimental Range	nmdc:bsm-11-00m15h97
1	M horizon	USA: Colorado, Central Plains Experimental Range	nmdc:bsm-11-06ta8e31
2	O horizon	USA: Colorado, Rocky Mountains	nmdc:bsm-11-06tgpb52
3	M horizon	USA: Colorado, Central Plains Experimental Range	nmdc:bsm-11-0asn5d63
4	M horizon	USA: Colorado, North Sterling	nmdc:bsm-11-0djp2e45
...	...	...	...
522	M horizon	USA: Colorado, Central Plains Experimental Range	nmdc:bsm-11-zhrzwh12
523	M horizon	USA: Colorado, Niwot Ridge	nmdc:bsm-11-zhzner35
524	O horizon	USA: Colorado, Niwot Ridge	nmdc:bsm-11-zjsrkd21
525	O horizon	USA: Colorado, Niwot Ridge	nmdc:bsm-11-zk6h3328
526	M horizon	USA: Colorado, Niwot Ridge	nmdc:bsm-11-znvc3c66

	data_object_id	data_object_type	url
61	nmdc:dobj-11-nem7e417	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...
213	nmdc:dobj-11-58rpty77	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...
235	nmdc:dobj-11-c1xp4c62	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...
531	nmdc:dobj-11-b6yhf780	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...
754	nmdc:dobj-11-ezt80896	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...
805	nmdc:dobj-11-1h6mx015	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...
837	nmdc:dobj-11-dkz5e809	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:dgns...
869	nmdc:dobj-11-re3a2h35	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...
979	nmdc:dobj-11-s7dphe48	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...
997	nmdc:dobj-11-zhv2ak95	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:dgns...

	data_object_id	data_object_type	url	biosample_id	soil_horizon	geo_loc_name
0	nmdc:dobj-11-nem7e417	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	nmdc:bsm-11-kjwd7d38	O horizon	USA: Colorado, Niwot Ridge
1	nmdc:dobj-11-58rpty77	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	nmdc:bsm-11-dt5qbk86	M horizon	USA: Colorado, North Sterling
2	nmdc:dobj-11-c1xp4c62	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	nmdc:bsm-11-kjwd7d38	O horizon	USA: Colorado, Niwot Ridge
3	nmdc:dobj-11-b6yhf780	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	nmdc:bsm-11-0yw1rj05	M horizon	USA: Colorado, North Sterling
4	nmdc:dobj-11-ezt80896	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	nmdc:bsm-11-2jmhwj26	M horizon	USA: Colorado, North Sterling
5	nmdc:dobj-11-1h6mx015	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	nmdc:bsm-11-7kx9fs40	M horizon	USA: Colorado, North Sterling
6	nmdc:dobj-11-dkz5e809	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:dgns...	nmdc:bsm-11-2sjea508	M horizon	USA: Colorado, North Sterling
7	nmdc:dobj-11-re3a2h35	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	nmdc:bsm-11-hwaef990	M horizon	USA: Colorado, Central Plains Experimental Range
8	nmdc:dobj-11-s7dphe48	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	nmdc:bsm-11-kdjqae05	O horizon	USA: Colorado, Rocky Mountains
9	nmdc:dobj-11-zhv2ak95	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:dgns...	nmdc:bsm-11-32ak7k14	M horizon	USA: Colorado, Niwot Ridge

	data_object_id	data_object_type	url	soil_horizon	geo_loc_name
0	nmdc:dobj-11-nem7e417	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	O horizon	USA: Colorado, Niwot Ridge
1	nmdc:dobj-11-58rpty77	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	M horizon	USA: Colorado, North Sterling
2	nmdc:dobj-11-c1xp4c62	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	O horizon	USA: Colorado, Niwot Ridge
3	nmdc:dobj-11-b6yhf780	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	M horizon	USA: Colorado, North Sterling
4	nmdc:dobj-11-ezt80896	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	M horizon	USA: Colorado, North Sterling
5	nmdc:dobj-11-1h6mx015	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	M horizon	USA: Colorado, North Sterling
6	nmdc:dobj-11-dkz5e809	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:dgns...	M horizon	USA: Colorado, North Sterling
7	nmdc:dobj-11-re3a2h35	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	M horizon	USA: Colorado, Central Plains Experimental Range
8	nmdc:dobj-11-s7dphe48	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	O horizon	USA: Colorado, Rocky Mountains
9	nmdc:dobj-11-zhv2ak95	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:dgns...	M horizon	USA: Colorado, Niwot Ridge

	data_object_id	soil_horizon	data_object_type	url	geo_loc_name
0	nmdc:dobj-11-awzdgz18	M horizon	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	USA: Colorado, Central Plains Experimental Range
1	nmdc:dobj-11-f77hjz36	M horizon	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	USA: Colorado, Central Plains Experimental Range
2	nmdc:dobj-11-w0dh2350	M horizon	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	USA: Colorado, Rocky Mountains
3	nmdc:dobj-11-s9k3d639	M horizon	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	USA: Colorado, Central Plains Experimental Range
4	nmdc:dobj-11-1apwza69	M horizon	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	USA: Colorado, Central Plains Experimental Range
5	nmdc:dobj-11-xe3ayr72	M horizon	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	USA: Colorado, North Sterling
6	nmdc:dobj-11-d5qqp631	M horizon	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:dgns...	USA: Colorado, Central Plains Experimental Range
7	nmdc:dobj-11-gy3dpy67	M horizon	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	USA: Colorado, Central Plains Experimental Range
8	nmdc:dobj-11-xepjq837	M horizon	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	USA: Colorado, Central Plains Experimental Range
9	nmdc:dobj-11-wh0g6g42	M horizon	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	USA: Colorado, Niwot Ridge
10	nmdc:dobj-11-nq8g1d37	M horizon	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	USA: Colorado, North Sterling
11	nmdc:dobj-11-1h6mx015	M horizon	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	USA: Colorado, North Sterling
12	nmdc:dobj-11-zhv2ak95	M horizon	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:dgns...	USA: Colorado, Niwot Ridge
13	nmdc:dobj-11-tjezch87	M horizon	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	USA: Colorado, Central Plains Experimental Range
14	nmdc:dobj-11-g87j5y46	M horizon	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	USA: Colorado, Niwot Ridge
15	nmdc:dobj-11-v1d0fe44	O horizon	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	USA: Colorado, Niwot Ridge
16	nmdc:dobj-11-gargwe62	O horizon	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	USA: Colorado, Niwot Ridge
17	nmdc:dobj-11-pe7v1f12	O horizon	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	USA: Colorado, Niwot Ridge
18	nmdc:dobj-11-wkznj445	O horizon	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	USA: Colorado, Niwot Ridge
19	nmdc:dobj-11-parcbj38	O horizon	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	USA: Colorado, Niwot Ridge
20	nmdc:dobj-11-z73a1f14	O horizon	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	USA: Colorado, Rocky Mountains
21	nmdc:dobj-11-jp45gr33	O horizon	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	USA: Colorado, Rocky Mountains
22	nmdc:dobj-11-k6xn1112	O horizon	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:dgns...	USA: Colorado, Rocky Mountains
23	nmdc:dobj-11-r9jyj090	O horizon	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:dgns...	USA: Colorado, Niwot Ridge
24	nmdc:dobj-11-nem7e417	O horizon	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	USA: Colorado, Niwot Ridge
25	nmdc:dobj-11-s7dphe48	O horizon	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	USA: Colorado, Rocky Mountains
26	nmdc:dobj-11-05tb2m57	O horizon	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:dgns...	USA: Colorado, Niwot Ridge
27	nmdc:dobj-11-0fzrkr83	O horizon	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	USA: Colorado, Niwot Ridge
28	nmdc:dobj-11-msmkjb83	O horizon	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:dgns...	USA: Colorado, Niwot Ridge
29	nmdc:dobj-11-ht5msd46	O horizon	Scaffold Lineage tsv	https://data.microbiomedata.org/data/nmdc:ompr...	USA: Colorado, Niwot Ridge

How does the taxonomic distribution of contigs differ by soil layer (mineral vs organic) in Colorado?¶

1. Get all biosamples where `soil_horizon` exists and the `geo_loc_name` has "Colorado" in the name¶

3. Get records for all DataObjects found in step 2.¶

4. Associate DataObjects back to Biosamples¶

5. Dereplicate the DataFrame based on data_object_id to ensure uniqueness¶

Show how many results have M horizon vs. O horizon¶

Randomly select a subset of these datasets for which to pull information¶

Example of what the TSV contig taxa file looks like¶

Iterate throught the TSVs to get the contig taxa information¶

Look into any errors that occurred from the TSV requests¶

Define a function to calculate abundance¶

Calculate the abundance of the O and M horizon data frames¶

Plot the taxa abundance of M vs. O horizon soil samples¶

Write a function to calculate the abundance per location¶

Calculate the abundance of the location data frames¶

Plot the taxa abundance of M and O horizon soil samples for each location¶

	contig_id	taxa	initial_count
49938	nmdc:wfmgan-11-rzkbkv30.1_052893	Archaea;Candidatus Bathyarchaeota;unclassified...	1.0
111207	nmdc:wfmgan-11-rzkbkv30.1_123666	Archaea;Candidatus Bathyarchaeota;unclassified...	1.0
158392	nmdc:wfmgan-11-rzkbkv30.1_181733	Archaea;Candidatus Bathyarchaeota;unclassified...	1.0
18540	nmdc:wfmgan-11-rzkbkv30.1_019019	Archaea;Candidatus Bathyarchaeota;unclassified...	1.0
182091	nmdc:wfmgan-11-rzkbkv30.1_211849	Archaea;Candidatus Diapherotrites;unclassified...	1.0
209047	nmdc:wfmgan-11-rzkbkv30.1_246642	Archaea;Candidatus Korarchaeota;unclassified C...	1.0
193220	nmdc:wfmgan-11-rzkbkv30.1_226145	Archaea;Candidatus Korarchaeota;unclassified C...	1.0
110323	nmdc:wfmgan-11-rzkbkv30.1_122598	Archaea;Candidatus Lokiarchaeota;Candidatus Lo...	1.0
204206	nmdc:wfmgan-11-rzkbkv30.1_240384	Archaea;Candidatus Micrarchaeota;unclassified ...	1.0
31485	nmdc:wfmgan-11-rzkbkv30.1_032720	Archaea;Candidatus Nanohaloarchaeota;Candidatu...	1.0

	taxa	count	relative_abundance	geo_loc_name	data_object_id	tsv_url
0	Acidimicrobiia	3131	1.435021	USA: Colorado, Central Plains Experimental Range	nmdc:dobj-11-awzdgz18	https://data.microbiomedata.org/data/nmdc:ompr...
1	Acidithiobacillia	41	0.018791	USA: Colorado, Central Plains Experimental Range	nmdc:dobj-11-awzdgz18	https://data.microbiomedata.org/data/nmdc:ompr...
2	Actinomycetes	106557	48.837913	USA: Colorado, Central Plains Experimental Range	nmdc:dobj-11-awzdgz18	https://data.microbiomedata.org/data/nmdc:ompr...
3	Agaricomycetes	17	0.007792	USA: Colorado, Central Plains Experimental Range	nmdc:dobj-11-awzdgz18	https://data.microbiomedata.org/data/nmdc:ompr...
4	Alphaproteobacteria	17620	8.075716	USA: Colorado, Central Plains Experimental Range	nmdc:dobj-11-awzdgz18	https://data.microbiomedata.org/data/nmdc:ompr...
...	...	...	...	...	...	...
280	unclassified Zoopagomycota	10	0.000262	USA: Colorado, Niwot Ridge	nmdc:dobj-11-g87j5y46	https://data.microbiomedata.org/data/nmdc:ompr...
281	unclassified candidate division NC10	3522	0.092359	USA: Colorado, Niwot Ridge	nmdc:dobj-11-g87j5y46	https://data.microbiomedata.org/data/nmdc:ompr...
282	unclassified candidate division Zixibacteria	495	0.012981	USA: Colorado, Niwot Ridge	nmdc:dobj-11-g87j5y46	https://data.microbiomedata.org/data/nmdc:ompr...
283	unclassified dsDNA viruses, no RNA stage	3	0.000079	USA: Colorado, Niwot Ridge	nmdc:dobj-11-g87j5y46	https://data.microbiomedata.org/data/nmdc:ompr...
284	unclassified viruses	2	0.000052	USA: Colorado, Niwot Ridge	nmdc:dobj-11-g87j5y46	https://data.microbiomedata.org/data/nmdc:ompr...

	taxa	avg_relative_abundance	soil_horizon
0	Acidimicrobiia	0.369256	O
1	Acidithiobacillia	0.028836	O
2	Aconoidasida	0.001893	O
3	Actinomycetes	27.290731	O
4	Actinopteri	0.001155	O
...	...	...	...
288	unclassified Zoopagomycota	0.000379	M
289	unclassified candidate division NC10	0.105021	M
290	unclassified candidate division Zixibacteria	0.015302	M
291	unclassified dsDNA viruses, no RNA stage	0.000240	M
292	unclassified viruses	0.000003	M

	geo_loc_name	taxa	avg_relative_abundance	soil_horizon	location
0	USA: Colorado, Niwot Ridge	Acidimicrobiia	0.418534	O	Niwot Ridge
1	USA: Colorado, Niwot Ridge	Acidithiobacillia	0.029035	O	Niwot Ridge
2	USA: Colorado, Niwot Ridge	Aconoidasida	0.002297	O	Niwot Ridge
3	USA: Colorado, Niwot Ridge	Actinomycetes	28.213548	O	Niwot Ridge
4	USA: Colorado, Niwot Ridge	Actinopteri	0.001028	O	Niwot Ridge
...	...	...	...	...	...
1167	USA: Colorado, Rocky Mountains	unclassified Zoopagomycota	0.000000	M	Rocky Mountains
1168	USA: Colorado, Rocky Mountains	unclassified candidate division NC10	0.131173	M	Rocky Mountains
1169	USA: Colorado, Rocky Mountains	unclassified candidate division Zixibacteria	0.038580	M	Rocky Mountains
1170	USA: Colorado, Rocky Mountains	unclassified dsDNA viruses, no RNA stage	0.000000	M	Rocky Mountains
1171	USA: Colorado, Rocky Mountains	unclassified viruses	0.000000	M	Rocky Mountains

How does the taxonomic distribution of contigs differ by soil layer (mineral vs organic) in Colorado?¶

1. Get all biosamples where soil_horizon exists and the geo_loc_name has "Colorado" in the name¶

2. Get all DataObjects that are related to the biosamples found in step 1.¶

3. Get records for all DataObjects found in step 2.¶

4. Associate DataObjects back to Biosamples¶

5. Dereplicate the DataFrame based on data_object_id to ensure uniqueness¶

Show how many results have M horizon vs. O horizon¶

Randomly select a subset of these datasets for which to pull information¶

Example of what the TSV contig taxa file looks like¶

Iterate throught the TSVs to get the contig taxa information¶

Look into any errors that occurred from the TSV requests¶

Define a function to calculate abundance¶

Calculate the abundance of the O and M horizon data frames¶

Plot the taxa abundance of M vs. O horizon soil samples¶

Write a function to calculate the abundance per location¶

Calculate the abundance of the location data frames¶

Plot the taxa abundance of M and O horizon soil samples for each location¶

1. Get all biosamples where `soil_horizon` exists and the `geo_loc_name` has "Colorado" in the name¶