# Load essential libraries
library(jsonlite, warn.conflicts=FALSE)
library(dplyr, warn.conflicts=FALSE)
library(tidyr, warn.conflicts=FALSE)
library(readr, warn.conflicts=FALSE)
library(ggplot2, warn.conflicts=FALSE)

# Load the shared helper script. The helpers called further down
# (get_all_results, get_results_by_id) are not defined in this file, so they
# presumably come from utility_functions.R.
# An empty COLAB_BACKEND_VERSION selects the local relative copy; any other
# value (presumably a Google Colab session - confirm) selects the GitHub copy.
if (Sys.getenv("COLAB_BACKEND_VERSION") == "") {
  source("../../utility_functions.R")
} else {
  # Fetch over HTTPS: code that is about to be executed should not travel over
  # an unencrypted connection.
  source("https://raw.githubusercontent.com/microbiomedata/nmdc_notebooks/refs/heads/main/utility_functions.R")
}

# Step 1: request biosamples that have a soil_horizon value and whose
# geo_loc_name raw value matches "Colorado". Only the id, soil_horizon and
# geo_loc_name fields are requested.
biosample_df <- get_all_results(
  collection = "biosample_set",
  filter_text = '{"soil_horizon":{"$exists": true}, "geo_loc_name.has_raw_value": {"$regex": "Colorado"}}',
  max_page_size = 100,
  fields = "id,soil_horizon,geo_loc_name"
)

# Flatten the nested geo_loc_name column (sub-fields get a "geo_loc_name_"
# prefix), give the id and location columns descriptive names, and drop
# duplicated rows.
biosample_df <- unnest(biosample_df, cols = geo_loc_name, names_sep = "_")
biosample_df <- rename(
  biosample_df,
  biosample_id = id,
  geo_loc_name = geo_loc_name_has_raw_value
)
biosample_df <- distinct(biosample_df)
head(biosample_df)

# Step 2: look up material-processing records whose has_input is one of the
# biosample ids found above.
pooling_df <- get_results_by_id(
  collection = "material_processing_set",
  match_id_field = "has_input",
  id_list = biosample_df$biosample_id,
  fields = "id,has_input,has_output",
  max_page_size = 20
)

# Expand the has_input / has_output list columns to one row per value, remove
# duplicates, rename for joining, and keep only records whose id contains
# the literal text "poolp".
pooling_df2 <- unnest(pooling_df, cols = c(has_input, has_output), names_sep = "_")
pooling_df2 <- distinct(pooling_df2)
pooling_df2 <- rename(
  pooling_df2,
  pooling_id = id,
  biosample_id = has_input,
  pooling_has_output = has_output
)
pooling_df2 <- filter(pooling_df2, grepl("poolp", pooling_id, fixed = TRUE))
head(pooling_df2)

# Attach the pooling records to the biosamples and discard biosamples that
# have no pooling record.
biosample_df2 <- biosample_df %>%
  left_join(pooling_df2, by = "biosample_id") %>%
  filter(!is.na(pooling_id))
head(biosample_df2)

# The pooling output is the processed sample used as input to the next step.
biosample_df3 <- rename(biosample_df2, processed_sample_id = pooling_has_output)
head(biosample_df3)

# Step 3: look up material-processing records whose has_input is one of the
# processed samples produced by pooling.
extraction_df <- get_results_by_id(
  collection = "material_processing_set",
  match_id_field = "has_input",
  id_list = unique(biosample_df3$processed_sample_id),
  fields = "id,has_input,has_output",
  max_page_size = 20
)

# One row per input/output value, de-duplicated, with join-friendly names.
extraction_df <- unnest(extraction_df, cols = c(has_input, has_output), names_sep = "_")
extraction_df <- distinct(extraction_df)
extraction_df <- rename(
  extraction_df,
  extraction_id = id,
  processed_sample_id = has_input,
  extraction_has_output = has_output
)
head(extraction_df)

# Carry the extraction records along with each biosample row.
biosample_df4 <- left_join(biosample_df3, extraction_df, by = join_by(processed_sample_id))
head(biosample_df4)

# The extraction output becomes the input id for the following lookup.
biosample_df5 <- rename(biosample_df4, processed_sample_id2 = extraction_has_output)
head(biosample_df5)

# Step 4: look up material-processing records whose has_input is one of the
# extraction outputs.
library_prep_df <- get_results_by_id(
  collection = "material_processing_set",
  match_id_field = "has_input",
  id_list = unique(biosample_df5$processed_sample_id2),
  fields = "id,has_input,has_output",
  max_page_size = 20
)

# One row per input/output value, de-duplicated, with join-friendly names.
library_prep_df <- unnest(library_prep_df, cols = c(has_input, has_output), names_sep = "_")
library_prep_df <- distinct(library_prep_df)
library_prep_df <- rename(
  library_prep_df,
  library_preparation_id = id,
  processed_sample_id2 = has_input,
  library_preparation_has_output = has_output
)
head(library_prep_df)

# Carry the library-preparation records along with each biosample row.
biosample_df6 <- left_join(biosample_df5, library_prep_df, by = join_by(processed_sample_id2))
head(biosample_df6)

# The library-preparation output becomes the input id for the data-generation
# lookup.
biosample_df7 <- rename(biosample_df6, processed_sample_id3 = library_preparation_has_output)
head(biosample_df7)

# Step 5: look up data-generation records whose has_input is one of the
# library-preparation outputs.
data_generation_df <- get_results_by_id(
  collection = "data_generation_set",
  match_id_field = "has_input",
  id_list = unique(biosample_df7$processed_sample_id3),
  fields = "id,has_input",
  max_page_size = 20
)

# One row per has_input value, with join-friendly names.
data_generation_df <- unnest(data_generation_df, cols = c(has_input), names_sep = "_")
data_generation_df <- rename(
  data_generation_df,
  data_generation_id = id,
  processed_sample_id3 = has_input
)
head(data_generation_df)

# Keep only the biosample rows that reached a data-generation record.
biosample_df8 <- biosample_df7 %>%
  left_join(data_generation_df, by = join_by(processed_sample_id3)) %>%
  filter(!is.na(data_generation_id))
head(biosample_df8)

# Step 6: fetch workflow executions that were informed by the data-generation
# records found above.
metagenome_annotation_df <- get_results_by_id(
  collection = "workflow_execution_set",
  match_id_field = "was_informed_by",
  id_list = unique(biosample_df8$data_generation_id),
  fields = "id,was_informed_by,has_output,type,version",
  max_page_size = 20
)

head(metagenome_annotation_df)

# Show which workflow types were returned before narrowing to one of them.
unique(metagenome_annotation_df$type)

# Keep only the metagenome annotation workflows, expand the list columns to one
# row per (workflow, output data object), and rename for joining.
# The output column was previously misspelled "matagenome_..."; it is spelled
# correctly here and in every later use inside this block.
metagenome_annotation_df <- metagenome_annotation_df %>%
  filter(type == "nmdc:MetagenomeAnnotation") %>%
  unnest(cols = c(was_informed_by, has_output), names_sep = "_") %>%
  rename(
    metagenome_annotation_id = id,
    data_generation_id = was_informed_by,
    metagenome_annotation_has_output = has_output,
    workflow_type = type
  ) %>%
  distinct()
head(metagenome_annotation_df)
nrow(metagenome_annotation_df)

# Attach the annotation outputs to the biosample rows. The join is declared
# many-to-many, so a data_generation_id may repeat on both sides.
biosample_df9 <- biosample_df8 %>%
  left_join(
    metagenome_annotation_df,
    by = join_by(data_generation_id),
    relationship = "many-to-many"
  ) %>%
  distinct() %>%
  filter(!is.na(metagenome_annotation_id))
head(biosample_df9)

# Step 7: fetch the data objects produced by the annotation workflows.
data_object_df <- get_results_by_id(
  collection = "data_object_set",
  match_id_field = "id",
  id_list = unique(biosample_df9$metagenome_annotation_has_output),
  fields = "id,data_object_type,url",
  max_page_size = 50
)

# Filter the data object results to only include the Scaffold Lineage tsv files
data_object_df <- data_object_df %>%
  rename(data_object_id = id) %>%
  filter(data_object_type == "Scaffold Lineage tsv")
head(data_object_df)

# Join the lineage files onto the biosample rows; rows whose output object is
# not a Scaffold Lineage tsv get no url from the join and are dropped.
biosample_df10 <- biosample_df9 %>%
  rename(data_object_id = metagenome_annotation_has_output) %>%
  left_join(data_object_df, by = join_by(data_object_id)) %>%
  distinct() %>%
  filter(!is.na(url))
head(biosample_df10)

# Reduce the joined table to the columns needed for the analysis, one row per
# unique combination, keeping only rows that have a url and that come from a
# single workflow version.
biosample_df_final <- biosample_df10 %>%
  select(
    biosample_id, soil_horizon, geo_loc_name,
    data_object_id, data_object_type, url, version
  ) %>%
  distinct() %>%
  filter(!is.na(url), version == "v1.0.4")
head(biosample_df_final)

# Number of rows per soil horizon.
count(biosample_df_final, soil_horizon)

# Use the first lineage file as an example of the raw TSV layout.
url <- biosample_df_final$url[1]

# The file is read without a header row, so the three columns are named by
# hand afterwards.
contig_taxa_df <- read_tsv(url, col_names = FALSE, show_col_types = FALSE)
names(contig_taxa_df) <- c("contig_id", "taxa", "initial_count")

# Preview the first rows.
head(contig_taxa_df)

# Randomly pick up to 15 distinct lineage files per soil horizon. The fixed
# seed makes the selection repeatable.
set.seed(413)
urls <- biosample_df_final %>%
  distinct(url, .keep_all = TRUE) %>%
  group_by(soil_horizon) %>%
  slice_sample(n = 15) %>%
  ungroup() %>%
  pull(url)

# To process every file instead of a sample:
#urls <- unique(biosample_df_final$url)

# Reduce ';'-separated lineage strings to one label per contig.
# The third field is used when present; otherwise the label falls back to
# "Unknown <second field>", and, when that is missing too, to
# "Unknown <first field>". (In the example lineage file the first three fields
# look like domain, phylum and class.)
label_contig_taxa <- function(lineage) {
  ranks <- strsplit(lineage, ";")
  # vapply guarantees one character value per contig, even for short lineages.
  rank_at <- function(k) vapply(ranks, function(x) x[k], character(1))

  label <- rank_at(3)
  third_missing <- is.na(label)
  label[third_missing] <- paste("Unknown", rank_at(2)[third_missing])
  second_missing <- label == "Unknown NA"
  label[second_missing] <- paste("Unknown", rank_at(1)[second_missing])
  label
}

# Download one lineage TSV and return a tibble with one row per taxon label:
# the number of contigs, their share of all contigs in the file, and the
# source url (used later to join back to the biosample metadata).
summarise_contig_taxa <- function(url) {
  lineage_df <- read_tsv(url, col_names = FALSE, show_col_types = FALSE)
  colnames(lineage_df) <- c("contig_id", "taxa", "initial_count")
  lineage_df$taxa <- label_contig_taxa(lineage_df$taxa)

  taxa_summary <- lineage_df %>%
    count(taxa, name = "count") %>%
    mutate(relative_abundance = count / sum(count))
  taxa_summary$url <- url
  taxa_summary
}

# Preallocate one slot per url; slots of failed downloads stay NULL and are
# skipped by bind_rows() below.
results_list <- vector("list", length(urls))
# Conditions raised while processing a file, keyed by url.
error_dict <- list()

for (i in seq_along(urls)) {
  # Print the progress every 10 files
  if (i %% 10 == 0) {
    print(paste('Processing', i, 'of', length(urls)))
  }
  url <- urls[i]

  # The handler returns the condition so it can be stored here at top level;
  # assigning to error_dict inside the handler would only change a local copy.
  outcome <- tryCatch(summarise_contig_taxa(url), error = function(e) e)
  if (inherits(outcome, "error")) {
    error_dict[[url]] <- outcome
    message("Skipping ", url, ": ", conditionMessage(outcome))
  } else {
    results_list[[i]] <- outcome
  }
}

if (length(error_dict) > 0) {
  message(length(error_dict), " of ", length(urls),
          " files could not be processed; see error_dict for details")
}

# Combine results into single dataframe
contig_df <- bind_rows(results_list)

head(contig_df)

[1] "Processing 10 of 30"
[1] "Processing 20 of 30"
[1] "Processing 30 of 30"

# Look-up table from lineage file url to soil horizon and location.
horizon_by_url <- distinct(select(biosample_df_final, soil_horizon, geo_loc_name, url))

# Attach horizon and location to every per-file taxon row.
biosample_taxa_df <- right_join(horizon_by_url, contig_df, by = join_by(url))

# Spread to one column per taxon so that taxa not seen in a file show up as
# missing values, then set those missing values to zero.
biosample_taxa_df_wide <- pivot_wider(
  biosample_taxa_df,
  id_cols = c(url, soil_horizon, geo_loc_name),
  names_from = taxa,
  values_from = relative_abundance
)
biosample_taxa_df_wide <- replace(biosample_taxa_df_wide, is.na(biosample_taxa_df_wide), 0)

# Back to long form: one row per (file, taxon), now including the zeros.
biosample_taxa_df <- pivot_longer(
  biosample_taxa_df_wide,
  cols = -c(url, soil_horizon, geo_loc_name),
  names_to = "taxa",
  values_to = "relative_abundance"
)

# Silence the grouping message summarise() prints for multi-column groupings.
options(dplyr.summarise.inform = FALSE)

# Mean relative abundance of every taxon within each soil horizon, with taxa
# turned into a factor and everything outside the first 15 levels lumped into
# "Other".
# NOTE(review): summarise() drops only the last grouping column, so the two
# mutate() calls still run grouped by soil_horizon - confirm that the resulting
# level order (and therefore the choice of the 15 kept taxa) is the intended one.
horizon_taxa <- biosample_taxa_df %>%
  group_by(soil_horizon, taxa) %>%
  summarise(mean_relative_abundance = mean(relative_abundance)) %>%
  arrange(mean_relative_abundance) %>%
  mutate(taxa = factor(taxa, levels = rev(unique(taxa)))) %>%
  # head() instead of [1:15] avoids NA entries when fewer than 15 taxa exist.
  mutate(taxa_lump = forcats::fct_other(taxa, keep = head(levels(taxa), 15), other_level = 'Other'))

# Colour palette with 16 entries: 8 from Set1 and 7 from Set3 for the kept
# taxa, followed by grey for the lumped "Other" level.
color_pal <- c(RColorBrewer::brewer.pal(8, 'Set1'), RColorBrewer::brewer.pal(7, 'Set3'), 'grey')

# Stacked bars of mean relative abundance per soil horizon.
g <- ggplot(horizon_taxa, aes(x = soil_horizon, y = mean_relative_abundance, fill = taxa_lump)) +
  geom_bar(stat = 'identity', color = NA) +
  labs(title = 'Taxa abundance of M and O horizon soil samples',
       x = 'Soil Horizon', y = 'Mean relative abundance', fill = NULL) +
  scale_fill_manual(values = color_pal) +
  # theme_minimal() is a complete theme and resets earlier theme() tweaks, so
  # the axis-text adjustment has to come after it.
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
options(repr.plot.width = 7, repr.plot.height = 7, repr.plot.res = 250)
g

# Mean relative abundance of every taxon per location and soil horizon.
geo_taxa <- biosample_taxa_df %>%
  group_by(geo_loc_name, soil_horizon, taxa) %>%
  summarise(mean_relative_abundance = mean(relative_abundance)) %>%
  arrange(mean_relative_abundance)

# The three mutate() steps are kept as separate calls on purpose: each one
# works on the complete result of the previous one.
# 1. turn taxa into a factor,
# 2. lump everything outside the first 15 levels into "Other",
# 3. shorten the horizon labels to "M" / "O" for the x axis.
geo_taxa <- mutate(geo_taxa, taxa = factor(taxa, levels = rev(unique(taxa))))
geo_taxa <- mutate(
  geo_taxa,
  taxa_lump = forcats::fct_other(taxa, keep = levels(taxa)[1:15], other_level = 'Other')
)
geo_taxa <- mutate(
  geo_taxa,
  soil_horizon = factor(soil_horizon, levels = c('M horizon', 'O horizon'), labels = c('M', 'O'))
)

# Stacked bars per soil horizon, one facet per location in a single row, with
# long location names wrapped and the legend placed underneath.
g <- ggplot(
  geo_taxa,
  aes(x = soil_horizon, y = mean_relative_abundance, fill = taxa_lump)
) +
  geom_bar(stat = 'identity', color = NA) +
  facet_wrap(
    ~geo_loc_name,
    nrow = 1,
    labeller = label_wrap_gen(width = 20, multi_line = TRUE)
  ) +
  labs(
    title = 'Taxa abundance of M and O horizon soil samples for each location',
    x = 'Soil Horizon',
    y = 'Mean relative abundance',
    fill = NULL
  ) +
  scale_fill_manual(values = color_pal) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    legend.position = "bottom"
  )
options(repr.plot.width = 9, repr.plot.height = 7, repr.plot.res = 250)
g

biosample_id	soil_horizon	geo_loc_name	geo_loc_name_type
<chr>	<chr>	<chr>	<chr>
nmdc:bsm-11-00m15h97	M horizon	USA: Colorado, Central Plains Experimental Range	nmdc:TextValue
nmdc:bsm-11-06ta8e31	M horizon	USA: Colorado, Central Plains Experimental Range	nmdc:TextValue
nmdc:bsm-11-06tgpb52	O horizon	USA: Colorado, Rocky Mountains	nmdc:TextValue
nmdc:bsm-11-0asn5d63	M horizon	USA: Colorado, Central Plains Experimental Range	nmdc:TextValue
nmdc:bsm-11-0djp2e45	M horizon	USA: Colorado, North Sterling	nmdc:TextValue
nmdc:bsm-11-0f43ab20	M horizon	USA: Colorado, Central Plains Experimental Range	nmdc:TextValue

biosample_id	pooling_has_output	pooling_id
<chr>	<chr>	<chr>
nmdc:bsm-11-ex491068	nmdc:procsm-11-kngzyt90	nmdc:poolp-11-sj9jpg87
nmdc:bsm-11-1byjjh32	nmdc:procsm-11-kngzyt90	nmdc:poolp-11-sj9jpg87
nmdc:bsm-11-da5wpm57	nmdc:procsm-11-kngzyt90	nmdc:poolp-11-sj9jpg87
nmdc:bsm-11-71vqzv35	nmdc:procsm-11-mr5hf033	nmdc:poolp-11-ay38nw70
nmdc:bsm-11-0f43ab20	nmdc:procsm-11-mr5hf033	nmdc:poolp-11-ay38nw70
nmdc:bsm-11-a5z5pe39	nmdc:procsm-11-mr5hf033	nmdc:poolp-11-ay38nw70

biosample_id	soil_horizon	geo_loc_name	geo_loc_name_type	pooling_has_output	pooling_id
<chr>	<chr>	<chr>	<chr>	<chr>	<chr>
nmdc:bsm-11-00m15h97	M horizon	USA: Colorado, Central Plains Experimental Range	nmdc:TextValue	nmdc:procsm-11-ytthx235	nmdc:poolp-11-gxv2dy50
nmdc:bsm-11-06ta8e31	M horizon	USA: Colorado, Central Plains Experimental Range	nmdc:TextValue	nmdc:procsm-11-5s07gt34	nmdc:poolp-11-5e2asm75
nmdc:bsm-11-06tgpb52	O horizon	USA: Colorado, Rocky Mountains	nmdc:TextValue	nmdc:procsm-11-ez7edj21	nmdc:poolp-11-qq41ss20
nmdc:bsm-11-0asn5d63	M horizon	USA: Colorado, Central Plains Experimental Range	nmdc:TextValue	nmdc:procsm-11-y8w3sk61	nmdc:poolp-11-pak1ws91
nmdc:bsm-11-0djp2e45	M horizon	USA: Colorado, North Sterling	nmdc:TextValue	nmdc:procsm-11-258vbz70	nmdc:poolp-11-vfkwpy98
nmdc:bsm-11-0f43ab20	M horizon	USA: Colorado, Central Plains Experimental Range	nmdc:TextValue	nmdc:procsm-11-mr5hf033	nmdc:poolp-11-ay38nw70

biosample_id	soil_horizon	geo_loc_name	geo_loc_name_type	processed_sample_id	pooling_id
<chr>	<chr>	<chr>	<chr>	<chr>	<chr>
nmdc:bsm-11-00m15h97	M horizon	USA: Colorado, Central Plains Experimental Range	nmdc:TextValue	nmdc:procsm-11-ytthx235	nmdc:poolp-11-gxv2dy50
nmdc:bsm-11-06ta8e31	M horizon	USA: Colorado, Central Plains Experimental Range	nmdc:TextValue	nmdc:procsm-11-5s07gt34	nmdc:poolp-11-5e2asm75
nmdc:bsm-11-06tgpb52	O horizon	USA: Colorado, Rocky Mountains	nmdc:TextValue	nmdc:procsm-11-ez7edj21	nmdc:poolp-11-qq41ss20
nmdc:bsm-11-0asn5d63	M horizon	USA: Colorado, Central Plains Experimental Range	nmdc:TextValue	nmdc:procsm-11-y8w3sk61	nmdc:poolp-11-pak1ws91
nmdc:bsm-11-0djp2e45	M horizon	USA: Colorado, North Sterling	nmdc:TextValue	nmdc:procsm-11-258vbz70	nmdc:poolp-11-vfkwpy98
nmdc:bsm-11-0f43ab20	M horizon	USA: Colorado, Central Plains Experimental Range	nmdc:TextValue	nmdc:procsm-11-mr5hf033	nmdc:poolp-11-ay38nw70

processed_sample_id	extraction_has_output	extraction_id
<chr>	<chr>	<chr>
nmdc:procsm-11-49bwy122	nmdc:procsm-11-kwaaah42	nmdc:extrp-11-fsv8td81
nmdc:procsm-11-kngzyt90	nmdc:procsm-11-h9s7h174	nmdc:extrp-11-v25scb12
nmdc:procsm-11-mr5hf033	nmdc:procsm-11-7qy2y664	nmdc:extrp-11-gnvf5s35
nmdc:procsm-11-33n4p085	nmdc:procsm-11-6xc6vy98	nmdc:extrp-11-j5qc7973
nmdc:procsm-11-2fxf0e98	nmdc:procsm-11-x763xr38	nmdc:extrp-11-y5ewyv43
nmdc:procsm-11-y8w3sk61	nmdc:procsm-11-q086v208	nmdc:extrp-11-9qd5ke92

How does the taxonomic distribution of contigs differ by soil layer (mineral vs organic) in Colorado?¶

1. Get all biosamples where soil_horizon exists and the geo_loc_name has "Colorado" in the name¶

2. Get all Pooling results where the Pooling `has_input` are the biosample ids¶

3. Get `Extraction` records where `processed_sample_id` identifier is the `has_input` to the `Extraction`¶

4. Get the `LibraryPreparation` records¶

5. Get `NucleotideSequencing` records from the processed sample identifiers¶

6. Get the `MetagenomeAnnotation` ids using the `DataGeneration` identifiers¶

7. Get data objects from the metagenome activity result outputs¶

Clean up the combined results¶

Show how many results have M horizon vs. O horizon¶

Example of what the TSV contig taxa file looks like¶

Randomly select a subset of samples for which to download taxonomy files¶

Iterate through the TSVs to get the contig taxa information¶

Clean up the relative abundance data to fill in NAs with 0 for unobserved taxa¶

Plot the average taxa abundance for all M and O horizon soil samples¶

Plot the taxa abundance of M and O horizon soil samples for each location¶

processed_sample_id3	data_generation_id
<chr>	<chr>
nmdc:procsm-11-3s5m9a70	nmdc:omprc-11-2937gz63
nmdc:procsm-11-43n6yz70	nmdc:omprc-11-g1n61y55
nmdc:procsm-11-44e5ds31	nmdc:dgns-11-ekte1238
nmdc:procsm-11-4jj6k690	nmdc:omprc-11-yt96hb84
nmdc:procsm-11-4z512838	nmdc:omprc-11-afejca38
nmdc:procsm-11-7cpyc435	nmdc:omprc-11-by9r5p41

	id	type	has_output	was_informed_by	version
	<chr>	<chr>	<list>	<list>	<chr>
1	nmdc:wfmag-11-00y2a531.1	nmdc:MagsAnalysis	nmdc:dobj-11-6zjxax41, nmdc:dobj-11-v6gbd993, nmdc:dobj-11-2b8tp616, nmdc:dobj-11-5d17py07, nmdc:dobj-11-zjd78797, nmdc:dobj-11-75q2x318, nmdc:dobj-11-614npz60, nmdc:dobj-11-0w7eb916, nmdc:dobj-11-9fgk4p31	nmdc:omprc-11-htac6662	v1.3.16
2	nmdc:wfmag-11-2araz898.1	nmdc:MagsAnalysis	nmdc:dobj-11-qcjne029, nmdc:dobj-11-v6v4az06, nmdc:dobj-11-99ze1q07, nmdc:dobj-11-82mxmg53, nmdc:dobj-11-tdk66274, nmdc:dobj-11-x0k49271, nmdc:dobj-11-hv9we043, nmdc:dobj-11-hbmfj707, nmdc:dobj-11-970hbm36	nmdc:omprc-11-63ajbd04	v1.3.16
3	nmdc:wfmag-11-3pdsac32.1	nmdc:MagsAnalysis	nmdc:dobj-11-bjarz597, nmdc:dobj-11-5sa1hc37, nmdc:dobj-11-ypaebx18, nmdc:dobj-11-f4v00f74, nmdc:dobj-11-pgtn0f33, nmdc:dobj-11-ftexk841, nmdc:dobj-11-v80h9572, nmdc:dobj-11-rcrgkr35, nmdc:dobj-11-13a4c730	nmdc:dgns-11-j3w06312	v1.3.16
4	nmdc:wfmag-11-6beb5p13.1	nmdc:MagsAnalysis	nmdc:dobj-11-te4k2925, nmdc:dobj-11-7n3mbm30, nmdc:dobj-11-804v6747, nmdc:dobj-11-rfr5yn37, nmdc:dobj-11-7mb5fc28, nmdc:dobj-11-9tmx4w60, nmdc:dobj-11-6b2ytn94, nmdc:dobj-11-meg6gk11, nmdc:dobj-11-m78rp150	nmdc:dgns-11-jxh3ht55	v1.3.16
5	nmdc:wfmag-11-7628dd79.1	nmdc:MagsAnalysis	nmdc:dobj-11-y1es6k19, nmdc:dobj-11-2542ex33, nmdc:dobj-11-8zz56n07, nmdc:dobj-11-av2mf726, nmdc:dobj-11-8bega838, nmdc:dobj-11-t551g482, nmdc:dobj-11-vkd6b435, nmdc:dobj-11-5zkkqt82, nmdc:dobj-11-rrn00m11	nmdc:dgns-11-ekte1238	v1.3.16
6	nmdc:wfmag-11-7w0dea36.1	nmdc:MagsAnalysis	nmdc:dobj-11-72x3pe97, nmdc:dobj-11-1357qx92, nmdc:dobj-11-vqbe9z28, nmdc:dobj-11-pzxbjr81, nmdc:dobj-11-md40ag65, nmdc:dobj-11-a259cq41, nmdc:dobj-11-52va9x46, nmdc:dobj-11-jv4sz961, nmdc:dobj-11-gf8d1x07	nmdc:omprc-11-sz2d4412	v1.3.16

contig_id	taxa	initial_count
<chr>	<chr>	<dbl>
nmdc:wfmgas-11-qdbye406.1_scf_10000_c1	Bacteria;Pseudomonadota;Alphaproteobacteria;Hyphomicrobiales;Nitrobacteraceae;Bradyrhizobium;Bradyrhizobium sp. KBS0725;Bradyrhizobium sp. KBS0725	1
nmdc:wfmgas-11-qdbye406.1_scf_10001_c1	Bacteria;Pseudomonadota;Alphaproteobacteria;Hyphomicrobiales;Propylenellaceae;Propylenella;Propylenella binzhouense;Propylenella binzhouense L72	1
nmdc:wfmgas-11-qdbye406.1_scf_10002_c1	Bacteria;Pseudomonadota;Alphaproteobacteria;Hyphomicrobiales;Nitrobacteraceae;Tardiphaga;Tardiphaga robiniae;Tardiphaga robiniae 1155	1
nmdc:wfmgas-11-qdbye406.1_scf_10003_c1	Bacteria;Pseudomonadota;Alphaproteobacteria;Hyphomicrobiales;Nitrobacteraceae;Bradyrhizobium;Bradyrhizobium lablabi;Bradyrhizobium lablabi GAS165	1
nmdc:wfmgas-11-qdbye406.1_scf_10004_c1	Bacteria;Pseudomonadota;Alphaproteobacteria;Hyphomicrobiales;Nitrobacteraceae;Bradyrhizobium;Bradyrhizobium sp. SRL28;Bradyrhizobium sp. SRL28	1
nmdc:wfmgas-11-qdbye406.1_scf_10005_c1	Bacteria;Pseudomonadota;Alphaproteobacteria;Hyphomicrobiales;Nitrobacteraceae;Bradyrhizobium;Bradyrhizobium sp. AUGA SZCCT0283;Bradyrhizobium sp. AUGA SZCCT0283	1

taxa	count	relative_abundance	url
<chr>	<int>	<dbl>	<chr>
Acidimicrobiia	45	0.0024050024	https://data.microbiomedata.org/data/nmdc:omprc-11-qxq15411/nmdc:wfmgan-11-9v130b74.1/nmdc_wfmgan-11-9v130b74.1_scaffold_lineage.tsv
Acidithiobacillia	10	0.0005344450	https://data.microbiomedata.org/data/nmdc:omprc-11-qxq15411/nmdc:wfmgan-11-9v130b74.1/nmdc_wfmgan-11-9v130b74.1_scaffold_lineage.tsv
Actinomycetes	3057	0.1633798300	https://data.microbiomedata.org/data/nmdc:omprc-11-qxq15411/nmdc:wfmgan-11-9v130b74.1/nmdc_wfmgan-11-9v130b74.1_scaffold_lineage.tsv
Agaricomycetes	7	0.0003741115	https://data.microbiomedata.org/data/nmdc:omprc-11-qxq15411/nmdc:wfmgan-11-9v130b74.1/nmdc_wfmgan-11-9v130b74.1_scaffold_lineage.tsv
Alphaproteobacteria	1821	0.0973224307	https://data.microbiomedata.org/data/nmdc:omprc-11-qxq15411/nmdc:wfmgan-11-9v130b74.1/nmdc_wfmgan-11-9v130b74.1_scaffold_lineage.tsv
Anaerolineae	23	0.0012292235	https://data.microbiomedata.org/data/nmdc:omprc-11-qxq15411/nmdc:wfmgan-11-9v130b74.1/nmdc_wfmgan-11-9v130b74.1_scaffold_lineage.tsv

A tibble: 6 × 5
metagenome_annotation_id	workflow_type	matagenome_annotation_has_output	data_generation_id	version
<chr>	<chr>	<chr>	<chr>	<chr>
nmdc:wfmgan-11-05cdqw41.1	nmdc:MetagenomeAnnotation	nmdc:dobj-11-ndsyd761	nmdc:dgns-11-ekte1238	v1.1.5
nmdc:wfmgan-11-05cdqw41.1	nmdc:MetagenomeAnnotation	nmdc:dobj-11-ss1k0e30	nmdc:dgns-11-ekte1238	v1.1.5
nmdc:wfmgan-11-05cdqw41.1	nmdc:MetagenomeAnnotation	nmdc:dobj-11-m5g68x38	nmdc:dgns-11-ekte1238	v1.1.5
nmdc:wfmgan-11-05cdqw41.1	nmdc:MetagenomeAnnotation	nmdc:dobj-11-22449330	nmdc:dgns-11-ekte1238	v1.1.5
nmdc:wfmgan-11-05cdqw41.1	nmdc:MetagenomeAnnotation	nmdc:dobj-11-t33sqd79	nmdc:dgns-11-ekte1238	v1.1.5
nmdc:wfmgan-11-05cdqw41.1	nmdc:MetagenomeAnnotation	nmdc:dobj-11-ykw2tv02	nmdc:dgns-11-ekte1238	v1.1.5

How does the taxonomic distribution of contigs differ by soil layer (mineral vs organic) in Colorado?¶

1. Get all biosamples where soil_horizon exists and the geo_loc_name has "Colorado" in the name¶

2. Get all Pooling results where the Pooling has_input are the biosample ids¶

3. Get Extraction records where processed_sample_id identifier is the has_input to the Extraction¶

4. Get the LibraryPreparation records¶

5. Get NucleotideSequencing records from the processed sample identifiers¶

6. Get the MetagenomeAnnotation ids using the DataGeneration identifiers¶

7. Get data objects from the metagenome activity result outputs¶

Clean up the combined results¶

Show how many results have M horizon vs. O horizon¶

Example of what the TSV contig taxa file looks like¶

Randomly select a subset of samples for which to download taxonomy files¶

Iterate through the TSVs to get the contig taxa information¶

Clean up the relative abundance data to fill in NAs with 0 for unobserved taxa¶

Plot the average taxa abundance for all M and O horizon soil samples¶

Plot the taxa abundance of M and O horizon soil samples for each location¶

2. Get all Pooling results where the Pooling `has_input` are the biosample ids¶

3. Get `Extraction` records where `processed_sample_id` identifier is the `has_input` to the `Extraction`¶

4. Get the `LibraryPreparation` records¶

5. Get `NucleotideSequencing` records from the processed sample identifiers¶

6. Get the `MetagenomeAnnotation` ids using the `DataGeneration` identifiers¶