# Retrieve all data objects associated with this study
dobj <- get_data_objects_for_study("nmdc:sty-11-aygzgv51")

dobj <- dobj %>%
  
  # Remove uninformative columns for simpler dataframe
  select(-c(alternative_identifiers, in_manifest, was_generated_by)) %>% 

  # Filter to only desired file types
  filter(data_object_type %in% c("Annotation Enzyme Commission", "Annotation KEGG Orthology",
                                "GC-MS Metabolomics Results", "Protein Report"))

head(dobj)

# Obtain information about the workflow that produced each data object
workflow_info <- get_results_by_id(collection = "workflow_execution_set",
                                   match_id_field = "has_output",
                                   id_list = dobj$id,
                                   fields = "id,has_output,version") %>%
                    unnest(has_output)
head(workflow_info)

dobj <- dobj %>%
    left_join(workflow_info, by = join_by(id == has_output)) %>%

    # Filter to records from the desired workflow version
    filter(data_object_type == "Annotation KEGG Orthology" |
           data_object_type == "Annotation Enzyme Commission" |
           data_object_type == "GC-MS Metabolomics Results" |
           (data_object_type == "Protein Report" & version == "v1.2.1")) %>%
    
    # Filter to biosamples with all omics types results for the
    # specified workflow version(s)
    group_by(biosample_id) %>%
    filter("Annotation Enzyme Commission" %in% data_object_type &
            "Annotation KEGG Orthology" %in% data_object_type &
            "GC-MS Metabolomics Results" %in% data_object_type &
            "Protein Report" %in% data_object_type) %>%
    ungroup()
    

# How many biosamples have all three omics types?
paste("Number of biosamples:", length(unique(dobj$biosample_id)))

# How many data objects are there for each biosample? (should be 4)
stopifnot(dobj %>%
            group_by(biosample_id) %>%
            summarize(file_count = n()) %>%
            ungroup() %>%
            pull(file_count) == 4)

results_by_biosample <- dobj %>%
  
  # Create one URL column per type
  select(biosample_id, data_object_type, url) %>%
  pivot_wider(names_from = data_object_type, values_from = url) %>%
  
  # Read in the TSV/CSV results files
  # Add in column names from the IMG genome download README
  mutate(metag_ec_results = lapply(
    .$`Annotation Enzyme Commission`, 
    function(x) { 
      d <- read_tsv(x, col_names = FALSE, show_col_types = FALSE)
      names(d) <- c("gene_id", "img_ko_flag", "EC", "percent_identity",
                    "query_start", "query_end", "subj_start", "subj_end",
                    "evalue", "bit_score", "align_length")
      d
      })) %>%
  
  mutate(metag_ko_results = lapply(
    .$`Annotation KEGG Orthology`, 
    function(x) { 
      d <- read_tsv(x, col_names = FALSE, show_col_types = FALSE)
      names(d) <- c("gene_id", "img_ko_flag", "ko_term", "percent_identity",
                    "query_start", "query_end", "subj_start", "subj_end",
                    "evalue", "bit_score", "align_length")
      d
      })) %>%

  mutate(metap_results = lapply(.$`Protein Report`, read_tsv, col_names = TRUE, show_col_types = FALSE)) %>%
  mutate(metab_results = lapply(.$`GC-MS Metabolomics Results`, read_csv, col_names = TRUE, show_col_types = FALSE))

# View a snippet of the metagenome KEGG Orthology annotations file
head(results_by_biosample$metag_ko_results[[1]])

# Small wrapper function for commonly used function pair in this notebook
unlist_unique <- function (x) {
  unique(unlist(x, use.names = FALSE))
}


# Metagenome annotations - KEGG Orthology
# Save a unique vector of gene KO by sample
metag_ko_unique_list <- results_by_biosample$metag_ko_results %>%
  lapply(FUN = function(x) { sort(unique(x$ko_term)) }) %>%
  setNames(results_by_biosample$biosample_id)

# Save a unique vector of ALL gene KO for searching later
metag_ko_unique <- unlist_unique(metag_ko_unique_list)

# Metagenome annotations - Enzyme Commission
# Save a unique vector of gene EC by sample
metag_ec_unique_list <- results_by_biosample$metag_ec_results %>%
  lapply(FUN = function(x) { sort(unique(x$EC)) }) %>%
  setNames(results_by_biosample$biosample_id)

# Save a unique vector of ALL gene EC for searching later
metag_ec_unique <- unlist_unique(metag_ec_unique_list)


# Metaproteome annotations - KEGG Orthology
# Protein KO by sample
metap_ko_unique_list <- results_by_biosample$metap_results %>%
  lapply(FUN = function(x) { sort(unique(x$KO)) }) %>%
  setNames(results_by_biosample$biosample_id)

# ALL protein KO
metap_ko_unique <- unlist_unique(metap_ko_unique_list)

# Metaproteome annotations - Enzyme Commission
# Protein EC by sample
metap_ec_unique_list <- results_by_biosample$metap_results %>%
  # Protein report may have multiple EC annotations per line
  lapply(FUN = function(x) { x$EC_Number %>% strsplit(",") %>% unlist_unique() %>% sort() }) %>%
  setNames(results_by_biosample$biosample_id)

# ALL protein EC
metap_ec_unique <- unlist_unique(metap_ec_unique_list)

# Dataframe of metaproteome annotations across all samples
# Keep KO and EC results together
metap_combined_unique <- results_by_biosample$metap_results %>%
  bind_rows() %>%
  select(KO, EC_Number) %>%
  rename(ec_id = "EC_Number",
         ko_id = "KO") %>%
  # EC column may have multiple comma separated values
  # Reform dataframe to have one EC per row
  separate_wider_delim(ec_id, delim = ",", names_sep = ".", too_few = "align_start") %>%
  pivot_longer(cols = starts_with("ec_id"), names_to = NULL, values_to = "ec_id") %>%
  filter(if_any(everything(), ~ !is.na(.))) %>%
  # Filter out excess NA rows from pivot
  group_by(ko_id) %>%
  mutate(has_value = any(!is.na(ec_id))) %>%
  ungroup() %>%
  filter(!(has_value & is.na(ec_id))) %>%
  select(-has_value) %>%
  distinct()


# Metabolome annotations - KEGG Compound
# Compound IDs per sample
metab_co_unique_list <- results_by_biosample$metab_results %>%
  lapply(FUN = function(x) { sort(unique(x$`Kegg Compound ID`)) }) %>%
  setNames(results_by_biosample$biosample_id)

# ALL compound IDs
metab_co_unique <- unlist_unique(metab_co_unique_list)

# For each sample, check which metagenomic KO results also appear in the metaproteomic KO results
map2(.x = metag_ko_unique_list, .y = metap_ko_unique_list, 
     .f = ~if_else((.x %in% .y), "Has_Protein", "No_Protein")) %>%
  stack() %>%
  dplyr::rename(biosample_id = "ind", result = "values") %>%
  group_by(biosample_id, result) %>%
  summarize(count = n(), .groups = "drop_last") %>%
  ungroup() %>%
  filter(result == "Has_Protein") %>%
  arrange(-count) %>%
  mutate(biosample_id = factor(biosample_id, levels = biosample_id)) %>%

  # Plot the number of identified genes that have a corresponding protein identification by sample
  ggplot() +
    geom_bar(aes(x = biosample_id, y = count), stat = "identity", fill = "steelblue") +
    ggtitle("Metagenome identified genes with KO annotations in metaproteome results") +
    xlab("Biosample ID") + 
    ylab("Number of genes") +
    theme(axis.text.x = element_text(angle = 60, hjust = 1),
          legend.position = "none")

# Get all EC ids for each compound
ec_from_metabolites <- metab_co_unique %>%
  split(ceiling(seq_along(.)/50)) %>%
  lapply(FUN = function(x) { Sys.sleep(0.5); keggLink("enzyme", x) }) %>%
  list_c()
  
# Format into a nested dataframe
ec_from_metabolites <- data.frame(compound_id = names(ec_from_metabolites),
                                  ec_id = ec_from_metabolites) %>%
  nest(.by = compound_id, .key = "ec_id")

# Get all module ids for each compound
modules_from_metabolites <- metab_co_unique %>%
  split(ceiling(seq_along(.)/50)) %>%
  lapply(FUN = function(x) { Sys.sleep(0.5); keggLink("module", x) }) %>%
  list_c()
  
# Format into a nested dataframe
modules_from_metabolites <- data.frame(compound_id = names(modules_from_metabolites),
                                       module_id = modules_from_metabolites) %>%
  nest(.by = compound_id, .key = "module_id")


# Get all pathway ids for each compound
pathways_from_metabolites <- metab_co_unique %>%
  split(ceiling(seq_along(.)/50)) %>%
  lapply(FUN = function(x) { Sys.sleep(0.5); keggLink("pathway", x) }) %>%
  list_c()
  
# Format into a nested dataframe
pathways_from_metabolites <- data.frame(compound_id = names(pathways_from_metabolites),
                                        pathway_id = pathways_from_metabolites) %>%
  nest(.by = compound_id, .key = "pathway_id")

# Assemble
metabolite_annotations <- data.frame(compound_id = paste0("cpd:", metab_co_unique),
                                     compound_trimmed = metab_co_unique) %>%
  left_join(ec_from_metabolites, by = join_by(compound_id)) %>%
  left_join(modules_from_metabolites, by = join_by(compound_id)) %>%
  left_join(pathways_from_metabolites, by = join_by(compound_id))

head(metabolite_annotations, 3)

# Pre-allocate an empty list for metabolite dataframes
metabolite_annotations_list <- vector(mode = "list", length = nrow(results_by_biosample))

for (biosample in 1:nrow(results_by_biosample)) {
  
  # For each sample, assemble a dataframe of metabolite information
  # Start with the unique list of metabolites for this sample
  metabolite_annotations_list[[biosample]] <- data.frame(compound_id = paste0("cpd:", metab_co_unique_list[[biosample]])) %>%
    # Filter join all metabolite annotations down to just the ones in this sample
    left_join(metabolite_annotations, by = join_by(compound_id))
}

names(metabolite_annotations_list) <- names(metab_co_unique_list)

# Cleanup intermediate objects
rm(metabolite_annotations, ec_from_metabolites, modules_from_metabolites, pathways_from_metabolites, metab_co_unique, metab_co_unique_list)

head(metabolite_annotations_list[[1]], 3)

# Use concatenated proteins vector, make all the calls once, then split up by biosample

# Get all module ids for each protein
modules_from_proteins <- metap_ko_unique %>%
  substring(4) %>%
  split(ceiling(seq_along(.)/50)) %>%
  lapply(FUN = function(x) { Sys.sleep(0.5); keggLink("module", x) }) %>%
  list_c()
  
# Format into a nested dataframe
modules_from_proteins <- data.frame(ko_id = toupper(names(modules_from_proteins)),
                                    module_id = modules_from_proteins) %>%
  nest(.by = ko_id, .key = "module_id")


# Get all pathway ids for each protein
pathways_from_proteins <- metap_ko_unique %>%
  substring(4) %>%
  split(ceiling(seq_along(.)/50)) %>%
  lapply(FUN = function(x) { Sys.sleep(0.5); keggLink("pathway", x) }) %>%
  list_c()
  
# Format into a nested dataframe
pathways_from_proteins <- data.frame(ko_id = toupper(names(pathways_from_proteins)),
                                     pathway_id = pathways_from_proteins) %>%
  nest(.by = ko_id, .key = "pathway_id")


# Assemble, starting from the dataframe of joined KO/EC annotations from the Protein Report
protein_annotations <- metap_combined_unique %>%
  left_join(modules_from_proteins, by = join_by(ko_id)) %>%
  left_join(pathways_from_proteins, by = join_by(ko_id))


# Pre-allocate an empty list for protein dataframes
protein_annotations_list <- vector(mode = "list", length = nrow(results_by_biosample))

for (biosample in 1:nrow(results_by_biosample)) {
  
  # For each sample, assemble a dataframe of protein information
  # Start with the unique list of proteins for this sample
  protein_annotations_list[[biosample]] <- data.frame(ko_id = metap_ko_unique_list[[biosample]]) %>%
    # Filter join all metabolite annotations down to just the ones in this sample
    left_join(protein_annotations, by = join_by(ko_id))
}

names(protein_annotations_list) <- names(metap_ko_unique_list)

# Cleanup intermediate objects
rm(protein_annotations, modules_from_proteins, pathways_from_proteins, metap_ec_unique_list, metap_ko_unique_list, metap_combined_unique, metap_ec_unique, metap_ko_unique)

head(protein_annotations_list[[1]])

# Use concatenated metag KO vector, make all the calls once, then split up by biosample

# Get all module ids for each gene
modules_from_genes <- metag_ko_unique %>%
  # API doesn't like the prefix in the annotated results, trim it off
  # eg. KO:K12345 becomes K12345
  substring(4) %>%
  split(ceiling(seq_along(.)/50)) %>%
  lapply(FUN = function(x) { Sys.sleep(0.5); keggLink("module", x) }) %>%
  list_c()
  
# Format into a nested dataframe
modules_from_genes <- data.frame(ko_id = toupper(names(modules_from_genes)),
                                 module_id = modules_from_genes) %>%
  nest(.by = ko_id, .key = "module_id")


# Get all pathway ids for each gene
pathways_from_genes <- metag_ko_unique %>%
  substring(4) %>%
  split(ceiling(seq_along(.)/50)) %>%
  lapply(FUN = function(x) { Sys.sleep(0.5); keggLink("pathway", x) }) %>%
  list_c()
  
# Format into a nested dataframe
pathways_from_genes <- data.frame(ko_id = toupper(names(pathways_from_genes)),
                                  pathway_id = pathways_from_genes) %>%
  nest(.by = ko_id, .key = "pathway_id")


# Assemble
gene_annotations <- data.frame(ko_id = metag_ko_unique) %>%
  left_join(modules_from_genes, by = join_by(ko_id)) %>%
  left_join(pathways_from_genes, by = join_by(ko_id))

# Pre-allocate an empty list for gene dataframes
gene_annotations_list <- vector(mode = "list", length = nrow(results_by_biosample))

for (biosample in 1:nrow(results_by_biosample)) {
  
  # Add in the EC annotations here - it will be easier since they are in a separate file per sample
  
  # For each sample, assemble a dataframe of gene information

  # Start by pairing the KO and EC information from the annotation results files
  gene_annotations_list[[biosample]] <- results_by_biosample$metag_ko_results[[biosample]] %>%
    full_join(results_by_biosample$metag_ec_results[[biosample]],
              # Join by NMDC metag workflow assigned gene ID
              by = join_by(gene_id), relationship = "many-to-many") %>%
    select(ko_term, EC) %>%
    distinct() %>%
    rename(ko_id = "ko_term",
           ec_id = "EC") %>%
    
    # Filter to only KO identifications that also have EC information
    filter(!is.na(ec_id)) %>%
    
    # Now filter join all gene annotations down to just the ones in this sample
    left_join(gene_annotations, by = join_by(ko_id))
}

names(gene_annotations_list) <- names(metag_ko_unique_list)

# Cleanup intermediate objects
rm(gene_annotations, modules_from_genes, pathways_from_genes, metag_ko_unique, metag_ko_unique_list)

# Add "is this x in y" columns for all biomolecules

for (biosample in 1:nrow(results_by_biosample)) {
  
  metabolite_annotations_list[[biosample]] <- metabolite_annotations_list[[biosample]] %>%
    
    # The EC ID column in metabolite annotations is actually a list of character vectors,
    # since each compound (row) can have multiple EC annotations
    mutate(In_Metag_Annotations = vapply(.$ec_id, function(x) { any(x$ec_id %in% tolower(gene_annotations_list[[biosample]]$ec_id)) }, FUN.VALUE = TRUE),
           In_Metap_Annotations = vapply(.$ec_id, function(x) { any(x$ec_id %in% tolower(protein_annotations_list[[biosample]]$ec_id)) }, FUN.VALUE = TRUE))
  

  protein_annotations_list[[biosample]] <- protein_annotations_list[[biosample]] %>% 
    # tolower() is needed because the KEGG output has lowercase prefixes
    mutate(In_Metab_Annotations = tolower(ec_id) %in% unlist_unique(metabolite_annotations_list[[biosample]]$ec_id),
           In_Metag_Annotations = ko_id %in% gene_annotations_list[[biosample]]$ko_id)
  
  
  gene_annotations_list[[biosample]] <- gene_annotations_list[[biosample]] %>%
    mutate(In_Metab_Annotations = tolower(ec_id) %in% unlist_unique(metabolite_annotations_list[[biosample]]$ec_id),
           In_Metap_Annotations = ko_id %in% protein_annotations_list[[biosample]]$ko_id)
}

# View an example of what these dataframes look like for a single sample
head(protein_annotations_list[[1]])

# Select an example biosample
example_gene <- gene_annotations_list[[2]]
example_prot <- protein_annotations_list[[2]]
example_metab <- metabolite_annotations_list[[2]]

# Scale sectors so that arc widths are proportional rather than absolute for better visualization
largest_sector_xlim <- max(length(unique(example_gene$ko_id)),
                           length(unique(example_prot$ko_id)),
                           length(unique(example_metab$compound_id)))

# Gene sector
# Calculate size of sector
gene_sector_xlim <- length(unique(example_gene$ko_id))

# Sum the number of genes that show up in the protein annotations, then scale
gene_sector_prot_count <- (sum(distinct(example_gene, ko_id, In_Metap_Annotations)$In_Metap_Annotations)) * (largest_sector_xlim / gene_sector_xlim)
# Sum the number of genes that show up in protein AND metabolite annotations, then scale
gene_sector_both_count <- sum((example_gene %>% group_by(ko_id) %>% summarize(in_both = any(In_Metab_Annotations) & any(In_Metap_Annotations)) %>% ungroup())$in_both) * (largest_sector_xlim / gene_sector_xlim)
# Sum the number of genes that show up in metabolite annotations, then scale
gene_sector_metab_count <-(sum(distinct(example_gene, ko_id, In_Metab_Annotations)$In_Metab_Annotations)) * (largest_sector_xlim / gene_sector_xlim)

# Prot sector
prot_sector_xlim <- length(unique(example_prot$ko_id))

prot_sector_metab_count <- sum(distinct(example_prot, ko_id, In_Metab_Annotations)$In_Metab_Annotations) * (largest_sector_xlim / prot_sector_xlim)
prot_sector_both_count <- sum((example_prot %>% group_by(ko_id) %>% summarize(in_both = any(In_Metab_Annotations) & any(In_Metag_Annotations)) %>% ungroup())$in_both) * (largest_sector_xlim / prot_sector_xlim)
prot_sector_gene_count <- sum(distinct(example_prot, ko_id, In_Metag_Annotations)$In_Metag_Annotations) * (largest_sector_xlim / prot_sector_xlim)


# Metab sector
metab_sector_xlim <- length(unique(example_metab$compound_id))

metab_sector_gene_count <- sum(distinct(example_metab, compound_id, In_Metag_Annotations)$In_Metag_Annotations) * (largest_sector_xlim / metab_sector_xlim)
metab_sector_both_count <- sum((example_metab %>% group_by(compound_id) %>% summarize(in_both = any(In_Metap_Annotations) & any(In_Metag_Annotations)) %>% ungroup())$in_both) * (largest_sector_xlim / metab_sector_xlim)
metab_sector_prot_count <- sum(distinct(example_metab, compound_id, In_Metap_Annotations)$In_Metap_Annotations) * (largest_sector_xlim / metab_sector_xlim)

# Reset sector sizes to scaled value
gene_sector_xlim <- largest_sector_xlim
prot_sector_xlim <- largest_sector_xlim
metab_sector_xlim <- largest_sector_xlim

plot_sector_names <- c("genes", "proteins", "metabolites")
 
# Assemble data
data = data.frame(
    sectors = c(rep("genes", gene_sector_xlim),
               rep("proteins", prot_sector_xlim),
               rep("metabolites", metab_sector_xlim)),
    x = c(seq(1:gene_sector_xlim),
          seq(1:prot_sector_xlim),
          seq(1:metab_sector_xlim)),
    y = c(seq(1:gene_sector_xlim),
          seq(1:prot_sector_xlim),
          seq(1:metab_sector_xlim))) %>%
  mutate(sectors = factor(sectors, levels = plot_sector_names))

# Initialize the plot with the three specified sectors
circos.initialize(factors = data$sectors, x = data$x)

# Add outside labels
circos.labels(plot_sector_names, 
              x = c(gene_sector_xlim/2, prot_sector_xlim/2, metab_sector_xlim/2), 
              labels = c(paste0("genes (", nrow(example_gene), ")"),
                         paste0("proteins (", nrow(example_prot), ")"), 
                         paste0("metabolites (", nrow(example_metab), ")")),
              side = "outside")

 
# Build and color the three sectors in the outermost ring
circos.track(factors = data$sectors, y=data$y, bg.col = c("tomato", "gold", "skyblue2") , bg.border = NA)

# # Add labels in sector instead
# circos.trackText(sectors = plot_sector_names, 
#                  x = c(gene_sector_xlim/2, prot_sector_xlim/2, metab_sector_xlim/2),
#                  y = c(gene_sector_xlim/2, prot_sector_xlim/2, metab_sector_xlim/2),
#                  labels = c(paste0("genes (", nrow(example_gene), ")"),
#                          paste0("proteins (", nrow(example_prot), ")"), 
#                          paste0("metabolites (", nrow(example_metab), ")")), 
#                  facing = "bending.inside")
 
# Add links (aka arcs or chords) between sectors
# From the gene sector to the protein sector
circos.link(
      # Create a link that starts at the beginning of the gene sector
      # It is as wide as the scaled number of gene/protein shared annotations (see above)
      "genes", c(0, gene_sector_prot_count), 
      
      # The other end of the link is in the protein sector
      # It starts at the number of protein/metabolite but NOT gene shared annotations
      # It is as wide as the scaled number of gene/protein shared annotations
      "proteins", c(prot_sector_metab_count - prot_sector_both_count, 
                    prot_sector_metab_count - prot_sector_both_count + prot_sector_gene_count), 
      col = "#ff870099")

# Repeat for proteins/metab
circos.link("proteins", c(0, prot_sector_metab_count), 
            "metabolites", c(metab_sector_gene_count - metab_sector_both_count,
                             metab_sector_gene_count - metab_sector_both_count + metab_sector_prot_count), 
            col = "#63ff0099")


# Repeat for metab/genes
circos.link("metabolites", c(0, metab_sector_gene_count), 
            "genes", c(gene_sector_prot_count - gene_sector_both_count,
                       gene_sector_prot_count - gene_sector_both_count + gene_sector_metab_count), 
            col = "#8b24cd99")

Note: The first link end is drawn out of sector 'genes'.

Note: The second link end is drawn out of sector 'proteins'.

Note: The first link end is drawn out of sector 'proteins'.

Note: The first link end is drawn out of sector 'metabolites'.

pathways_by_biosample <- vector(mode = "list", length = nrow(results_by_biosample))

for (biosample in 1:length(pathways_by_biosample)) {

  # Get unique list of pathways from all identified biomolecules
  sample_pathways <- c(gene_annotations_list[[biosample]]$pathway_id, 
                       protein_annotations_list[[biosample]]$pathway_id,
                       metabolite_annotations_list[[biosample]]$pathway_id) %>%
    unlist(use.names = FALSE) %>%
    unique()
  
  # Look at basic pathway maps, exclude KO highlighted pathway maps
  sample_pathways <- sample_pathways[!str_detect(sample_pathways, "path:ko")]
  
  # Exclude "overview" pathway maps
  sample_pathways <- sample_pathways[!str_detect(sample_pathways, "path:map01")]
  
  pathways_by_biosample[[biosample]] <- sample_pathways
  
}

# Concatenate the pathways, make all the calls once, then join to pathways_by_biosample

# Get all compounds for each pathway
compounds_from_pathways <- pathways_by_biosample %>%
  unlist(use.names = FALSE) %>%
  unique() %>%
  split(ceiling(seq_along(.)/50)) %>%
  lapply(FUN = function(x) { keggLink("compound", x) }) %>%
  list_c()
  
# Format into a nested dataframe
compounds_from_pathways <- data.frame(pathway_id = names(compounds_from_pathways),
                                      compound_id = compounds_from_pathways) %>%
  nest(.by = pathway_id, .key = "compound_id")


# Get all KO for each pathway
ko_from_pathways <- pathways_by_biosample %>%
  unlist(use.names = FALSE) %>%
  unique() %>%
  split(ceiling(seq_along(.)/50)) %>%
  lapply(FUN = function(x) { keggLink("ko", x) }) %>%
  list_c()

# Format into a nested dataframe
ko_from_pathways <- data.frame(pathway_id = names(ko_from_pathways),
                               ko_id = ko_from_pathways) %>%
  nest(.by = pathway_id, .key = "ko_id")


# Count the elements that comprise each pathway for calculations later
all_pathway_elements <- full_join(compounds_from_pathways, ko_from_pathways, 
                                  by = join_by(pathway_id)) %>%
  mutate(compound_count = vapply(compound_id, 
                                 FUN = function(x) { ifelse(is.null(x), 0, nrow(x)) }, 
                                 FUN.VALUE = 1)) %>%
  mutate(ko_count =       vapply(ko_id, 
                                 FUN = function(x) { ifelse(is.null(x), 0, nrow(x)) }, 
                                 FUN.VALUE = 1)) %>%
  mutate(total_elements = compound_count + ko_count) %>%
  # Filter to pathways with at least one of each
  filter(compound_count > 0 & ko_count > 0)

pathway_coverage_by_biosample <- vector(mode = "list", length = nrow(results_by_biosample))

for (biosample in 1:length(pathways_by_biosample)) {
  
  all_ko_detected <- unique(c(gene_annotations_list[[biosample]]$ko_id,
                              protein_annotations_list[[biosample]]$ko_id))
  
  pathway_coverage_by_biosample[[biosample]] <- all_pathway_elements %>%
    filter(pathway_id %in% pathways_by_biosample[[biosample]]) %>%
    mutate(compound_found    = vapply(.$compound_id, 
                                      function(x) { sum(x$compound_id %in% metabolite_annotations_list[[biosample]]$compound_id) }, 
                                      FUN.VALUE = 1),
           ko_found          = vapply(.$ko_id, 
                                      function(x) { sum(toupper(x$ko_id) %in% all_ko_detected) }, 
                                      FUN.VALUE = 1),
           compound_coverage = compound_found / compound_count,
           ko_coverage       = ko_found / ko_count,
           total_coverage    = (compound_found + ko_found)/total_elements) %>%
    
    select(-c(compound_id, ko_id)) %>%
    mutate_all(~ifelse(is.nan(.), NA, .)) %>%
    
    # Filter out pathways where we only found 1 element
    filter(compound_found + ko_found > 1) %>%
    arrange(-total_coverage)
}

names(pathway_coverage_by_biosample) <- names(gene_annotations_list)

# View an example of pathway coverage df for one sample
head(pathway_coverage_by_biosample[[1]])

# Pull biosample metadata and make sure samples are in the same order as data going into plots
ordered_metadata <- get_results_by_id(collection = "biosample_set",
                    match_id_field = "id",
                    id_list = results_by_biosample$biosample_id,
                    fields = "id,samp_name,description,depth.has_numeric_value,lat_lon") %>% 
                    unnest(cols = c(depth, lat_lon), names_sep = ".") %>%
                    right_join(data.frame(id = names(pathway_coverage_by_biosample)), by = join_by(id))

head(ordered_metadata)

# Set up color vector for side grouping column
sample_color_vec <- case_when(
  str_detect(ordered_metadata$lat_lon.has_raw_value, "46\\.372") ~ "orange",
  str_detect(ordered_metadata$lat_lon.has_raw_value, "46\\.373") ~ "blue")

options(repr.plot.width = 7, repr.plot.height = 5, repr.plot.res = 250)
par(xpd = TRUE)

pathway_coverage_by_biosample %>%
  bind_rows(.id = "biosample") %>%
  select(biosample, pathway_id, ko_coverage) %>%
  pivot_wider(names_from = "pathway_id", values_from = "ko_coverage") %>%
  replace(is.na(.), 0) %>%
  column_to_rownames(var = "biosample") %>%
  as.matrix() %>%
  
  heatmap(Colv = NA, RowSideColors = sample_color_vec,
          labRow = FALSE, labCol = FALSE, scale = "none",
          main = "KEGG pathway coverage across biosamples (KO terms only)",
          xlab = "KEGG Pathways",
          ylab = "Biosamples", 
          keep.dendro = FALSE, na.rm= TRUE,
          col = hcl.colors(50))

par(cex = 0.8)

legend(x = "bottomright", legend = c("Low coverage", "High coverage"), fill = c("#440154", "#FDE725"), inset = c(0, -0.2))

legend(x = "bottomleft", legend = c("North location", "South location"), fill = c("blue", "orange"), inset = c(0, -0.2))

options(repr.plot.width = 7, repr.plot.height = 5, repr.plot.res = 250)
par(xpd = TRUE)

pathway_coverage_by_biosample %>%
  bind_rows(.id = "biosample") %>%
  select(biosample, pathway_id, compound_coverage) %>%
  pivot_wider(names_from = "pathway_id", values_from = "compound_coverage") %>%
  replace(is.na(.), 0) %>%
  column_to_rownames(var = "biosample") %>%
  as.matrix() %>%
  
  heatmap(Colv = NA, RowSideColors = sample_color_vec,
          labRow = FALSE, labCol = FALSE, scale = "none",
          main = "KEGG pathway coverage across biosamples (Compounds only)",
          xlab = "KEGG Pathways",
          ylab = "Biosamples", 
          keep.dendro = FALSE, na.rm= TRUE,
          col = hcl.colors(50))

par(cex = 0.8)

legend(x = "bottomright", legend = c("Low coverage", "High coverage"), fill = c("#440154", "#FDE725"), inset = c(0, -0.2))

legend(x = "bottomleft", legend = c("North location", "South location"), fill = c("blue", "orange"), inset = c(0, -0.2))

pathway_coverage_by_biosample %>%
  bind_rows(.id = "biosample") %>%
  select(biosample, pathway_id, total_coverage) %>%
  group_by(pathway_id) %>%
  filter(any(total_coverage > 0.5)) %>%
  mutate(average_coverage = mean(total_coverage)) %>%
  ungroup() %>%
  arrange(-average_coverage) %>%
  pull(pathway_id) %>%
  unique()

# Create better labelling for plot
top_pathways <- data.frame(
  pathway_id = c("path:map00571", "path:map00710", "path:map00020", "path:map00220", "path:map00770",
                 "path:map00250", "path:map00260", "path:map04122", "path:map00010"),
  pathway_category = c("1.7 Glycan biosynthesis and metabolism", "1.2 Energy metabolism",
                       "1.1 Carbohydrate metabolism", "1.5 Amino acid metabolism", 
                       "1.8 Metabolism of cofactors and vitamins", "1.5 Amino acid metabolism",
                       "1.5 Amino acid metabolism", "2.3 Folding, sorting, and degradation",
                       "1.1 Carbohydrate metabolism"),
  pathway_name = c("Lipoarabinomannan (LAM) biosynthesis", "Carbon fixation by Calvin cycle", 
                   "Citrate cycle (TCA cycle)", "Arginine biosynthesis", "Pantothenate and CoA biosynthesis",
                   "Alanine, aspartate and glutamate metabolism", "Glycine, serine and threonine metabolism",
                   "Sulfur relay system", "Gluconeogenesis"))


pathway_coverage_by_biosample %>%
  bind_rows(.id = "biosample") %>%
  select(biosample, pathway_id, total_coverage) %>%
  group_by(pathway_id) %>%
  filter(any(total_coverage > 0.5)) %>%
  mutate(average_coverage = mean(total_coverage)) %>%
  ungroup() %>%
  arrange(-average_coverage) %>%
  left_join(top_pathways, by = join_by(pathway_id)) %>%
  mutate(pathway_name = factor(pathway_name, levels = unique(pathway_name))) %>%
  
  ggplot() +
    geom_violin(aes(x = pathway_name, y = total_coverage, fill = pathway_category)) +
    geom_jitter(aes(x = pathway_name, y = total_coverage), shape = ".") +
    labs(fill = "Pathway Category") +
    theme(axis.text.x = element_text(angle = 40, vjust = 1, hjust=1),
          axis.title.x = element_blank(),
          legend.position = "left",
          legend.text = element_text(size = 8))

# Select a pathway ID
pathway_detail_id <- "path:map00710"

# Create a vector of KO and compound IDs for the selected pathway, using the query results from earlier
pathway_detail_elements <- c(
  all_pathway_elements[which(all_pathway_elements$pathway_id == pathway_detail_id), 2],
  all_pathway_elements[which(all_pathway_elements$pathway_id == pathway_detail_id), 3]) %>% 
  unlist(use.names = FALSE)

# Pull out pathway elements for each sample
pathway_detail_identifications <- vector(mode = "list", length = nrow(results_by_biosample))

for (biosample in 1:length(pathway_detail_identifications)) {
  
  all_detections <- unique(c(gene_annotations_list[[biosample]]$ko_id,
                             protein_annotations_list[[biosample]]$ko_id,
                             metabolite_annotations_list[[biosample]]$compound_id))
  
  pathway_detail_identifications[[biosample]] <- data.frame(
    biomolecules = all_detections[toupper(all_detections) %in% toupper(pathway_detail_elements)])
}

# Count how many samples each pathway biomolecule appeared in
pathway_detail_identifications <- pathway_detail_identifications %>%
  bind_rows(.id = "biosample") %>%
  group_by(biomolecules) %>%
  summarize(sample_frequency = n()) %>% 
  ungroup() %>%
  # Calculate proportion of samples where the molecule was identified
  mutate(sample_frequency = sample_frequency/length(pathway_detail_identifications)) %>%
  # Clean up biomolecule ID format for pathview (below)
  mutate(biomolecules = str_match(biomolecules, ":(.+)$")[,2])

pathway_detail_identifications <- pull(pathway_detail_identifications) %>%
  setNames(pathway_detail_identifications$biomolecules)

print(pathway_detail_identifications)

    K00024     K00026     K00029     K00134     K00150     K00615     K00616 
1.00000000 0.06451613 1.00000000 1.00000000 0.96774194 1.00000000 1.00000000 
    K00814     K00855     K00927     K01006     K01086     K01595     K01601 
0.03225806 0.90322581 1.00000000 1.00000000 0.29032258 1.00000000 1.00000000 
    K01602     K01610     K01623     K01624     K01783     K01803     K01807 
0.90322581 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 1.00000000 
    K01808     K02446     K03841     K04041     K11532     K11645     K13810 
0.96774194 1.00000000 1.00000000 0.48387097 0.93548387 1.00000000 1.00000000 
    C00022     C00036     C00041     C00049     C00074     C00085     C00199 
0.09677419 0.74193548 0.64516129 0.70967742 0.58064516 0.38709677 0.48387097 
    C00279     C00354 
1.00000000 0.77419355

if(Sys.getenv("COLAB_BACKEND_VERSION") != "") download.file(url = "https://github.com/microbiomedata/nmdc_notebooks/blob/main/omics_types_integration/R/ko00710.pathview.png")

# Save a colored KEGG pathway map to the working folder
# pathview(
#   gene.data = pathway_detail_identifications[which(str_detect(names(pathway_detail_identifications), "K"))],
#   cpd.data = pathway_detail_identifications[which(str_detect(names(pathway_detail_identifications), "C"))],
#   pathway.id = str_match(pathway_detail_id, ":map(.+)$")[,2],
#   species = "ko",
#   kegg.native = TRUE,
#   same.layer = TRUE,
#   limit = list(gene = c(0, 1), cpd = c(0, 1)),
#   bins = 14,
#   low=list(gene="cadetblue"),
#   mid = list(gene = "goldenrod"),
#   high=list(gene="palevioletred"),
#   key.align = "y"
# )

Value	Description
Annotation Enzyme Commission	Tab delimited file for EC annotation
Annotation KEGG Orthology	Tab delimited file for KO annotation
GC-MS Metabolomics Results	GC-MS-based metabolite assignment results table
Protein Report	Filtered protein report file

biosample_id	id	type	name	description	file_size_bytes	md5_checksum	data_object_type	url	data_category
<chr>	<chr>	<chr>	<chr>	<chr>	<dbl>	<chr>	<chr>	<chr>	<chr>
nmdc:bsm-13-0jw5n594	nmdc:dobj-11-g091y451	nmdc:DataObject	nmdc_dobj-11-4kwfhk83_nmdc_dobj-11-haabxj14_Protein_Report.tsv	Aggregated protein lists from MSGF+ search results filtered to ~5% FDR	50342	7cb4c883bf3387bb8ee5e85167220074	Protein Report	https://nmdcdemo.emsl.pnnl.gov/proteomics/results/2/nmdc_dobj-11-4kwfhk83_nmdc_dobj-11-haabxj14_Protein_Report.tsv	processed_data
nmdc:bsm-13-0jw5n594	nmdc:dobj-11-yzphz319	nmdc:DataObject	nmdc_wfmgan-11-bnzfrb88.1_ec.tsv	EC Annotations for nmdc:wfmgan-11-bnzfrb88.1	6398472	2650ba479f1f80e1360dedbfee2f55c0	Annotation Enzyme Commission	https://data.microbiomedata.org/data/nmdc:omprc-13-e4rebn56/nmdc:wfmgan-11-bnzfrb88.1/nmdc_wfmgan-11-bnzfrb88.1_ec.tsv	processed_data
nmdc:bsm-13-0jw5n594	nmdc:dobj-11-9h6wwc88	nmdc:DataObject	nmdc_dobj-11-4kwfhk83_kaiko_Protein_Report.tsv	Aggregated protein lists from MSGF+ search results filtered to ~5% FDR	1363837	61e01f76d3b6c2308ee07d909cba0a06	Protein Report	https://nmdcdemo.emsl.pnnl.gov/proteomics/results/nmdc_dobj-11-4kwfhk83_kaiko_Protein_Report.tsv	processed_data
nmdc:bsm-13-0jw5n594	nmdc:dobj-13-bdjqzn54	nmdc:DataObject	Froze_Core_2015_N1_0_10_13_Metab.csv	MetaMS GC-MS metabolomics output detail CSV file	460644	1c9cb77275d4977bc171faad2aef7e47	GC-MS Metabolomics Results	https://nmdcdemo.emsl.pnnl.gov/metabolomics/results/Froze_Core_2015_N1_0_10_13_Metab.csv	processed_data
nmdc:bsm-13-0jw5n594	nmdc:dobj-11-wr2qfy21	nmdc:DataObject	nmdc_dobj-11-4kwfhk83_nmdc_dobj-11-haabxj14_Protein_Report.tsv	Aggregated protein lists from MSGF+ search results filtered to ~5% FDR	14504	f85ee1a8ed1816d6633c7e7571cb873e	Protein Report	https://nmdcdemo.emsl.pnnl.gov/proteomics/results/nmdc_dobj-11-4kwfhk83_nmdc_dobj-11-haabxj14_Protein_Report.tsv	processed_data
nmdc:bsm-13-0jw5n594	nmdc:dobj-11-3a4xgj72	nmdc:DataObject	nmdc_wfmgan-11-bnzfrb88.1_ko.tsv	KEGG Orthology for nmdc:wfmgan-11-bnzfrb88.1	9537499	5a879fb02d7195672e7fbe8e502a3aee	Annotation KEGG Orthology	https://data.microbiomedata.org/data/nmdc:omprc-13-e4rebn56/nmdc:wfmgan-11-bnzfrb88.1/nmdc_wfmgan-11-bnzfrb88.1_ko.tsv	processed_data

id	has_output	version
<chr>	<chr>	<chr>
nmdc:wfmp-11-mwnsey20.1	nmdc:dobj-11-cf9tq368	v2.0.0
nmdc:wfmp-11-mwnsey20.1	nmdc:dobj-11-2amcs255	v2.0.0
nmdc:wfmp-11-mwnsey20.1	nmdc:dobj-11-0aynzc71	v2.0.0
nmdc:wfmp-11-mwnsey20.1	nmdc:dobj-11-r3rzfh15	v2.0.0
nmdc:wfmp-11-mwnsey20.1	nmdc:dobj-11-pf2rkv95	v2.0.0
nmdc:wfmp-11-mwnsey20.1	nmdc:dobj-11-9gr5va56	v2.0.0

gene_id	img_ko_flag	ko_term	percent_identity	query_start	query_end	subj_start	subj_end	evalue	bit_score	align_length
<chr>	<dbl>	<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
nmdc:wfmgan-11-bnzfrb88.1_000001_5389_7344	2794162845	KO:K03336	82.21	2	651	8	656	0.0e+00	1210.0	650
nmdc:wfmgan-11-bnzfrb88.1_000001_7341_8171	2527268703	KO:K02510	78.12	13	268	11	265	7.4e-139	453.0	256
nmdc:wfmgan-11-bnzfrb88.1_000001_12306_12872	2995783332	KO:K00010	78.31	1	189	5	193	5.0e-104	345.0	189
nmdc:wfmgan-11-bnzfrb88.1_000002_7256_7507	2831771083	KO:K05570	63.86	1	83	1	83	1.3e-30	125.0	83
nmdc:wfmgan-11-bnzfrb88.1_000002_7504_7836	2523088826	KO:K05571	50.60	6	88	4	85	6.8e-17	88.5	83
nmdc:wfmgan-11-bnzfrb88.1_000002_7836_8093	2522225331	KO:K14113	69.86	2	74	5	77	3.4e-28	118.0	73

	compound_id	compound_trimmed	ec_id	module_id	pathway_id
	<chr>	<chr>	<list>	<list>	<list>
1	cpd:C00009	C00009	c("ec:1.....	c("md:M0....	c("path:....
2	cpd:C00013	C00013	c("ec:1.....		c("path:....
3	cpd:C00016	C00016	c("ec:1.....	c("md:M0....	c("path:....

Identifying relationships between annotated omics data in NMDC¶

1. Retrieve data from the NMDC database using API endpoints¶

Choose data to retrieve¶

Retrieve and filter data for Columbia River sediment study¶

Choose results by workflow version¶

Download selected results files¶

Extract unique identifications¶

2. Use annotations in file output to link identifications¶

3. Get IDs from other KEGG databases¶

Gather metabolite information¶

Gather protein information¶

Gather gene information¶

Identify linked IDs between biosamples¶

4. Visualize shared biomolecule identifications¶

5. Calculate coverage of KEGG pathways across samples¶

6. Visualize KEGG pathway coverage across samples¶

Pull biosample metadata¶

Create pathway coverage heatmaps¶

Highest coverage pathways¶

7. Investigate an example KEGG pathway¶

Licensing Information¶

Package Citations¶

A data.frame: 6 × 6
	ko_id	ec_id	module_id	pathway_id	In_Metab_Annotations	In_Metag_Annotations
	<chr>	<chr>	<list>	<list>	<lgl>	<lgl>
1	KO:K01607	EC:4.1.1.44		c("path:....	FALSE	TRUE
2	KO:K01992	NA			FALSE	FALSE
3	KO:K01999	NA		c("path:....	FALSE	FALSE
4	KO:K02058	NA			FALSE	FALSE
5	KO:K03070	EC:7.4.2.8		c("path:....	FALSE	TRUE
6	KO:K03622	NA			FALSE	FALSE

A tibble: 6 × 9
pathway_id	compound_count	ko_count	total_elements	compound_found	ko_found	compound_coverage	ko_coverage	total_coverage
<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
path:map00571	2	14	16	0	11	0.0000000	0.7857143	0.6875000
path:map00710	21	37	58	6	25	0.2857143	0.6756757	0.5344828
path:map04122	11	29	40	3	18	0.2727273	0.6206897	0.5250000
path:map00220	24	67	91	8	39	0.3333333	0.5820896	0.5164835
path:map00770	30	45	75	8	30	0.2666667	0.6666667	0.5066667
path:map00260	50	110	160	11	70	0.2200000	0.6363636	0.5062500

A tibble: 6 × 8
id	description	depth.has_numeric_value	lat_lon.has_raw_value	lat_lon.latitude	lat_lon.longitude	lat_lon.type	samp_name
<chr>	<chr>	<dbl>	<chr>	<dbl>	<dbl>	<chr>	<chr>
nmdc:bsm-13-0jw5n594	Riverbed sediment samples were collected from an area with no vegetation in a hyporheic zone (subsurface groundwater - surface water mixing zone)	0.0	46.37380143 -119.2723688	46.3738	-119.2724	nmdc:GeolocationValue	GW-RW N1_0_10
nmdc:bsm-13-13145k83	Riverbed sediment samples were collected from an area with no vegetation in a hyporheic zone (subsurface groundwater - surface water mixing zone)	0.2	46.37379502 -119.2722385	46.3738	-119.2722	nmdc:GeolocationValue	GW-RW N3_20_30
nmdc:bsm-13-1p0tct86	Riverbed sediment samples were collected from an area with no vegetation in a hyporheic zone (subsurface groundwater - surface water mixing zone)	0.2	46.37380143 -119.2723688	46.3738	-119.2724	nmdc:GeolocationValue	GW-RW N1_20_30
nmdc:bsm-13-2fw5j754	Riverbed sediment samples were collected from an area with no vegetation in a hyporheic zone (subsurface groundwater - surface water mixing zone)	0.4	46.37379502 -119.2722385	46.3738	-119.2722	nmdc:GeolocationValue	GW-RW N3_40_50
nmdc:bsm-13-2yga6857	Riverbed sediment samples were collected from an area with no vegetation in a hyporheic zone (subsurface groundwater - surface water mixing zone)	0.1	46.37379502 -119.2722385	46.3738	-119.2722	nmdc:GeolocationValue	GW-RW N3_10_20
nmdc:bsm-13-3dt9xs73	Riverbed sediment samples were collected from an area with no vegetation in a hyporheic zone (subsurface groundwater - surface water mixing zone)	0.5	46.37379502 -119.2722385	46.3738	-119.2722	nmdc:GeolocationValue	GW-RW N3_50_60