# Setup 
# Add renv project library to R environment variable libPaths()
.libPaths(c(.libPaths(), "../../renv/library/*/R-*/*"))

# Load required packages
suppressPackageStartupMessages({
library(dplyr, warn.conflicts = FALSE)
library(tidyr, warn.conflicts = FALSE)
library(stringr, warn.conflicts = FALSE)
library(readr, warn.conflicts = FALSE)
library(ggplot2, warn.conflicts = FALSE)
library(fs)
library(digest)
library(httr)
library(stats)
library(purrr)
library(tibble)
library(jsonlite)
library(KEGGREST)
library(curl)
})

options(repr.plot.width = 12, repr.plot.height = 8)

# Set this to TRUE when you want to recompute even if the cached_data exists
FORCE_RUN <- as.logical(Sys.getenv("FORCE_RUN", unset = "FALSE"))
SAMPLING_SIZE <- Sys.getenv("SAMPLING_SIZE", unset = -1) # convention: None/0/<0 means "read all"
CACHE_DIR <- Sys.getenv("CACHE_DIR", unset = "../cached_data")

# Load NMDC API functions from this repo
if(Sys.getenv("COLAB_BACKEND_VERSION") == "") source("../../utility_functions.R")

if(Sys.getenv("COLAB_BACKEND_VERSION") != "") source("http://raw.githubusercontent.com/microbiomedata/nmdc_notebooks/refs/heads/main/utility_functions.R")

# Retrieve all data objects associated with this study
dobj <- get_data_objects_for_study("nmdc:sty-11-8ws97026")

# 1) Filter to biosamples that have ALL three required data_object_type entries
dobj_filtered <- dobj %>%
  group_by(biosample_id) %>%
  filter(all(c("Annotation KEGG Orthology",
               "Metatranscriptome Expression",
               "Gene Phylogeny tsv") %in% data_object_type)) %>%
  ungroup()

# 2) Keep only the desired file types and select/rename columns
results_by_biosample <- dobj_filtered %>%
  filter(data_object_type %in% c("Annotation KEGG Orthology",
                                 "Metatranscriptome Expression",
                                 "Gene Phylogeny tsv")) %>%
  transmute(
    biosample_id,
    output_id = id,          # rename 'id' -> 'output_id'
    data_object_type,
    url
  )

# 3) Check counts per data_object_type (equivalent to pandas value_counts)
type_counts <- results_by_biosample %>%
  count(data_object_type, name = "n")

# Peek
head(results_by_biosample)
type_counts

# --- Flag MetaT / MetaG by URL ---
results_by_biosample <- results_by_biosample %>%
  mutate(
    `_has_metaT` = grepl("nmdc:wfmt", url, ignore.case = TRUE),
    `_has_metaG` = grepl("nmdc:wfmg", url, ignore.case = TRUE),
    `_is_annotation` = data_object_type == "Annotation KEGG Orthology",
    `_is_lineage`    = data_object_type == "Gene Phylogeny tsv"
  ) %>%
  # Relabel data_object_type based on flags (use masks computed above)
  mutate(
    data_object_type = case_when(
      `_has_metaT` & `_is_annotation` ~ "MetaT Annotation KEGG Orthology",
      `_has_metaG` & `_is_annotation` ~ "MetaG Annotation KEGG Orthology",
      `_has_metaT` & `_is_lineage`    ~ "MetaT Gene Phylogeny tsv",
      `_has_metaG` & `_is_lineage`    ~ "MetaG Gene Phylogeny tsv",
      TRUE ~ data_object_type
    )
  )

# --- Per-biosample aggregates ---
agg <- results_by_biosample %>%
  group_by(biosample_id) %>%
  summarise(
    rows_per_biosample = n(),
    has_metaT = any(`_has_metaT`, na.rm = TRUE),
    has_metaG = any(`_has_metaG`, na.rm = TRUE),
    metaT_row_count = sum(`_has_metaT`, na.rm = TRUE),
    metaG_row_count = sum(`_has_metaG`, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(
    category = case_when(
      has_metaT & has_metaG ~ "both_metaT_and_metaG",
      has_metaT & !has_metaG ~ "metaT_only",
      has_metaG & !has_metaT ~ "metaG_only",
      TRUE ~ "neither"
    )
  )

# --- Category counts (like value_counts) ---
counts <- agg %>%
  count(category, name = "count") %>%
  arrange(desc(count))

print(counts)

# --- Keep only biosamples with BOTH MetaT and MetaG ---
keep_ids <- agg %>%
  filter(category == "both_metaT_and_metaG") %>%
  pull(biosample_id) %>%
  unique()

drop_ids <- agg %>%
  filter(category != "both_metaT_and_metaG") %>%
  pull(biosample_id) %>%
  unique()

filtered_results_by_biosample <- results_by_biosample %>%
  filter(biosample_id %in% keep_ids) %>%
  select(-`_has_metaT`, -`_has_metaG`, -`_is_annotation`, -`_is_lineage`)

# --- Wide table: one row per biosample, columns = data_object_type, values = url ---
wide <- filtered_results_by_biosample %>%
  select(biosample_id, data_object_type, url) %>%
  pivot_wider(names_from = data_object_type, values_from = url)

head(wide, 10)

# A tibble: 2 × 2
  category             count
  <chr>                <int>
1 both_metaT_and_metaG    25
2 metaT_only              14

# -- Helper: turn length-0 into NA
scalar_or_na <- function(x) if (length(x) == 0) NA else x

# -- Helper: extract has_raw_value; keep multi-valued as list-col
extract_value <- function(value) {
  # If it's a list with has_raw_value, return that scalar
  if (is.list(value) && "has_raw_value" %in% names(value)) {
    return(scalar_or_na(value$has_raw_value))
  }
  # If it's an atomic vector of length > 1, wrap as list-column
  if (is.atomic(value) && length(value) > 1) {
    return(list(value))
  }
  # If it's a complex/named list without has_raw_value, keep as list-col
  if (is.list(value) && length(value) > 1) {
    return(list(value))
  }
  # Fallback: scalar or length-0 -> NA
  scalar_or_na(value)
}

# -- Optional: collapse list-cols that are atomic vectors into a single string
collapse_multi_vals <- function(x, sep = "; ") {
  if (is.list(x) && length(x) == 1 && is.atomic(x[[1]]) && length(x[[1]]) > 1) {
    return(paste(x[[1]], collapse = sep))
  }
  x
}

# --- Main: flatten biosample records into a tibble
# Set collapse_multi = TRUE if you want plain strings instead of list-cols
biosample_records_to_table <- function(biosample_records, collapse_multi = TRUE, sep = "; ") {
  rows <- map(biosample_records, function(data) {
    if (is.null(data)) return(NULL)
    flat <- map(data, extract_value)

    if (collapse_multi) {
      flat <- lapply(flat, collapse_multi_vals, sep = sep)
    }

    as_tibble(flat, .name_repair = "unique")
  })

  df <- bind_rows(rows) %>%
    rename(biosample_id = id)

  df
}


biosample_ids <- unique(wide$biosample_id)
biosample_records <- map(biosample_ids, get_biosample_record_by_id)

# Keep multi-valued fields as list-columns (safe for further processing)
biosample_metadata <- biosample_records_to_table(biosample_records)

# Save
write_csv(biosample_metadata, "biosample_metadata.csv")

# Reuse your domain normalizer
# Vectorized URL normalizer (handles character, list-cols, NA/empty, "nan")
normalize_nmdc_url <- function(url) {
  # If list-column, take first element per cell (or NA)
  if (is.list(url)) {
    url <- vapply(
      url,
      function(x) if (length(x) == 0) NA_character_ else as.character(x[[1]]),
      character(1)
    )
  }
  url <- as.character(url)

  # Normalize empties / sentinel strings to NA
  bad <- is.na(url) | !nzchar(url) | tolower(url) %in% c("nan", "na", "null", "none")
  url[bad] <- NA_character_

  # Rewrite domain
  #url <- str_replace(url,
  #                   "^https://data\\.microbiomedata\\.org",
  #                   "https://data-backup.microbiomedata.org")
  url
}

# Build a *lightweight* datastore index: just IDs + URLs (no data loaded)
build_datastore_index <- function(wide_df) {
  stopifnot(all(c("biosample_id") %in% names(wide_df)))

  # These column names must match your wide table’s columns
  cols_needed <- c(
    "MetaG Annotation KEGG Orthology",
    "MetaT Annotation KEGG Orthology",
    "MetaG Gene Phylogeny tsv",
    "MetaT Gene Phylogeny tsv",
    "Metatranscriptome Expression"
  )
  missing <- setdiff(cols_needed, names(wide_df))
  if (length(missing)) stop("Missing columns in `wide_df`: ", paste(missing, collapse = ", "))

  wide_df %>%
    transmute(
      biosample_id,
      metag_ko_url      = normalize_nmdc_url(`MetaG Annotation KEGG Orthology`),
      metat_ko_url      = normalize_nmdc_url(`MetaT Annotation KEGG Orthology`),
      metag_lineage_url = normalize_nmdc_url(`MetaG Gene Phylogeny tsv`),
      metat_lineage_url = normalize_nmdc_url(`MetaT Gene Phylogeny tsv`),
      metat_expr_url    = normalize_nmdc_url(`Metatranscriptome Expression`)
    )
}
# Column specs
ko_columns <- c(
  "gene_id","img_ko_flag","ko_term","percent_identity",
  "query_start","query_end","subj_start","subj_end",
  "evalue","bit_score","align_length"
)
lineage_columns <- c("gene_id","homolog_gene_oid","homolog_taxon_oid","percent_identity","lineage")

parse_subsample_size <- function(x) {
  if (is.null(x) || is.na(x)) return(Inf)
  x <- tolower(trimws(as.character(x)))
  if (x %in% c("", "full", "all", "inf", "infinite")) return(Inf)
  n <- suppressWarnings(as.integer(x))
  if (is.na(n) || n <= 0) return(Inf)
  n
}

# With cache: save to cache_dir/<hash>.tsv then read locally
read_tsv_cached <- function(url, column_names = NULL, cache_dir = "cached_data", subsample_size = "full",
                            max_retries = 3, sleep_seconds = 2) {
  if (is.na(url) || is.null(url) || url == "" || identical(tolower(url), "nan")) return(NULL)

  dir_create(cache_dir)
  key <- digest(url, algo = "xxhash64")
  local_path <- path(cache_dir, paste0(key, ".tsv"))
  n_lines <- parse_subsample_size(subsample_size)

  col_names_arg <- if (is.null(column_names)) TRUE else column_names
  # Compute n_max (rows) to approximate "N lines"
  n_max_arg <- Inf
  if (is.finite(n_lines)) {
    if (is.null(column_names)) {
      n_max_arg <- max(n_lines - 1, 0)  # Correct: skip header line
    } else {
     n_max_arg <- n_lines              # WRONG: read_tsv() may still skip header
    }
  }

  # If cached, just read
  if (file_exists(local_path)) {
    message(sprintf("Using cached file: %s", local_path))
    return(tryCatch({
      readr::read_tsv(local_path, col_names = col_names_arg, n_max = n_max_arg, show_col_types = FALSE, progress = FALSE)
    }, error = function(e) { warning("Failed to read cached: ", local_path, ": ", e$message); NULL }))
  }

  # Else, try download+read (retry)
  attempt <- 1
  while (attempt <= max_retries) {
    tryCatch({
      message(sprintf("Downloading: %s (attempt %d)", url, attempt))

      table <- readr::read_tsv(url, col_names = col_names_arg, n_max = n_max_arg, show_col_types = FALSE, progress = FALSE)
      # Save to temp then move into cache (atomic)
      tmp <- tempfile(fileext = ".tsv")
      if (!is.null(column_names) && length(column_names) > 0) {
        # Strip headers before caching for no-header files
        write_tsv(table, tmp, col_names = FALSE)
      } else {
        write_tsv(table, tmp)
      }
      file_move(tmp, local_path)
      return(table)
    },
    error = function(e) {
      warning(sprintf("Attempt %d failed for %s: %s", attempt, url, e$message))
      if (attempt < max_retries) Sys.sleep(sleep_seconds * attempt)
    })
    attempt <- attempt + 1
  }

  warning("❌ Failed after retries: ", url)
  NULL
}

# Accessors (read only what’s asked for)
load_ko_table <- function(url) {
  read_tsv_cached(url, cache_dir = "cached", column_names = ko_columns , subsample_size = SAMPLING_SIZE)
}
load_lineage_table <- function(url) {
  read_tsv_cached(url, cache_dir = CACHE_DIR, column_names = lineage_columns, subsample_size = SAMPLING_SIZE)
}
load_expr_table <- function(url) {
  read_tsv_cached(url, cache_dir = CACHE_DIR, column_names = NULL, subsample_size = SAMPLING_SIZE)  # header inferred
}

#   Build the lightweight index from your wide table
#    (wide has biosample_id + the 5 columns from the pivot step)
datastore_index <- build_datastore_index(wide)
# Check total number of biosamples & preview one file ---
cat("# of biosamples: ", length(datastore_index$biosample_id), "\n")

# of biosamples:  25

# Extract KO id like "K#####"
extract_ko_id <- function(x) {
  if (!is.character(x)) return(NA_character_)
  m <- str_extract(toupper(x), "K\\d{5}")
  m
}

# Standardize a KO dataframe
standardize_ko_df <- function(df_ko) {
  # Return an empty tibble if NULL or 0 rows
  if (is.null(df_ko) || nrow(df_ko) == 0) {
    print("KO dataframe is NULL or empty; returning empty tibble.")
    return(tibble(gene_id = character(), ko_id = character(), ko_term = character()))
  }
  needed <- c("gene_id", "ko_term")
  missing_cols <- setdiff(needed, names(df_ko))
  if (length(missing_cols) > 0) {
    stop(sprintf("KO file missing required column(s): %s.", paste(missing_cols, collapse = ", ")))
  }
  
  out <- df_ko %>%
    mutate(
      ko_id = vapply(ko_term, extract_ko_id, FUN.VALUE = character(1)),
      gene_id = as.character(.data$gene_id)
    ) %>%
    filter(!is.na(ko_id)) %>%
    select(gene_id, ko_id, ko_term) %>%
    distinct()

  out
}

# Returns:
# - ko_overlap_df: per-biosample overlap stats
# - total_ko_ids:  deduped vector of all KO IDs seen (MetaG+MetaT)
# - ko_sets:       named list[biosample_id] -> list(metag = <KO vec>, metat = <KO vec>)
compute_ko_overlap_streaming <- function(datastore_index, include_metat = TRUE) {
  stopifnot("biosample_id" %in% names(datastore_index),
            "metag_ko_url" %in% names(datastore_index))

  rows <- vector("list", nrow(datastore_index))
  ko_accum <- character(0)
  ko_sets  <- vector("list", nrow(datastore_index))

  for (i in seq_len(nrow(datastore_index))) {
    biosample_id <- as.character(datastore_index$biosample_id[i])
    print(sprintf("Processing biosample %s (%d of %d)", biosample_id, i, nrow(datastore_index)))
    # ---- load MetaG ----
    mg <- tryCatch({
      standardize_ko_df(load_ko_table(datastore_index$metag_ko_url[i]))
    }, error = function(e) tibble(gene_id=character(), ko_id=character(), ko_term=character()))

    mg_ids <- unique(na.omit(mg$ko_id))
    mg_ids <- mg_ids[grepl("^K\\d{5}$", mg_ids)]

    # ---- load MetaT (optional) ----
    mt_ids <- character(0)
    if (isTRUE(include_metat) && "metat_ko_url" %in% names(datastore_index)) {
      mt <- tryCatch({
        standardize_ko_df(load_ko_table(datastore_index$metat_ko_url[i]))
      }, error = function(e) tibble(gene_id=character(), ko_id=character(), ko_term=character()))
      mt_ids <- unique(na.omit(mt$ko_id))
      mt_ids <- mt_ids[grepl("^K\\d{5}$", mt_ids)]
    }

    # ---- per-sample stats ----
    mg_set <- mg_ids
    mt_set <- mt_ids
    shared <- intersect(mg_set, mt_set)
    uni    <- union(mg_set, mt_set)

    rows[[i]] <- tibble(
      biosample_id   = biosample_id,
      n_metag_ko     = length(mg_set),
      n_metat_ko     = length(mt_set),
      n_shared_ko    = length(shared),
      n_metag_only   = length(setdiff(mg_set, mt_set)),
      n_metat_only   = length(setdiff(mt_set, mg_set)),
      jaccard        = if (length(uni) > 0) length(shared) / length(uni) else NA_real_
    )

    # ---- stash KO sets for reuse ----
    ko_sets[[i]] <- list(metag = mg_set, metat = mt_set)
    names(ko_sets)[i] <- biosample_id

    # ---- accumulate for global KO list ----
    ko_accum <- c(ko_accum, mg_set, mt_set)
  }

  ko_overlap_df <- dplyr::bind_rows(rows)
  total_ko_ids  <- sort(unique(ko_accum))
  total_ko_ids  <- total_ko_ids[grepl("^K\\d{5}$", total_ko_ids)]

  list(
    ko_overlap_df = ko_overlap_df,
    total_ko_ids  = total_ko_ids,
    ko_sets       = ko_sets
  )
}

ko_overlap_df_path <- file.path(CACHE_DIR, "ko_overlap_summary.csv")
# -------- Load data (prefer CSV if present) --------
if (file.exists(ko_overlap_df_path) && !isTRUE(FORCE_RUN)) {
  ko_overlap_df <- readr::read_csv(ko_overlap_df_path, show_col_types = FALSE)
  message("Loaded existing KO overlap summary from: ", ko_overlap_df_path)
} else {
    
  # Run and save
  stream_res <- compute_ko_overlap_streaming(datastore_index)
  ko_overlap_df <- stream_res$ko_overlap_df
  total_ko_ids  <- stream_res$total_ko_ids
  ko_sets       <- stream_res$ko_sets   # <-- per-biosample KO sets
  write_csv(ko_overlap_df, file.path(CACHE_DIR, "ko_overlap_summary.csv"))

  # Show a peek
  print(head(ko_overlap_df, 20))
  message("Saved: ", file.path(CACHE_DIR, "ko_overlap_summary.csv"))
}

Loaded existing KO overlap summary from: ../cached_data/ko_overlap_summary.csv

html_path  <- file.path(CACHE_DIR, "plot_shared_ko_barchart.html")
ko_overlap_df_path <- file.path(CACHE_DIR, "ko_overlap_summary.csv")
# -------- Load data (prefer CSV if present) --------
if (file.exists(ko_overlap_df_path)) {
  ko_overlap_df <- readr::read_csv(ko_overlap_df_path, show_col_types = FALSE)
} else if (exists("ko_overlap_df")) {
  message("Using in-memory ko_overlap_df (CSV not found).")
} else {
  stop("ko_overlap_summary.csv not found and ko_overlap_df not in memory.")
}

# Nothing to plot?
if (nrow(ko_overlap_df) == 0) {
  stop("ko_overlap_df is empty; nothing to plot.")
}

# ---- Reshape to long format ----
df_long <- ko_overlap_df %>%
  pivot_longer(
    cols = c(n_metag_only, n_metat_only, n_shared_ko),
    names_to = "category",
    values_to = "count"
  ) %>%
  mutate(
    category = factor(
      category,
      levels = c("n_metag_only", "n_metat_only", "n_shared_ko"),
      labels = c("MetaG only", "MetaT only", "Shared KO")
    )
  )

# Optional: order biosamples by total count (for better visual sorting)
sample_order <- df_long %>%
  group_by(biosample_id) %>%
  summarise(total = sum(count, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(total)) %>%
  pull(biosample_id)

#df_long <- df_long %>%
#  mutate(biosample_id = factor(biosample_id, levels = sample_order))

# ---- Define color map (same as Python: blue, orange, green) ----
cat_colors <- c("MetaG only" = "blue", "MetaT only" = "orange", "Shared KO" = "green")

# ---- Plot with ggplot2 ----
p <- ggplot(df_long, aes(x = biosample_id, y = count, fill = category)) +
  geom_bar(stat = "identity", position = "stack") +
  scale_fill_manual(values = cat_colors, name = "Category") +
  labs(
    title = "Shared vs Unique KO annotations per biosample",
    x = "Biosample ID",
    y = "Count"
  ) +
  theme_bw(base_size = 12) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 9),
    plot.title = element_text(hjust = 0.5)
  )

# ---- Display in R session ----
print(p)

# Join metadata (left join on biosample_id)
ko_with_meta <- ko_overlap_df %>%
  left_join(
    biosample_metadata %>% select(biosample_id, env_local_scale),
    by = "biosample_id"
  )

# ---------------------------
# 1) Violin plot (with box + points): n_shared_ko by ecosystem
# ---------------------------
if (nrow(ko_with_meta) > 0) {
  # order ecosystems by median shared KO (optional, for nicer layout)
  eco_order <- ko_with_meta %>%
    group_by(env_local_scale) %>%
    summarise(med = median(n_shared_ko, na.rm = TRUE), .groups = "drop") %>%
    arrange(desc(med)) %>%
    pull(env_local_scale)

  ko_with_meta <- ko_with_meta %>%
    mutate(env_local_scale = factor(env_local_scale, levels = eco_order))

  p_violin <- ggplot(ko_with_meta, aes(x = env_local_scale, y = n_shared_ko)) +
    geom_violin(trim = FALSE) +
    geom_boxplot(width = 0.15, outlier.shape = NA) +
    geom_jitter(width = 0.15, alpha = 0.5, size = 1) +
    labs(
      title = "Shared KO counts across ecosystems",
      x = "Ecosystem (env_local_scale)",
      y = "Shared KO count"
    ) +
    theme_bw(base_size = 12) +
    theme(
      axis.text.x = element_text(size=14),
      plot.title = element_text(hjust = 0.5)
    )

  print(p_violin)
}

# ---------------------------
# 2) Stacked bars of mean overlap metrics per ecosystem
# ---------------------------
eco_summary <- ko_with_meta %>%
  group_by(env_local_scale) %>%
  summarise(
    n_shared_ko  = mean(n_shared_ko,  na.rm = TRUE),
    n_metag_only = mean(n_metag_only, na.rm = TRUE),
    n_metat_only = mean(n_metat_only, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  pivot_longer(
    cols = c(n_shared_ko, n_metag_only, n_metat_only),
    names_to = "metric",
    values_to = "value"
  ) %>%
  mutate(
    metric = factor(
      metric,
      levels = c("n_shared_ko", "n_metag_only", "n_metat_only"),
      labels = c("Shared KO", "MetaG only", "MetaT only")
    ),
    env_local_scale = factor(env_local_scale, levels = eco_order)
  )

if (nrow(eco_summary) > 0) {
  color_map <- c("Shared KO" = "green", "MetaG only" = "blue", "MetaT only" = "orange")

  p_bar <- ggplot(eco_summary, aes(x = env_local_scale, y = value, fill = metric)) +
    geom_col(position = "stack") +
    scale_fill_manual(values = color_map, name = "Metric") +
    labs(
      title = "Average KO overlap (MetaG vs MetaT) per ecosystem",
      x = "Ecosystem (env_local_scale)",
      y = "Average count"
    ) +
    theme_bw(base_size = 12) +
    theme(
      axis.text.x = element_text(size = 14),
      plot.title = element_text(hjust = 0.5)
    )

  print(p_bar)
}

RUN_KEGG_MAPPING <- TRUE

# --- Helpers ---
.ensure_cache <- function(dir) {
  if (!dir.exists(dir)) dir.create(dir, recursive = TRUE)
  invisible(dir)
}
.strip_prefix <- function(x, db) sub(paste0("^", db, ":"), "", x)

# ============== 1) KO -> Pathway links (cache + KEGGREST) ==============
# Returns tibble: ko_id, pathway_id (e.g., "K00001", "map00010")
kegg_link_ko_to_pathways <- function(ko_ids,
                                     cache_dir = CACHE_DIR,
                                     run_mapping = RUN_KEGG_MAPPING,
                                     chunk_size = 50) {
  # sanitize
  ko_ids <- sort(unique(ko_ids[grepl("^K\\d{5}$", ko_ids)]))
  if (length(ko_ids) == 0) {
    return(tibble(ko_id = character(), pathway_id = character()))
  }

  .ensure_cache(cache_dir)
  cache_path <- file.path(cache_dir, "kegg_ko_pathway_map.tsv")
  cached <- if (file.exists(cache_path)) {
    suppressMessages(readr::read_tsv(cache_path, col_types = "cc"))
  } else {
    tibble(ko_id = character(), pathway_id = character())
  }

  # which KOs not in cache at all?
  cached_kos <- unique(cached$ko_id)
  missing <- setdiff(ko_ids, cached_kos)

  new_rows <- tibble(ko_id = character(), pathway_id = character())

  if (run_mapping && length(missing) > 0) {
    # KEGGREST supports vectorized calls; chunk to be gentle
    chunks <- split(missing, ceiling(seq_along(missing) / chunk_size))
    for (vec in chunks) {
      # Build "ko:" IDs
      ko_q <- paste0("ko:", vec)
      # keggLink("pathway", c("ko:K00001","ko:K00002",...))
      res <- tryCatch(KEGGREST::keggLink("pathway", ko_q),
                      error = function(e) { message("[KEGG] link error: ", e$message); character(0) })

      if (length(res)) {
        # res is a named character vector: values are "path:map#####", names are "ko:K#####"
        add <- tibble(
          ko_id      = .strip_prefix(names(res), "ko"),
          pathway_id = .strip_prefix(unname(res), "path")
        ) %>% distinct()
        new_rows <- bind_rows(new_rows, add)
      } else {
        # no mappings returned for this chunk; still add missing as 0-row (do nothing)
        NULL
      }
    }

    if (nrow(new_rows)) {
      cached <- bind_rows(cached, new_rows) %>% distinct()
      readr::write_tsv(cached, cache_path)
    }
  }

  # Return only the rows for requested ko_ids
  cached %>%
    filter(ko_id %in% ko_ids) %>%
    distinct()
}

# ============== 2) Pathway ID -> Pathway name (cache + KEGGREST) ==============
# Returns tibble: pathway_id (map#####), pathway_name (string)
kegg_pathway_names <- function(pathway_ids,
                               cache_dir = CACHE_DIR,
                               run_mapping = RUN_KEGG_MAPPING,
                               chunk_size = 50) {
  # sanitize
  pathway_ids <- sort(unique(pathway_ids[grepl("^map\\d{5}$", pathway_ids)]))
  if (length(pathway_ids) == 0) {
    return(tibble(pathway_id = character(), pathway_name = character()))
  }

  .ensure_cache(cache_dir)
  cache_path <- file.path(cache_dir, "kegg_pathway_names.tsv")
  cached <- if (file.exists(cache_path)) {
    suppressMessages(readr::read_tsv(cache_path, col_types = "cc"))
  } else {
    tibble(pathway_id = character(), pathway_name = character())
  }
  cached_dict <- cached %>% distinct(pathway_id, pathway_name)

  missing <- setdiff(pathway_ids, cached_dict$pathway_id)
  new_rows <- tibble(pathway_id = character(), pathway_name = character())

  if (run_mapping && length(missing) > 0) {
    chunks <- split(missing, ceiling(seq_along(missing) / chunk_size))
    for (vec in chunks) {
      # KEGG ids must be "path:map00010" etc.
      path_q <- paste0("path:", vec)
      # keggList returns a named character vector: names=pathway IDs, values=names
      lst <- tryCatch(KEGGREST::keggList(path_q),
                      error = function(e) { message("[KEGG] list error: ", e$message); character(0) })
      if (length(lst)) {
        add <- tibble(
          pathway_id   = .strip_prefix(names(lst), "path"),
          pathway_name = unname(lst)
        ) %>% distinct()
        new_rows <- bind_rows(new_rows, add)
      }
    }

    if (nrow(new_rows)) {
      cached <- bind_rows(cached, new_rows) %>% distinct()
      readr::write_tsv(cached, cache_path)
    }
  }

  cached %>%
    filter(pathway_id %in% pathway_ids) %>%
    distinct()
}

# ============== 3) Build KO -> Pathway(+name) map from your datastore ==============
build_ko_to_pathway_mapping_from_kos <- function(ko_ids,
                                                 cache_dir = CACHE_DIR,
                                                 run_mapping = RUN_KEGG_MAPPING) {
  ko_ids <- unique(na.omit(ko_ids))
  ko_ids <- ko_ids[grepl("^K\\d{5}$", ko_ids)]
  if (length(ko_ids) == 0) {
    return(tibble(ko_id = character(), pathway_id = character(), pathway_name = character()))
  }

  ko2path <- kegg_link_ko_to_pathways(ko_ids,
                                      cache_dir   = cache_dir,
                                      run_mapping = run_mapping)

  path_names <- if (nrow(ko2path)) {
    kegg_pathway_names(unique(ko2path$pathway_id),
                       cache_dir   = cache_dir,
                       run_mapping = run_mapping)
  } else {
    tibble(pathway_id = character(), pathway_name = character())
  }

  ko2path %>%
    left_join(path_names, by = "pathway_id") %>%
    distinct()
}
# ===================== 4) Wrapper to use CSV cache ======================

get_or_build_ko_path_map <- function(ko_ids,
                                     cache_dir = CACHE_DIR,
                                     csv_name = "ko_path_map.csv",
                                     run_mapping = RUN_KEGG_MAPPING,
                                     FORCE_RUN = FALSE) {

  .ensure_cache(cache_dir)
  csv_path <- file.path(cache_dir, csv_name)

  # --- CASE 1: CSV already exists → skip computation ---
  if (file.exists(csv_path) && !isTRUE(FORCE_RUN)) {
    message("✔ Found cached KO→pathway map: ", csv_path)
    df <- readr::read_csv(csv_path, show_col_types = FALSE)
    return(df)
  }

  # --- CASE 2: CSV NOT found → run computation ---
  message("⚠ No cached KO→pathway map found. Running KEGG mapping...")

  ko_path_map <- build_ko_to_pathway_mapping_from_kos(
    ko_ids      = ko_ids,
    cache_dir   = cache_dir,
    run_mapping = run_mapping
  )

  # Save to CSV
  message("💾 Saving KO→pathway map to: ", csv_path)
  readr::write_csv(ko_path_map, csv_path)

  return(ko_path_map)
}

ko_path_map <- get_or_build_ko_path_map(
  ko_ids      = total_ko_ids,
  cache_dir   = CACHE_DIR,
  csv_name    = "ko_path_map.csv",
  run_mapping = RUN_KEGG_MAPPING,
  FORCE_RUN   = FORCE_RUN
)

head(ko_path_map, 10)

✔ Found cached KO→pathway map: ../cached_data/ko_path_map.csv

# ---- 1) Parse MetaT expression (unchanged helper) ----
parse_expression_df <- function(df_expr) {
  gene_col <- "img_gene_oid"
  expr_col <- "reads_cnt"
  OUT_gene_col <- "gene_id"
  OUT_expr_col <- "expr_value"

  if (is.null(df_expr) || nrow(df_expr) == 0) {
    return(tibble(!!OUT_gene_col := character(), !!OUT_expr_col := numeric()))
  }
  stopifnot(all(c(gene_col, expr_col) %in% names(df_expr)))

  df_expr %>%
    select(all_of(c(gene_col, expr_col))) %>%
    rename(!!OUT_gene_col := all_of(gene_col),
           !!OUT_expr_col := all_of(expr_col)) %>%
    mutate(
      !!OUT_expr_col := suppressWarnings(as.numeric(.data[[OUT_expr_col]])),
      !!OUT_gene_col := as.character(.data[[OUT_gene_col]])
    ) %>%
    filter(!is.na(.data[[OUT_gene_col]])) %>%
    group_by(.data[[OUT_gene_col]]) %>%
    summarise(!!OUT_expr_col := sum(.data[[OUT_expr_col]], na.rm = TRUE),
              .groups = "drop")
}

# ---- 1b) New: parse expression directly from URL via on-demand loader ----
# Requires you already defined: load_expr_table(url)  (cached+retry)
parse_expression_from_url <- function(expr_url) {
  df_raw <- load_expr_table(expr_url)
  parse_expression_df(df_raw)
}

# ---- 2) Compute pathway expression + coverage per biosample (on-demand loaders) ----
# datastore_index: needs biosample_id, metag_ko_url, metat_ko_url, metat_expr_url
# ko_path_map:     tibble(ko_id, pathway_id, pathway_name)
# ko_sets:         named list[biosample_id] -> list(metag=<vec>, metat=<vec>)  (from compute_ko_overlap_streaming)
# load_metat_ko_for_expression:
#   - TRUE  => will load MetaT KO table (for gene->ko mapping) to compute expression-by-pathway
#   - FALSE => skip expression aggregation (presence metrics only; no KO table loads)
compute_pathway_expression_coverage_on_demand <- function(datastore_index,
                                                          ko_path_map,
                                                          ko_sets = NULL,
                                                          load_metat_ko_for_expression = TRUE) {
  rows_expr <- list()
  rows_presence <- list()

  # Pre-split pathway map for speed
  kp_split <- if (!is.null(ko_path_map) && nrow(ko_path_map) > 0) {
    split(ko_path_map, ko_path_map$pathway_id)
  } else list()

  if (length(kp_split) == 0) {
    return(list(
      pathway_expr = tibble(biosample_id = character(), pathway_id = character(),
                            pathway_name = character(), expr_sum = numeric()),
      pathway_presence = tibble(biosample_id = character(), pathway_id = character(),
                                pathway_name = character(), n_metag_kos = integer(),
                                n_metat_kos = integer(), n_shared_kos = integer())
    ))
  }

  for (i in seq_len(nrow(datastore_index))) {
    biosample_id   <- as.character(datastore_index$biosample_id[i])
    metat_expr_url <- datastore_index$metat_expr_url[i]

    # ----- Get KO sets without loading tables -----
    if (!is.null(ko_sets) && !is.null(ko_sets[[biosample_id]])) {
      mg_kos <- unique(na.omit(ko_sets[[biosample_id]]$metag))
      mt_kos <- unique(na.omit(ko_sets[[biosample_id]]$metat))
    } else {
      # Fallback: load KO tables (old behavior)
      metag_ko_url <- datastore_index$metag_ko_url[i]
      metat_ko_url <- datastore_index$metat_ko_url[i]
      metag <- standardize_ko_df(load_ko_table(metag_ko_url))
      metat <- standardize_ko_df(load_ko_table(metat_ko_url))
      mg_kos <- unique(na.omit(metag$ko_id))
      mt_kos <- unique(na.omit(metat$ko_id))
    }
    sh_kos <- intersect(mg_kos, mt_kos)

    # ----- Presence counts per pathway (uses KO sets only) -----
    pres_tbl <- purrr::map_dfr(names(kp_split), function(pid) {
      sub <- kp_split[[pid]]
      p_name <- if ("pathway_name" %in% names(sub) && any(!is.na(sub$pathway_name))) {
        sub$pathway_name[which(!is.na(sub$pathway_name))[1]]
      } else pid
      path_kos <- unique(na.omit(sub$ko_id))
      tibble(
        biosample_id = biosample_id,
        pathway_id   = pid,
        pathway_name = p_name,
        n_metag_kos  = length(intersect(path_kos, mg_kos)),
        n_metat_kos  = length(intersect(path_kos, mt_kos)),
        n_shared_kos = length(intersect(path_kos, sh_kos))
      )
    })
    rows_presence[[length(rows_presence) + 1L]] <- pres_tbl

    # ----- Expression per pathway (MetaT only) -----
    if (isTRUE(load_metat_ko_for_expression)) {
      # We still need gene->KO mapping to aggregate expression by KO
      metat_ko_url <- datastore_index$metat_ko_url[i]
      metat <- standardize_ko_df(load_ko_table(metat_ko_url))
      expr  <- parse_expression_from_url(metat_expr_url)

      if (nrow(metat) > 0 && nrow(expr) > 0) {
        metat_expr <- metat %>%
          select(gene_id, ko_id, ko_term) %>%
          left_join(expr, by = "gene_id")

        if (nrow(metat_expr) > 0) {
          metat_expr_by_ko <- metat_expr %>%
            group_by(ko_id) %>%
            summarise(expr_value = sum(expr_value, na.rm = TRUE), .groups = "drop")

          joined <- metat_expr_by_ko %>%
            left_join(ko_path_map %>% select(ko_id, pathway_id, pathway_name) %>% distinct(),
                      by = "ko_id")

          grouped <- joined %>%
            group_by(pathway_id, pathway_name) %>%
            summarise(expr_value = sum(expr_value, na.rm = TRUE), .groups = "drop") %>%
            mutate(
              biosample_id = biosample_id,
              expr_sum     = as.numeric(expr_value)
            ) %>%
            select(biosample_id, pathway_id, pathway_name, expr_sum)

          if (nrow(grouped) > 0) rows_expr[[length(rows_expr) + 1L]] <- grouped
        }
      }
    }
  }

  pathway_expr <- if (length(rows_expr)) bind_rows(rows_expr) else
    tibble(biosample_id = character(), pathway_id = character(), pathway_name = character(), expr_sum = numeric())

  pathway_presence <- if (length(rows_presence)) bind_rows(rows_presence) else
    tibble(biosample_id = character(), pathway_id = character(), pathway_name = character(),
           n_metag_kos = integer(), n_metat_kos = integer(), n_shared_kos = integer())

  list(pathway_expr = pathway_expr, pathway_presence = pathway_presence)
}

if (file.exists(file.path(CACHE_DIR, "pathway_expression_by_sample.csv")) && !isTRUE(FORCE_RUN)) {
  pathway_expr <- readr::read_csv(file.path(CACHE_DIR, "pathway_expression_by_sample.csv"), show_col_types = FALSE)
  pathway_presence <- readr::read_csv(file.path(CACHE_DIR, "pathway_presence_by_sample.csv"), show_col_types = FALSE)
  message("Loaded existing pathway expression and presence data from: ", file.path(CACHE_DIR, "pathway_expression_by_sample.csv"))
} else {
  res <- compute_pathway_expression_coverage_on_demand(datastore_index, ko_path_map, ko_sets = ko_sets)
  pathway_expr     <- res$pathway_expr
  pathway_presence <- res$pathway_presence

  write_csv(pathway_expr,     file.path(CACHE_DIR, "pathway_expression_by_sample.csv"))
  write_csv(pathway_presence, file.path(CACHE_DIR, "pathway_presence_by_sample.csv"))

}

# Quick peek
print(head(pathway_expr, 10))
print(head(pathway_presence, 10))

Loaded existing pathway expression and presence data from: ../cached_data/pathway_expression_by_sample.csv

# A tibble: 10 × 4
   biosample_id         pathway_id pathway_name                         expr_sum
   <chr>                <chr>      <chr>                                   <dbl>
 1 nmdc:bsm-11-042nd237 map00010   Glycolysis / Gluconeogenesis [PATH:…   327421
 2 nmdc:bsm-11-042nd237 map00020   Citrate cycle (TCA cycle) [PATH:ko0…   245707
 3 nmdc:bsm-11-042nd237 map00030   Pentose phosphate pathway [PATH:ko0…   181284
 4 nmdc:bsm-11-042nd237 map00040   Pentose and glucuronate interconver…    52294
 5 nmdc:bsm-11-042nd237 map00051   Fructose and mannose metabolism [PA…    84861
 6 nmdc:bsm-11-042nd237 map00052   Galactose metabolism [PATH:ko00052]     47456
 7 nmdc:bsm-11-042nd237 map00053   Ascorbate and aldarate metabolism […    57276
 8 nmdc:bsm-11-042nd237 map00061   Fatty acid biosynthesis [PATH:ko000…   150937
 9 nmdc:bsm-11-042nd237 map00062   Fatty acid elongation [PATH:ko00062]     1820
10 nmdc:bsm-11-042nd237 map00071   Fatty acid degradation [PATH:ko0007…   157872

# A tibble: 10 × 6
   biosample_id     pathway_id pathway_name n_metag_kos n_metat_kos n_shared_kos
   <chr>            <chr>      <chr>              <dbl>       <dbl>        <dbl>
 1 nmdc:bsm-11-042… map00010   Glycolysis …          80          79           75
 2 nmdc:bsm-11-042… map00020   Citrate cyc…          54          55           53
 3 nmdc:bsm-11-042… map00030   Pentose pho…          62          60           58
 4 nmdc:bsm-11-042… map00040   Pentose and…          73          65           64
 5 nmdc:bsm-11-042… map00051   Fructose an…          85          76           74
 6 nmdc:bsm-11-042… map00052   Galactose m…          42          40           33
 7 nmdc:bsm-11-042… map00053   Ascorbate a…          40          40           37
 8 nmdc:bsm-11-042… map00061   Fatty acid …          33          32           32
 9 nmdc:bsm-11-042… map00062   Fatty acid …           9          15            9
10 nmdc:bsm-11-042… map00071   Fatty acid …          35          42           33

# Merge metadata
f_expr     <- file.path(CACHE_DIR, "pathway_expression_by_sample.csv")
f_pres     <- file.path(CACHE_DIR, "pathway_presence_by_sample.csv")
f_out      <- file.path(CACHE_DIR, "pathway_coverage_by_ecosystem.csv")

# --- Load pathway_expr / pathway_presence (prefer files if present) ---
if (file.exists(f_expr)) {
  pathway_expr <- readr::read_csv(f_expr, show_col_types = FALSE)
} else if (!exists("pathway_expr")) {
  stop("pathway_expression_by_sample.csv not found and object `pathway_expr` not in memory.")
}

if (file.exists(f_pres)) {
  pathway_presence <- readr::read_csv(f_pres, show_col_types = FALSE)
} else if (!exists("pathway_presence")) {
  stop("pathway_presence_by_sample.csv not found and object `pathway_presence` not in memory.")
}

# --- Merge metadata (needs biosample_metadata with env_local_scale) ---
stopifnot(exists("biosample_metadata"))
stopifnot(all(c("biosample_id", "env_local_scale") %in% names(biosample_metadata)))

pathway_presence <- pathway_presence %>%
  left_join(
    biosample_metadata %>% select(biosample_id, env_local_scale),
    by = "biosample_id"
  )

# --- Compute modality-specific counts: MetaG-only & MetaT-only (clip at 0) ---
# n_metag_only = n_metag_kos - n_shared_kos; n_metat_only = n_metat_kos - n_shared_kos
pathway_presence <- pathway_presence %>%
  mutate(
    n_metag_only = pmax(0L, coalesce(n_metag_kos, 0L) - coalesce(n_shared_kos, 0L)),
    n_metat_only = pmax(0L, coalesce(n_metat_kos, 0L) - coalesce(n_shared_kos, 0L))
  )

# --- Aggregate by ecosystem × pathway (mean) ---
eco_pathway <- pathway_presence %>%
  group_by(env_local_scale, pathway_name) %>%
  summarise(
    n_metag_only = mean(n_metag_only, na.rm = TRUE),
    n_metat_only = mean(n_metat_only, na.rm = TRUE),
    n_shared_kos = mean(n_shared_kos, na.rm = TRUE),
    .groups = "drop"
  )

# --- Save & preview ---
readr::write_csv(eco_pathway, f_out)
print(head(eco_pathway))

# A tibble: 6 × 5
  env_local_scale            pathway_name n_metag_only n_metat_only n_shared_kos
  <chr>                      <chr>               <dbl>        <dbl>        <dbl>
1 mineral horizon [ENVO:036… ABC transpo…        82.8         9.55        253.  
2 mineral horizon [ENVO:036… AGE-RAGE si…         1.18        8.27          9.55
3 mineral horizon [ENVO:036… AMPK signal…         2.36       13.3          22.1 
4 mineral horizon [ENVO:036… ATP-depende…         6.82       11.5          17.4 
5 mineral horizon [ENVO:036… Acarbose an…         5.55        0.455         5.36
6 mineral horizon [ENVO:036… Acute myelo…         0           4.91          3.55

## Now visualize pathway coverage across ecosystems by category

# -------- 1) Long format (mimics DataFrame.melt) --------
dfm <- eco_pathway %>%
  pivot_longer(
    cols = c(n_metag_only, n_metat_only, n_shared_kos),
    names_to = "category",
    values_to = "mean_count"
  )

# -------- 2) Top 20 pathways by total mean_count across ecosystems --------
top_paths <- dfm %>%
  group_by(pathway_name) %>%
  summarise(total = sum(mean_count, na.rm = TRUE), .groups = "drop") %>%
  arrange(desc(total)) %>%
  slice_head(n = 20) %>%
  pull(pathway_name)

dfm_top <- dfm %>%
  filter(pathway_name %in% top_paths) %>%
  mutate(
    # order pathways by the global total so facets share the same x-order
    pathway_name = factor(pathway_name, levels = top_paths),
    # match the color mapping from your Python code
    category = factor(
      category,
      levels = c("n_metag_only", "n_metat_only", "n_shared_kos"),
      labels = c("MetaG only", "MetaT only", "Shared KO")
    )
  )

# -------- 3) Colors (blue, orange, green) --------
cat_colors <- c("MetaG only" = "blue", "MetaT only" = "orange", "Shared KO" = "green")

# -------- 4) Plot (stacked bars, faceted by ecosystem) --------
p <- ggplot(dfm_top, aes(x = pathway_name, y = mean_count, fill = category)) +
  geom_col(position = "stack") +
  facet_wrap(~ env_local_scale, nrow = 1, scales = "free_y") +
  scale_fill_manual(values = cat_colors, name = "Category") +
  labs(
    title = "Average KO counts per pathway by ecosystem (MetaG-only, MetaT-only, Shared)",
    x = "Pathway",
    y = "Average KO count"
  ) +
  theme_bw(base_size = 12) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 9),
    plot.title = element_text(hjust = 0.5),
    panel.spacing = unit(0.8, "lines")
  )

# -------- 5) Display --------
# Adjust plot size for better visibility in R notebooks
options(repr.plot.width = 16, repr.plot.height = 8)

print(p)

cat_labels <- c(
  "n_shared_kos" = "Shared KO counts",
  "n_metag_only" = "MetaG-only KO counts",
  "n_metat_only" = "MetaT-only KO counts"
)

for (cat in names(cat_labels)) {
  label <- cat_labels[[cat]]

  # 1) log1p transform of the chosen category
  df_log <- eco_pathway %>%
    mutate(log1p_cat = log1p(.data[[cat]])) %>%
    select(pathway_name, env_local_scale, log1p_cat)

  # 2) Make a wide matrix to compute totals and select top 30 pathways
  mat_wide <- df_log %>%
    tidyr::pivot_wider(
      names_from = env_local_scale,
      values_from = log1p_cat,
      values_fill = 0
    )

  # row sums across ecosystems -> order by total
  mat_ranked <- mat_wide %>%
    mutate(row_sum = rowSums(across(-pathway_name), na.rm = TRUE)) %>%
    arrange(desc(row_sum))

  top_paths <- head(mat_ranked$pathway_name, 30)

  # 3) Back to long for ggplot; order pathways by the computed ranking
  df_top <- df_log %>%
    filter(pathway_name %in% top_paths) %>%
    mutate(
      # reverse so the largest is at the top of the heatmap
      pathway_name = factor(pathway_name, levels = rev(top_paths))
    )

  if (nrow(df_top) == 0) {
    message("No data to plot for category: ", cat)
    next
  }

  # 4) Heatmap
  p <- ggplot(df_top, aes(x = env_local_scale, y = pathway_name, fill = log1p_cat)) +
    geom_tile() +
    labs(
      title = paste("Pathway", label, "by ecosystem"),
      x = "Ecosystem",
      y = "Pathway",
      fill = "log1p Avg KO count"
    ) +
    # Use a perceptually uniform scale if available; falls back to default if not
    suppressWarnings(scale_fill_viridis_c()) +
    theme_minimal(base_size = 12) +
    theme(
      axis.text.x = element_text(size = 14),
      plot.title  = element_text(hjust = 0.5)
    )
  # Show in-session
  print(p)
}

# Prefer existing CSV if it exists
f_expr <- file.path(CACHE_DIR, "pathway_expression_by_sample.csv")
if (file.exists(f_expr)) {
  pathway_expr <- readr::read_csv(f_expr, show_col_types = FALSE)
} else if (!exists("pathway_expr")) {
  stop("pathway_expression_by_sample.csv not found and object `pathway_expr` not in memory.")
}

# Check metadata
stopifnot(exists("biosample_metadata"))
stopifnot(all(c("biosample_id", "env_local_scale") %in% names(biosample_metadata)))

#$ Merge metadata -
pathway_expr_meta <- pathway_expr %>%
  left_join(
    biosample_metadata %>% select(biosample_id, env_local_scale),
    by = "biosample_id"
  )

#  Aggregate by ecosystem × pathway
eco_pathway_expr <- pathway_expr_meta %>%
  group_by(env_local_scale, pathway_name, pathway_id) %>%
  summarise(expr_sum = mean(expr_sum, na.rm = TRUE), .groups = "drop")

# ---- Nothing to plot? ----
if (!exists("pathway_expr") || nrow(pathway_expr) == 0) {
  message("No pathway_expr data available; nothing to plot.")
} else {
  # 1) log1p transform
  mat <- eco_pathway_expr %>%
    mutate(log_expr = log1p(as.numeric(expr_sum))) %>%
    select(pathway_name, env_local_scale, log_expr)

  # 2) Pivot to wide to compute totals and rank pathways
  heat_wide <- mat %>%
    tidyr::pivot_wider(
    names_from  = env_local_scale,
    values_from = log_expr,
    values_fn   = list(log_expr = ~ mean(.x, na.rm = TRUE)),   # <-- aggregator
    values_fill = list(log_expr = 0)                            # <-- named list
  )


  # Sum across ecosystems; order desc; take top 100
  heat_ranked <- heat_wide %>%
    mutate(total = rowSums(across(-pathway_name), na.rm = TRUE)) %>%
    arrange(desc(total))

  top_paths <- head(heat_ranked$pathway_name, 100)

  # 3) Back to long for ggplot; order pathways so biggest totals at top
  heat_long <- mat %>%
    filter(pathway_name %in% top_paths) %>%
    mutate(
      # keep the ranked order; reverse for top at the top of the plot
      pathway_name = factor(pathway_name, levels = rev(top_paths))
    )

  # 4) Heatmap
  p <- ggplot(heat_long, aes(x = env_local_scale, y = pathway_name, fill = log_expr)) +
    geom_tile() +
    labs(
      title = "KEGG pathway expression (MetaT) across Ecosystems (top 100 pathways by total expression)",
      x = "Ecosystem",
      y = "Pathway",
      fill = "log1p(expr_sum)"
    ) +
    suppressWarnings(scale_fill_viridis_c()) +
    theme_minimal(base_size = 12) +
    theme(
      axis.text.x = element_text(size = 14),
      plot.title  = element_text(hjust = 0.5)
    )
    # Adjust plot size for better visibility in R notebooks
  options(repr.plot.width = 16, repr.plot.height = 12)
  print(p)
}

# ---- Function: Kruskal–Wallis per pathway ----
kruskal_wallis_across_samples <- function(pathway_expr) {
  if (is.null(pathway_expr) || nrow(pathway_expr) == 0) {
    return(tibble(
      pathway_id = character(),
      pathway_name = character(),
      p_value = numeric(),
      q_value = numeric(),
      n_samples = integer()
    ))
  }

  # Group by pathway and test across biosamples
  results <- pathway_expr %>%
    group_by(pathway_id, pathway_name) %>%
    summarise(
      n_samples = n_distinct(biosample_id[!is.na(expr_sum)]),
      p_value = {
        if (n_samples >= 2) {
          # Kruskal–Wallis across biosamples for this pathway
          tryCatch(stats::kruskal.test(expr_sum ~ biosample_id, data = cur_data())$p.value,
                   error = function(e) NA_real_)
        } else {
          NA_real_
        }
      },
      .groups = "drop"
    ) %>%
    arrange(p_value)

  # Multiple testing correction (similar to BH)
  if (sum(!is.na(results$p_value)) > 0) {
    results <- results %>%
      arrange(p_value) %>%
      mutate(
        rank = rank(p_value, ties.method = "first"),
        m    = sum(!is.na(p_value)),
        q_value = pmin(1, p_value * m / rank)
      ) %>%
      select(-rank, -m)
  } else {
    results$q_value <- NA_real_
  }

  results %>%
    arrange(q_value)
}

# ---- Run and save ----
kw_df <- kruskal_wallis_across_samples(pathway_expr)

# Preview
print(head(kw_df, 20))

Warning message:
“There was 1 warning in `summarise()`.
ℹ In argument: `p_value = { ... }`.
ℹ In group 1: `pathway_id = "map00010"` `pathway_name = "Glycolysis /
  Gluconeogenesis [PATH:ko00010]"`.
Caused by warning:
! `cur_data()` was deprecated in dplyr 1.1.0.
ℹ Please use `pick()` instead.”

# A tibble: 20 × 5
   pathway_id pathway_name                             n_samples p_value q_value
   <chr>      <chr>                                        <int>   <dbl>   <dbl>
 1 map04913   Ovarian steroidogenesis [PATH:ko04913]          24   0.461   0.461
 2 map00563   Glycosylphosphatidylinositol (GPI)-anch…        24   0.461   0.462
 3 map00535   Proteoglycans [BR:ko00535]                      24   0.461   0.462
 4 map99983   Lipid metabolism                                24   0.461   0.463
 5 map05340   Primary immunodeficiency [PATH:ko05340]         24   0.461   0.464
 6 map05221   Acute myeloid leukemia [PATH:ko05221]           24   0.461   0.465
 7 map05217   Basal cell carcinoma [PATH:ko05217]             24   0.461   0.466
 8 map05150   Staphylococcus aureus infection [PATH:k…        24   0.461   0.467
 9 map04977   Vitamin digestion and absorption [PATH:…        24   0.461   0.468
10 map04933   AGE-RAGE signaling pathway in diabetic …        24   0.461   0.469
11 map04744   Phototransduction [PATH:ko04744]                24   0.461   0.469
12 map04740   Olfactory transduction [PATH:ko04740]           24   0.461   0.470
13 map04713   Circadian entrainment [PATH:ko04713]            24   0.461   0.471
14 map04711   Circadian rhythm - fly [PATH:ko04711]           24   0.461   0.472
15 map04624   Toll and Imd signaling pathway [PATH:ko…        24   0.461   0.473
16 map04550   Signaling pathways regulating pluripote…        24   0.461   0.474
17 map04515   Cell adhesion molecules [BR:ko04515]            24   0.461   0.475
18 map04512   ECM-receptor interaction [PATH:ko04512]         24   0.461   0.476
19 map04392   Hippo signaling pathway - multiple spec…        24   0.461   0.477
20 map04215   Apoptosis - multiple species [PATH:ko04…        24   0.461   0.478

if (nrow(pathway_expr) > 0) {
  # ensure numeric
  pathway_expr <- mutate(pathway_expr, expr_sum = as.numeric(expr_sum))

  # ---- Rank pathways by total expression; keep top 20 ----
  ranks <- pathway_expr %>%
    group_by(pathway_id, pathway_name) %>%
    summarise(total_expr = sum(expr_sum, na.rm = TRUE), .groups = "drop") %>%
    arrange(desc(total_expr)) %>%
    slice_head(n = 20)

  top_ids <- ranks$pathway_id

  sub <- pathway_expr %>%
    filter(pathway_id %in% top_ids) %>%
    # order x-axis by the ranking
    mutate(
      pathway_name = factor(
        pathway_name,
        levels = ranks$pathway_name  # already ordered by total_expr desc
      )
    )

  # ---- Violin + box + points (like px.violin(..., box=True, points="all")) ----
  p <- ggplot(sub, aes(x = pathway_name, y = expr_sum)) +
    geom_violin(trim = FALSE) +
    geom_boxplot(width = 0.15, outlier.shape = NA) +
    geom_jitter(width = 0.15, alpha = 0.5, size = 0.8) +
    labs(
      title = "Expression distributions for top 20 pathways (MetaT)",
      x = "Pathway",
      y = "Expression (expr_sum)"
    ) +
    theme_bw(base_size = 12) +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1),
      plot.title  = element_text(hjust = 0.5)
    )

  print(p)
}

# ============================================================
# 1) Parse lineage table from a lineage URL
#    (on-demand using load_lineage_table)
# ============================================================
parse_lineage_from_url <- function(lineage_url) {
  df_lin <- load_lineage_table(lineage_url)  # <- your cached/retry loader
  rank_cols <- c("kingdom","phylum","class","order","family","genus","species")

  if (is.null(df_lin) || nrow(df_lin) == 0) {
    return(tibble(
      gene_id = character(), lineage = character(),
      !!!setNames(rep(list(character()), 7), rank_cols)
    ))
  }

  if (!all(c("gene_id","lineage") %in% names(df_lin))) {
    stop("Lineage file missing gene_id or lineage column.")
  }

  # Split lineage into up to 7 levels (semicolon or pipe)
  split_list <- str_split(df_lin$lineage, pattern = "[;|]\\s*")
  padded <- lapply(split_list, function(parts) {
    parts <- parts[!is.na(parts) & nzchar(parts)]
    out <- rep(NA_character_, 7)
    n <- min(length(parts), 7L)
    if (n > 0) out[seq_len(n)] <- trimws(parts[seq_len(n)])
    out
  })

  rank_df <- as_tibble(do.call(rbind, padded), .name_repair = "minimal")
  names(rank_df) <- rank_cols

  tibble(
    gene_id = as.character(df_lin$gene_id),
    lineage = df_lin$lineage
  ) %>%
    bind_cols(rank_df)
}

# Parse one lineage table already loaded
.parse_lineage_df <- function(df_lin) {
  rank_cols <- c("kingdom","phylum","class","order","family","genus","species")
  if (is.null(df_lin) || nrow(df_lin) == 0) {
    return(tibble(gene_id = character(), lineage = character(),
                  !!!setNames(rep(list(character()), 7), rank_cols)))
  }
  if (!all(c("gene_id","lineage") %in% names(df_lin))) {
    stop("Lineage file missing gene_id or lineage column.")
  }

  split_list <- stringr::str_split(df_lin$lineage, pattern = "[;|]\\s*")
  padded <- lapply(split_list, function(parts) {
    parts <- parts[!is.na(parts) & nzchar(parts)]
    out <- rep(NA_character_, 7)
    n <- min(length(parts), 7L)
    if (n > 0) out[seq_len(n)] <- trimws(parts[seq_len(n)])
    out
  })
  rank_df <- as_tibble(do.call(rbind, padded), .name_repair = "minimal")
  names(rank_df) <- rank_cols

  tibble(
    gene_id = as.character(df_lin$gene_id),
    lineage = df_lin$lineage
  ) %>% bind_cols(rank_df)
}

# 1) Build lineage cache ONCE per biosample (no rank dependency)
# datastore_index must have: biosample_id, metag_lineage_url, metat_lineage_url
build_lineage_cache <- function(datastore_index) {
  stopifnot(all(c("biosample_id","metag_lineage_url","metat_lineage_url") %in% names(datastore_index)))

  cache <- vector("list", nrow(datastore_index))
  for (i in seq_len(nrow(datastore_index))) {
    bsid <- as.character(datastore_index$biosample_id[i])
    mg   <- tryCatch(load_lineage_table(datastore_index$metag_lineage_url[i]), error = function(e) NULL)
    mt   <- tryCatch(load_lineage_table(datastore_index$metat_lineage_url[i]), error = function(e) NULL)
    cache[[i]] <- list(
      metag = .parse_lineage_df(mg),
      metat = .parse_lineage_df(mt)
    )
    names(cache)[i] <- bsid
  }
  cache
}

# 2) Main function: taxonomy vs expression (for ONE rank) + overlap (for MANY ranks)
#    - Will not reload lineage if you pass lineage_cache from a previous call.
taxonomy_vs_expression_on_demand <- function(datastore_index,
                                            rank = "genus",
                                            ranks_for_overlap = c("kingdom","phylum","class","order","family","genus","species"),
                                            lineage_cache = NULL,
                                            save_outputs = TRUE) {
  stopifnot(all(c("biosample_id","metag_lineage_url","metat_lineage_url","metat_expr_url") %in% names(datastore_index)))

  # Build lineage cache once if not supplied
  if (is.null(lineage_cache)) {
    lineage_cache <- build_lineage_cache(datastore_index)
  }

  mg_rows <- list()
  mt_rows <- list()
  st_rows <- list()
  overlap_rows <- list()

  for (i in seq_len(nrow(datastore_index))) {
    bsid <- as.character(datastore_index$biosample_id[i])
    mt_expr_url <- datastore_index$metat_expr_url[i]

    mg_lin <- lineage_cache[[bsid]]$metag
    mt_lin <- lineage_cache[[bsid]]$metat

    # ---- (A) Rank-specific: MetaG counts per taxon ----
    if (rank %in% names(mg_lin) && nrow(mg_lin) > 0) {
      mg_counts <- mg_lin %>%
        filter(!is.na(.data[[rank]]), nzchar(.data[[rank]])) %>%
        group_by(.data[[rank]]) %>%
        summarise(count = n_distinct(gene_id), .groups = "drop") %>%
        transmute(biosample_id = bsid, taxon = .data[[rank]], count = as.integer(count))
      if (nrow(mg_counts)) mg_rows[[length(mg_rows) + 1L]] <- mg_counts
    }

    # ---- (B) Rank-specific: MetaT expression per taxon ----
    expr <- tryCatch(parse_expression_from_url(mt_expr_url), error = function(e) NULL)
    if (!is.null(expr) && (rank %in% names(mt_lin)) && nrow(mt_lin) > 0 && nrow(expr) > 0) {
      mt_join <- mt_lin %>% left_join(expr, by = "gene_id")
      mt_sum <- mt_join %>%
        filter(!is.na(.data[[rank]]), nzchar(.data[[rank]])) %>%
        group_by(.data[[rank]]) %>%
        summarise(expr_sum = sum(expr_value, na.rm = TRUE), .groups = "drop") %>%
        transmute(biosample_id = bsid, taxon = .data[[rank]], expr_sum = as.numeric(expr_sum))
      if (nrow(mt_sum)) mt_rows[[length(mt_rows) + 1L]] <- mt_sum
    }

    # ---- (C) Rank-specific: Spearman (counts vs expression) ----
    if (!is.null(expr) && (rank %in% names(mg_lin)) && (rank %in% names(mt_lin)) &&
        nrow(mg_lin) > 0 && nrow(mt_lin) > 0 && nrow(expr) > 0) {

      mgc <- mg_lin %>%
        filter(!is.na(.data[[rank]]), nzchar(.data[[rank]])) %>%
        group_by(.data[[rank]]) %>%
        summarise(count = n_distinct(gene_id), .groups = "drop")

      mts <- mt_lin %>% left_join(expr, by = "gene_id") %>%
        filter(!is.na(.data[[rank]]), nzchar(.data[[rank]])) %>%
        group_by(.data[[rank]]) %>%
        summarise(expr_sum = sum(expr_value, na.rm = TRUE), .groups = "drop")

      both <- mgc %>% inner_join(mts, by = setNames(rank, rank)) %>%
        rename(taxon = !!rank)

      if (nrow(both) >= 3) {
        ct <- tryCatch(cor.test(both$count, both$expr_sum, method = "spearman"),
                       error = function(e) NULL)
        if (!is.null(ct)) {
          st_rows[[length(st_rows) + 1L]] <- tibble(biosample_id = bsid,
                                                    spearman_r = unname(ct$estimate),
                                                    p_value    = ct$p.value,
                                                    n_taxa     = nrow(both))
        } else {
          st_rows[[length(st_rows) + 1L]] <- tibble(biosample_id = bsid,
                                                    spearman_r = NA_real_,
                                                    p_value    = NA_real_,
                                                    n_taxa     = nrow(both))
        }
      } else {
        st_rows[[length(st_rows) + 1L]] <- tibble(biosample_id = bsid,
                                                  spearman_r = NA_real_,
                                                  p_value    = NA_real_,
                                                  n_taxa     = nrow(both))
      }
    }

    # ---- (D) Overlap across MANY ranks (no extra loads) ----
    if (nrow(mg_lin) > 0 || nrow(mt_lin) > 0) {
      for (rk in ranks_for_overlap) {
        if (!(rk %in% names(mg_lin)) || !(rk %in% names(mt_lin))) next

        mg_set <- unique(na.omit(mg_lin[[rk]]))
        mt_set <- unique(na.omit(mt_lin[[rk]]))
        shared <- intersect(mg_set, mt_set)
        uni    <- union(mg_set, mt_set)
        jacc   <- if (length(uni) > 0) length(shared) / length(uni) else NA_real_

        overlap_rows[[length(overlap_rows) + 1L]] <- tibble(
          biosample_id  = bsid,
          rank          = rk,
          n_metag_taxa  = length(mg_set),
          n_metat_taxa  = length(mt_set),
          n_shared_taxa = length(shared),
          n_metag_only  = length(setdiff(mg_set, mt_set)),
          n_metat_only  = length(setdiff(mt_set, mg_set)),
          jaccard       = jacc
        )
      }
    }
  }
  metaG_counts <- if (length(mg_rows)) bind_rows(mg_rows) else
    tibble(biosample_id = character(), taxon = character(), count = integer())
  metaT_expr   <- if (length(mt_rows)) bind_rows(mt_rows) else
    tibble(biosample_id = character(), taxon = character(), expr_sum = numeric())
  spearman     <- if (length(st_rows)) bind_rows(st_rows) else
    tibble(biosample_id = character(), spearman_r = numeric(), p_value = numeric(), n_taxa = integer())
  tax_overlap_df <- if (length(overlap_rows)) bind_rows(overlap_rows) else
    tibble(biosample_id  = character(), rank = character(),
           n_metag_taxa  = integer(), n_metat_taxa = integer(),
           n_shared_taxa = integer(), n_metag_only = integer(), n_metat_only = integer(),
           jaccard       = numeric())

  # Save (optional)
  if (isTRUE(save_outputs)) {
    readr::write_csv(metaG_counts,  file.path(CACHE_DIR, paste0("taxonomy_metaG_counts_", rank, ".csv")))
    readr::write_csv(metaT_expr,    file.path(CACHE_DIR, paste0("taxonomy_metaT_expr_", rank, ".csv")))
    readr::write_csv(spearman,      file.path(CACHE_DIR, paste0("taxonomy_spearman_", rank, ".csv")))
    readr::write_csv(tax_overlap_df, file.path(CACHE_DIR, "taxonomy_overlap_summary.csv"))
  }

  list(
    metaG_counts   = metaG_counts,
    metaT_expr     = metaT_expr,
    spearman       = spearman,
    tax_overlap_df = tax_overlap_df,
    lineage_cache  = lineage_cache  # return so you can reuse without reloading
  )
}

if(file.exists(file.path(CACHE_DIR, "taxonomy_metaG_counts_kingdom.csv"))
    && file.exists(file.path(CACHE_DIR, "taxonomy_metaT_expr_kingdom.csv"))
    && file.exists(file.path(CACHE_DIR, "taxonomy_spearman_kingdom.csv"))
    && file.exists(file.path(CACHE_DIR, "taxonomy_overlap_summary.csv"))
    && !isTRUE(FORCE_RUN)) {
  metaG_counts_kingdom <- readr::read_csv(file.path(CACHE_DIR, "taxonomy_metaG_counts_kingdom.csv"), show_col_types = FALSE)
  metaT_expr_kingdom   <- readr::read_csv(file.path(CACHE_DIR, "taxonomy_metaT_expr_kingdom.csv"), show_col_types = FALSE)
  tax_corr_kingdom     <- readr::read_csv(file.path(CACHE_DIR, "taxonomy_spearman_kingdom.csv"), show_col_types = FALSE)
  tax_overlap_df       <- readr::read_csv(file.path(CACHE_DIR, "taxonomy_overlap_summary.csv"), show_col_types = FALSE)
  message("Loaded existing taxonomy data from: ", file.path(CACHE_DIR, "taxonomy_metaG_counts_kingdom.csv"))
} else {
  res <- taxonomy_vs_expression_on_demand(datastore_index, rank = "kingdom", save_outputs = TRUE)
  metaG_counts_kingdom <- res$metaG_counts
  metaT_expr_kingdom   <- res$metaT_expr
  tax_corr_kingdom     <- res$spearman
  tax_overlap_df       <- res$tax_overlap_df
}
head(metaG_counts_kingdom, 10)
head(metaT_expr_kingdom, 10)
head(tax_corr_kingdom, 10)

if(file.exists(file.path(CACHE_DIR, "taxonomy_metaG_counts_phylum.csv"))
    && file.exists(file.path(CACHE_DIR, "taxonomy_metaT_expr_phylum.csv"))
    && file.exists(file.path(CACHE_DIR, "taxonomy_spearman_phylum.csv"))
    && !isTRUE(FORCE_RUN)) {
  metaG_counts_phylum <- readr::read_csv(file.path(CACHE_DIR, "taxonomy_metaG_counts_phylum.csv"), show_col_types = FALSE)
  metaT_expr_phylum   <- readr::read_csv(file.path(CACHE_DIR, "taxonomy_metaT_expr_phylum.csv"), show_col_types = FALSE)
  tax_corr_phylum     <- readr::read_csv(file.path(CACHE_DIR, "taxonomy_spearman_phylum.csv"), show_col_types = FALSE)
  message("Loaded existing taxonomy data from: ", file.path(CACHE_DIR, "taxonomy_metaG_counts_phylum.csv"))
} else {
  res_phylum <- taxonomy_vs_expression_on_demand(
    datastore_index,
    rank = "phylum",
    ranks_for_overlap = c("kingdom","phylum","class","order","family","genus","species"),
  lineage_cache = res_kingdom$lineage_cache,  # <-- reuse
  save_outputs = TRUE
  )
  metaG_counts_phylum <- res_phylum$metaG_counts
  metaT_expr_phylum   <- res_phylum$metaT_expr
  tax_corr_phylum     <- res_phylum$spearman
}
head(metaG_counts_phylum, 10)
head(metaT_expr_phylum, 10)
head(tax_corr_phylum, 10)

Loaded existing taxonomy data from: ../cached_data/taxonomy_metaG_counts_kingdom.csv

Loaded existing taxonomy data from: ../cached_data/taxonomy_metaG_counts_phylum.csv

options(repr.plot.width = 12, repr.plot.height = 8)

f_mg_king  <- file.path(CACHE_DIR, "taxonomy_metaG_counts_kingdom.csv")
f_mt_king  <- file.path(CACHE_DIR, "taxonomy_metaT_expr_kingdom.csv")
f_mg_phyl  <- file.path(CACHE_DIR, "taxonomy_metaG_counts_phylum.csv")
f_mt_phyl  <- file.path(CACHE_DIR, "taxonomy_metaT_expr_phylum.csv")

# ---------- Load inputs if files exist ----------
if (file.exists(f_mg_king))  metaG_counts_kingdom <- readr::read_csv(f_mg_king, show_col_types = FALSE)
if (file.exists(f_mt_king))  metaT_expr_kingdom   <- readr::read_csv(f_mt_king, show_col_types = FALSE)
if (file.exists(f_mg_phyl))  metaG_counts_phylum  <- readr::read_csv(f_mg_phyl, show_col_types = FALSE)
if (file.exists(f_mt_phyl))  metaT_expr_phylum    <- readr::read_csv(f_mt_phyl, show_col_types = FALSE)


#-----------------------------#
# Helper: bi-directional plot #
#-----------------------------#
# df must have: env_local_scale, taxon, <value_col>
bidir_barplot_gg <- function(df, value_col, title, top_n = 20,
                             png_file = NULL, pdf_file = NULL) {

  stopifnot(all(c("env_local_scale", "taxon", value_col) %in% names(df)))

  # Top taxa by global SUM (before any transforms)
  top_taxa <- df %>%
    group_by(taxon) %>%
    summarise(total = sum(.data[[value_col]], na.rm = TRUE), .groups = "drop") %>%
    arrange(desc(total)) %>%
    slice_head(n = top_n) %>%
    pull(taxon)

  sub <- df %>%
    filter(taxon %in% top_taxa) %>%
    mutate(taxon = factor(taxon, levels = rev(top_taxa)))  # show top at top after coord_flip

  ecos <- unique(na.omit(sub$env_local_scale))
  if (length(ecos) != 2) {
    message("[INFO] Found ", length(ecos), " ecosystems (need 2 for bi-directional). Making grouped bars.")

    p <- sub %>%
      mutate(value = .data[[value_col]]) %>%
      ggplot(aes(x = taxon, y = value, fill = env_local_scale)) +
      geom_col(position = position_dodge(width = 0.8)) +
      coord_flip() +
      labs(title = title, x = "Taxon", y = value_col, fill = "Ecosystem") +
      theme_bw(base_size = 12) +
      theme(axis.text.x = element_text(size = 9),
            plot.title = element_text(hjust = 0.5))
  } else {
    eco1 <- ecos[1]; eco2 <- ecos[2]

    # Mirror one ecosystem negative; then log1p with sign
    sub2 <- sub %>%
      mutate(value_raw = .data[[value_col]],
             value_signed = if_else(env_local_scale == eco2, -value_raw, value_raw),
             value_log1p  = sign(value_signed) * log1p(abs(value_signed)))

    # Positive values stack above 0, negative below 0 (relative-like)
    p <- ggplot(sub2, aes(x = taxon, y = value_log1p, fill = env_local_scale)) +
      geom_col(position = "stack", width = 0.85) +
      coord_flip() +
      scale_y_continuous(labels = function(x) abs(x)) +
      labs(
        title = title,
        x = "Taxon",
        y = paste0("log1p(", value_col, ")"),
        fill = "Ecosystem"
      ) +
      theme_bw(base_size = 12) +
      theme(
        axis.text.x = element_text(size = 9),
        plot.title  = element_text(hjust = 0.5),
        panel.spacing = unit(0.8, "lines")
      )
  }

  # Save if paths provided
  if (!is.null(png_file)) ggsave(file.path(CACHE_DIR, png_file), plot = p, width = 10, height = 6, dpi = 300)
  if (!is.null(pdf_file)) ggsave(file.path(CACHE_DIR, pdf_file), plot = p, width = 10, height = 6)

  return(p)
}

#-------------------------------------------#
# KINGDOM level — MetaG counts (mean by eco) #
#-------------------------------------------#
metaG_counts_meta <- metaG_counts_kingdom %>%
  left_join(biosample_metadata %>% select(biosample_id, env_local_scale), by = "biosample_id")

agg_mg_kingdom <- metaG_counts_meta %>%
  group_by(env_local_scale, taxon) %>%
  summarise(count = mean(count, na.rm = TRUE), .groups = "drop")

p_mg_kingdom <- bidir_barplot_gg(
  agg_mg_kingdom, value_col = "count",
  title = "MetaG taxonomy counts (log1p, bi-directional by ecosystem)",
  top_n = 20,
  png_file = "bidir_barchart_metaG_kingdom_counts.png",
  pdf_file = "bidir_barchart_metaG_kingdom_counts.pdf"
)
print(p_mg_kingdom)

#---------------------------------------------#
# KINGDOM level — MetaT expression (mean eco) #
#---------------------------------------------#
metaT_expr_meta <- metaT_expr_kingdom %>%
  left_join(biosample_metadata %>% select(biosample_id, env_local_scale), by = "biosample_id")

agg_mt_kingdom <- metaT_expr_meta %>%
  group_by(env_local_scale, taxon) %>%
  summarise(expr_sum = mean(expr_sum, na.rm = TRUE), .groups = "drop")

p_mt_kingdom <- bidir_barplot_gg(
  agg_mt_kingdom, value_col = "expr_sum",
  title = "MetaT taxonomy expression (log1p, bi-directional by ecosystem)",
  top_n = 20,
  png_file = "bidir_barchart_metaT_kingdom_expression.png",
  pdf_file = "bidir_barchart_metaT_kingdom_expression.pdf"
)
print(p_mt_kingdom)

#------------------------------------------#
# PHYLUM level — MetaG counts (mean by eco)#
#------------------------------------------#
metaG_counts_meta_ph <- metaG_counts_phylum %>%
  left_join(biosample_metadata %>% select(biosample_id, env_local_scale), by = "biosample_id")

agg_mg_phylum <- metaG_counts_meta_ph %>%
  group_by(env_local_scale, taxon) %>%
  summarise(count = mean(count, na.rm = TRUE), .groups = "drop")

p_mg_phylum <- bidir_barplot_gg(
  agg_mg_phylum, value_col = "count",
  title = "MetaG taxonomy counts (log1p, bi-directional by ecosystem, phylum)",
  top_n = 50,
  png_file = "bidir_barchart_metaG_phylum_counts.png",
  pdf_file = "bidir_barchart_metaG_phylum_counts.pdf"
)
print(p_mg_phylum)

#--------------------------------------------#
# PHYLUM level — MetaT expression (mean eco) #
#--------------------------------------------#
metaT_expr_meta_ph <- metaT_expr_phylum %>%
  left_join(biosample_metadata %>% select(biosample_id, env_local_scale), by = "biosample_id")

agg_mt_phylum <- metaT_expr_meta_ph %>%
  group_by(env_local_scale, taxon) %>%
  summarise(expr_sum = mean(expr_sum, na.rm = TRUE), .groups = "drop")

p_mt_phylum <- bidir_barplot_gg(
  agg_mt_phylum, value_col = "expr_sum",
  title = "MetaT taxonomy expression (log1p, bi-directional by ecosystem, phylum)",
  top_n = 50,
  png_file = "bidir_barchart_metaT_phylum_expression.png",
  pdf_file = "bidir_barchart_metaT_phylum_expression.pdf"
)
print(p_mt_phylum)

in_csv     <- file.path(CACHE_DIR, "taxonomy_overlap_summary.csv")

# ---------- Load tax_overlap_df (prefer file if available) ----------
if (file.exists(in_csv)) {
  tax_overlap_df <- readr::read_csv(in_csv, show_col_types = FALSE)
} else if (!exists("tax_overlap_df")) {
  stop("taxonomy_overlap_summary.csv not found and object `tax_overlap_df` not in memory.")
}

# ---------- Merge metadata ----------
stopifnot(exists("biosample_metadata"))
stopifnot(all(c("biosample_id","env_local_scale") %in% names(biosample_metadata)))

tax_overlap <- tax_overlap_df %>%
  left_join(biosample_metadata %>% select(biosample_id, env_local_scale),
            by = "biosample_id")

# ---------- Aggregate by ecosystem × rank (mean) ----------
agg_overlap <- tax_overlap %>%
  group_by(env_local_scale, rank) %>%
  summarise(
    n_shared_taxa = mean(n_shared_taxa, na.rm = TRUE),
    n_metag_only  = mean(n_metag_only,  na.rm = TRUE),
    n_metat_only  = mean(n_metat_only,  na.rm = TRUE),
    .groups = "drop"
  )

# Rank order for plotting
rank_order <- c("kingdom","phylum","class","order","family","genus","species")

# ---------- Plot per rank (grouped bars) ----------
if (nrow(agg_overlap) > 0) {
  for (r in rank_order) {
    sub <- agg_overlap %>% filter(rank == r)
    if (nrow(sub) == 0) next

    df_long <- sub %>%
      pivot_longer(
        cols = c(n_metag_only, n_metat_only, n_shared_taxa),
        names_to = "category",
        values_to = "mean_count"
      ) %>%
      mutate(
        env_local_scale = factor(env_local_scale),
        category = factor(category,
                          levels = c("n_metag_only", "n_metat_only", "n_shared_taxa"),
                          labels = c("MetaG only", "MetaT only", "Shared taxa"))
      )

    p <- ggplot(df_long, aes(x = env_local_scale, y = mean_count, fill = category)) +
      geom_col(position = position_dodge(width = 0.75)) +
      labs(
        title = paste0("Taxonomy overlap (MetaG vs MetaT) — ", str_to_title(r)),
        x = "Ecosystem",
        y = "Mean count",
        fill = "Category"
      ) +
      scale_fill_manual(values = c("MetaG only" = "blue",
                                   "MetaT only" = "orange",
                                   "Shared taxa" = "green")) +
      theme_bw(base_size = 12) +
      theme(
        axis.text.x = element_text(size=14),
        plot.title = element_text(hjust = 0.5)
      )

    print(p)
  }
} else {
  message("agg_overlap is empty; nothing to plot.")
}

# ---- Load tax_overlap_df if available ----
if (file.exists(in_csv)) {
  tax_overlap_df <- readr::read_csv(in_csv, show_col_types = FALSE)
} else if (!exists("tax_overlap_df")) {
  stop("taxonomy_overlap_summary.csv not found and object `tax_overlap_df` not in memory.")
}

# ---- Merge metadata ----
stopifnot(exists("biosample_metadata"))
stopifnot(all(c("biosample_id","env_local_scale") %in% names(biosample_metadata)))

tax_with_meta <- tax_overlap_df %>%
  left_join(biosample_metadata %>% select(biosample_id, env_local_scale),
            by = "biosample_id")

# ---- Rank order for plotting ----
rank_order <- c("kingdom","phylum","class","order","family","genus","species")

# =========================
# 1) Boxplot of Jaccard by ecosystem (color = rank)
# =========================
if (nrow(tax_with_meta) > 0) {
  p_box <- ggplot(
    tax_with_meta %>% mutate(rank = factor(rank, levels = rank_order)),
    aes(x = env_local_scale, y = jaccard, color = rank)
  ) +
    geom_boxplot(outlier.shape = NA, position = position_dodge2(preserve = "single")) +
    geom_jitter(width = 0.15, alpha = 0.5, size = 1) +
    labs(
      title = "Taxonomy Jaccard index by ecosystem and rank",
      x = "Ecosystem",
      y = "Jaccard index",
      color = "Rank"
    ) +
    theme_bw(base_size = 12) +
    theme(plot.title = element_text(hjust = 0.5))
  print(p_box)
} else {
  message("tax_with_meta is empty; skipping boxplot.")
}

# =========================
# 2) Heatmap of Jaccard (rows = rank, columns = biosample)
# =========================
if (nrow(tax_overlap_df) > 0) {
  # Keep only ranks that appear, in the requested order
  present_ranks <- rank_order[rank_order %in% unique(tax_overlap_df$rank)]

  heat_long <- tax_overlap_df %>%
    filter(rank %in% present_ranks) %>%
    mutate(rank = factor(rank, levels = present_ranks)) %>%
    select(rank, biosample_id, jaccard)

  p_heat <- ggplot(heat_long, aes(x = biosample_id, y = rank, fill = jaccard)) +
    geom_tile() +
    labs(
      title = "Taxonomy Jaccard similarity (MetaG vs MetaT)",
      x = "Biosample",
      y = "Rank",
      fill = "Jaccard"
    ) +
    suppressWarnings(scale_fill_viridis_c(na.value = "grey90")) +
    theme_minimal(base_size = 12) +
    theme(
      axis.text.x = element_text(angle = 45, hjust = 1),
      plot.title  = element_text(hjust = 0.5)
    )

  print(p_heat)
} else {
  message("tax_overlap_df is empty; skipping heatmap.")
}

Value	Description
Gene Phylogeny tsv	Tab-delimited file of gene phylogeny
Annotation KEGG Orthology	Tab delimited file for KO annotation
Metatranscriptome Expression	Read count table output

biosample_id	MetaT Gene Phylogeny tsv	MetaG Annotation KEGG Orthology	MetaT Annotation KEGG Orthology	MetaG Gene Phylogeny tsv	Metatranscriptome Expression
<chr>	<chr>	<chr>	<chr>	<chr>	<chr>
nmdc:bsm-11-042nd237	https://data.microbiomedata.org/data/nmdc:dgns-11-2rrxzb57/nmdc:wfmtan-11-ryv54x73.1/nmdc_wfmtan-11-ryv54x73.1_gene_phylogeny.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-zayj8368/nmdc:wfmgan-11-fw38xr59.1/nmdc_wfmgan-11-fw38xr59.1_ko.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-2rrxzb57/nmdc:wfmtan-11-ryv54x73.1/nmdc_wfmtan-11-ryv54x73.1_ko.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-zayj8368/nmdc:wfmgan-11-fw38xr59.1/nmdc_wfmgan-11-fw38xr59.1_gene_phylogeny.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-2rrxzb57/nmdc:wfmtex-11-n2w8c804.1/nmdc_wfmtex-11-n2w8c804.1.rnaseq_gea.txt
nmdc:bsm-11-3p2mnz08	https://data.microbiomedata.org/data/nmdc:dgns-11-sfhj5f74/nmdc:wfmtan-11-qn0rff14.1/nmdc_wfmtan-11-qn0rff14.1_gene_phylogeny.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-yxhzrq69/nmdc:wfmgan-11-t597cr38.1/nmdc_wfmgan-11-t597cr38.1_ko.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-sfhj5f74/nmdc:wfmtan-11-qn0rff14.1/nmdc_wfmtan-11-qn0rff14.1_ko.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-yxhzrq69/nmdc:wfmgan-11-t597cr38.1/nmdc_wfmgan-11-t597cr38.1_gene_phylogeny.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-sfhj5f74/nmdc:wfmtex-11-t5fpwf12.1/nmdc_wfmtex-11-t5fpwf12.1.rnaseq_gea.txt
nmdc:bsm-11-622k6044	https://data.microbiomedata.org/data/nmdc:dgns-11-yqheg664/nmdc:wfmtan-11-jbbpdz62.1/nmdc_wfmtan-11-jbbpdz62.1_gene_phylogeny.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-0g2bvk46/nmdc:wfmgan-11-46sdce04.1/nmdc_wfmgan-11-46sdce04.1_ko.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-yqheg664/nmdc:wfmtan-11-jbbpdz62.1/nmdc_wfmtan-11-jbbpdz62.1_ko.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-0g2bvk46/nmdc:wfmgan-11-46sdce04.1/nmdc_wfmgan-11-46sdce04.1_gene_phylogeny.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-yqheg664/nmdc:wfmtex-11-thfqzx80.1/nmdc_wfmtex-11-thfqzx80.1.rnaseq_gea.txt
nmdc:bsm-11-65a4xw75	https://data.microbiomedata.org/data/nmdc:dgns-11-mw1vp557/nmdc:wfmtan-11-qcnn8k31.1/nmdc_wfmtan-11-qcnn8k31.1_gene_phylogeny.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-w6m8tk08/nmdc:wfmgan-11-mmxw6483.1/nmdc_wfmgan-11-mmxw6483.1_ko.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-mw1vp557/nmdc:wfmtan-11-qcnn8k31.1/nmdc_wfmtan-11-qcnn8k31.1_ko.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-w6m8tk08/nmdc:wfmgan-11-mmxw6483.1/nmdc_wfmgan-11-mmxw6483.1_gene_phylogeny.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-mw1vp557/nmdc:wfmtex-11-3h5a0x22.1/nmdc_wfmtex-11-3h5a0x22.1.rnaseq_gea.txt
nmdc:bsm-11-6m6c9g55	https://data.microbiomedata.org/data/nmdc:dgns-11-kvhxfz39/nmdc:wfmtan-11-zr9wpb59.1/nmdc_wfmtan-11-zr9wpb59.1_gene_phylogeny.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-4k8jak62/nmdc:wfmgan-11-e4b9rk86.1/nmdc_wfmgan-11-e4b9rk86.1_ko.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-kvhxfz39/nmdc:wfmtan-11-zr9wpb59.1/nmdc_wfmtan-11-zr9wpb59.1_ko.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-4k8jak62/nmdc:wfmgan-11-e4b9rk86.1/nmdc_wfmgan-11-e4b9rk86.1_gene_phylogeny.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-kvhxfz39/nmdc:wfmtex-11-57x2hq28.1/nmdc_wfmtex-11-57x2hq28.1.rnaseq_gea.txt
nmdc:bsm-11-6x067s27	https://data.microbiomedata.org/data/nmdc:dgns-11-hngnz807/nmdc:wfmtan-11-9jpgyv22.1/nmdc_wfmtan-11-9jpgyv22.1_gene_phylogeny.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-t2ejqy28/nmdc:wfmgan-11-zqn9em82.1/nmdc_wfmgan-11-zqn9em82.1_ko.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-hngnz807/nmdc:wfmtan-11-9jpgyv22.1/nmdc_wfmtan-11-9jpgyv22.1_ko.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-t2ejqy28/nmdc:wfmgan-11-zqn9em82.1/nmdc_wfmgan-11-zqn9em82.1_gene_phylogeny.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-hngnz807/nmdc:wfmtex-11-zx7prv11.1/nmdc_wfmtex-11-zx7prv11.1.rnaseq_gea.txt
nmdc:bsm-11-71991e79	https://data.microbiomedata.org/data/nmdc:dgns-11-5my2te78/nmdc:wfmtan-11-0tmerk58.1/nmdc_wfmtan-11-0tmerk58.1_gene_phylogeny.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-71xjrh06/nmdc:wfmgan-11-8s3ep328.1/nmdc_wfmgan-11-8s3ep328.1_ko.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-5my2te78/nmdc:wfmtan-11-0tmerk58.1/nmdc_wfmtan-11-0tmerk58.1_ko.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-71xjrh06/nmdc:wfmgan-11-8s3ep328.1/nmdc_wfmgan-11-8s3ep328.1_gene_phylogeny.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-5my2te78/nmdc:wfmtex-11-6wtpyv38.1/nmdc_wfmtex-11-6wtpyv38.1.rnaseq_gea.txt
nmdc:bsm-11-7m09me46	https://data.microbiomedata.org/data/nmdc:dgns-11-vfdkag98/nmdc:wfmtan-11-17594f10.1/nmdc_wfmtan-11-17594f10.1_gene_phylogeny.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-hwbrhz12/nmdc:wfmgan-11-1vkd7g49.1/nmdc_wfmgan-11-1vkd7g49.1_ko.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-vfdkag98/nmdc:wfmtan-11-17594f10.1/nmdc_wfmtan-11-17594f10.1_ko.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-hwbrhz12/nmdc:wfmgan-11-1vkd7g49.1/nmdc_wfmgan-11-1vkd7g49.1_gene_phylogeny.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-vfdkag98/nmdc:wfmtex-11-yhpjmk50.1/nmdc_wfmtex-11-yhpjmk50.1.rnaseq_gea.txt
nmdc:bsm-11-7twwzs96	https://data.microbiomedata.org/data/nmdc:dgns-11-kf8gwm78/nmdc:wfmtan-11-xbk8hw28.1/nmdc_wfmtan-11-xbk8hw28.1_gene_phylogeny.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-tsj32b65/nmdc:wfmgan-11-zxmjdh22.1/nmdc_wfmgan-11-zxmjdh22.1_ko.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-kf8gwm78/nmdc:wfmtan-11-xbk8hw28.1/nmdc_wfmtan-11-xbk8hw28.1_ko.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-tsj32b65/nmdc:wfmgan-11-zxmjdh22.1/nmdc_wfmgan-11-zxmjdh22.1_gene_phylogeny.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-kf8gwm78/nmdc:wfmtex-11-tmsdtd95.1/nmdc_wfmtex-11-tmsdtd95.1.rnaseq_gea.txt
nmdc:bsm-11-8rn9bm20	https://data.microbiomedata.org/data/nmdc:dgns-11-hgn6td61/nmdc:wfmtan-11-wh0e3993.1/nmdc_wfmtan-11-wh0e3993.1_gene_phylogeny.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-wj9yhw60/nmdc:wfmgan-11-njdr0e67.1/nmdc_wfmgan-11-njdr0e67.1_ko.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-hgn6td61/nmdc:wfmtan-11-wh0e3993.1/nmdc_wfmtan-11-wh0e3993.1_ko.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-wj9yhw60/nmdc:wfmgan-11-njdr0e67.1/nmdc_wfmgan-11-njdr0e67.1_gene_phylogeny.tsv	https://data.microbiomedata.org/data/nmdc:dgns-11-hgn6td61/nmdc:wfmtex-11-zerk7d61.1/nmdc_wfmtex-11-zerk7d61.1.rnaseq_gea.txt

ko_id	pathway_id	pathway_name
<chr>	<chr>	<chr>
K00844	map00521	Streptomycin biosynthesis [PATH:ko00521]
K00844	map00500	Starch and sucrose metabolism [PATH:ko00500]
K00844	map04066	HIF-1 signaling pathway [PATH:ko04066]
K00844	map04131	Membrane trafficking [BR:ko04131]
K00844	map05131	Shigellosis [PATH:ko05131]
K00844	map00052	Galactose metabolism [PATH:ko00052]
K00844	map04930	Type II diabetes mellitus [PATH:ko04930]
K00844	map05230	Central carbon metabolism in cancer [PATH:ko05230]
K00844	map00051	Fructose and mannose metabolism [PATH:ko00051]
K00844	map00010	Glycolysis / Gluconeogenesis [PATH:ko00010]

biosample_id	taxon	count
<chr>	<chr>	<dbl>
nmdc:bsm-11-042nd237	Archaea	19897
nmdc:bsm-11-042nd237	Bacteria	6277682
nmdc:bsm-11-042nd237	Eukaryota	62269
nmdc:bsm-11-042nd237	Viruses	3131
nmdc:bsm-11-3p2mnz08	Archaea	10813
nmdc:bsm-11-3p2mnz08	Bacteria	5518853
nmdc:bsm-11-3p2mnz08	Eukaryota	54983
nmdc:bsm-11-3p2mnz08	Viruses	2285
nmdc:bsm-11-622k6044	Archaea	29377
nmdc:bsm-11-622k6044	Bacteria	5973947

biosample_id	taxon	expr_sum
<chr>	<chr>	<dbl>
nmdc:bsm-11-042nd237	Archaea	81199
nmdc:bsm-11-042nd237	Bacteria	26756923
nmdc:bsm-11-042nd237	Eukaryota	2891915
nmdc:bsm-11-042nd237	Viruses	355932
nmdc:bsm-11-3p2mnz08	Archaea	41923
nmdc:bsm-11-3p2mnz08	Bacteria	10766751
nmdc:bsm-11-3p2mnz08	Eukaryota	474197
nmdc:bsm-11-3p2mnz08	Viruses	199839
nmdc:bsm-11-622k6044	Archaea	80441
nmdc:bsm-11-622k6044	Bacteria	19398821

Identifying relationships between annotated omics data in NMDC¶

1. Retrieve data from the NMDC database using API endpoints¶

Choose the data you want to explore¶

Get and filter data for Jeff Blanchard’s Harvard forest soil project¶

Select data has both metaT and metaG analysis result¶

Download selected results files¶

2. KEGG Orthology (KO) Analysis¶

Shared vs unique KOs (MetaG vs MetaT)¶

3. KEGG Pathway Analysis¶

Pathway shared KO counts by env_local_scale¶

Pathway MetaG-only KO counts by env_local_scale¶

Pathway MetaT-only KO counts by env_local_scale¶

4. Taxonomy comparison: MetaG lineage vs MetaT lineage + expression¶

Kingdom level¶

Phylum level¶

Taxonomy Overlap Analysis¶

data_object_type	n
<chr>	<int>
Annotation KEGG Orthology	64
Gene Phylogeny tsv	64
Metatranscriptome Expression	39

A tibble: 10 × 4
biosample_id	spearman_r	p_value	n_taxa
<chr>	<dbl>	<dbl>	<dbl>
nmdc:bsm-11-042nd237	0.8	0.3333333	4
nmdc:bsm-11-3p2mnz08	0.8	0.3333333	4
nmdc:bsm-11-622k6044	0.8	0.3333333	4
nmdc:bsm-11-65a4xw75	0.8	0.3333333	4
nmdc:bsm-11-6m6c9g55	0.8	0.3333333	4
nmdc:bsm-11-6x067s27	0.8	0.3333333	4
nmdc:bsm-11-71991e79	0.8	0.3333333	4
nmdc:bsm-11-7m09me46	0.8	0.3333333	4
nmdc:bsm-11-7twwzs96	0.8	0.3333333	4
nmdc:bsm-11-8rn9bm20	0.8	0.3333333	4

A tibble: 10 × 3
biosample_id	taxon	count
<chr>	<chr>	<dbl>
nmdc:bsm-11-3p2mnz08	Acidobacteriota	1095773
nmdc:bsm-11-3p2mnz08	Actinomycetota	1171264
nmdc:bsm-11-3p2mnz08	Annelida	117
nmdc:bsm-11-3p2mnz08	Apicomplexa	148
nmdc:bsm-11-3p2mnz08	Aquificota	1394
nmdc:bsm-11-3p2mnz08	Armatimonadota	8120
nmdc:bsm-11-3p2mnz08	Arthropoda	607
nmdc:bsm-11-3p2mnz08	Artverviricota	5
nmdc:bsm-11-3p2mnz08	Ascomycota	21273
nmdc:bsm-11-3p2mnz08	Atribacterota	537

A tibble: 10 × 3
biosample_id	taxon	expr_sum
<chr>	<chr>	<dbl>
nmdc:bsm-11-042nd237	Acidobacteriota	7300041
nmdc:bsm-11-042nd237	Actinomycetota	2398193
nmdc:bsm-11-042nd237	Annelida	46849
nmdc:bsm-11-042nd237	Apicomplexa	17070
nmdc:bsm-11-042nd237	Aquificota	11600
nmdc:bsm-11-042nd237	Armatimonadota	60237
nmdc:bsm-11-042nd237	Arthropoda	37912
nmdc:bsm-11-042nd237	Ascomycota	133237
nmdc:bsm-11-042nd237	Atribacterota	1987
nmdc:bsm-11-042nd237	Bacillariophyta	2037

A tibble: 10 × 4
biosample_id	spearman_r	p_value	n_taxa
<chr>	<dbl>	<dbl>	<dbl>
nmdc:bsm-11-3p2mnz08	0.6007218	3.174067e-12	111
nmdc:bsm-11-65a4xw75	0.7217448	1.902488e-19	113
nmdc:bsm-11-6m6c9g55	0.6639791	6.055736e-16	115
nmdc:bsm-11-6x067s27	0.7219179	3.893382e-19	111
nmdc:bsm-11-7m09me46	0.6158524	1.024704e-12	109
nmdc:bsm-11-7twwzs96	0.5513729	2.974624e-10	112
nmdc:bsm-11-93mc8g67	0.6529755	6.091847e-15	112
nmdc:bsm-11-cwey4y35	0.6291956	8.431174e-14	113
nmdc:bsm-11-frgt4x11	0.6259273	2.635444e-13	110
nmdc:bsm-11-g61cvw79	0.6781972	1.016324e-15	107