# Load essential libraries
library(jsonlite, warn.conflicts=FALSE)
library(dplyr, warn.conflicts=FALSE)
library(tidyr, warn.conflicts=FALSE)
library(ggplot2, warn.conflicts=FALSE)
library(forcats, warn.conflicts=FALSE)
library(lubridate, warn.conflicts=FALSE)
if (!require('maps', quiet = TRUE)) install.packages('maps', quiet = TRUE); library('maps', warn.conflicts=FALSE)

base_url = "https://api.microbiomedata.org"
url = paste0(base_url, "/studies?filter=name.search%3ANational%20Ecological%20Observatory%20Network&per_page=50")

response = fromJSON(url)
study_ids = response[["results"]][["id"]]
print(study_ids)

[1] "nmdc:sty-11-34xj1150" "nmdc:sty-11-hht5sb92" "nmdc:sty-11-pzmd0x14"

study_id = study_ids[1]

# Prepare a tibble to hold results
dat_all = tibble()

# Set up query
per_page = 2000  # number of records to retrieve per page
filt = paste0("associated_studies:", study_id) # filter to only get biosamples from the study of interest
get_more = TRUE # flag to indicate whether we need to get more records
page = 1 # page number to retrieve
while (get_more){
    # construct the url for the query
    url = paste0(
        base_url, 
        "/biosamples?filter=", 
        filt,
        "&per_page=",
        per_page,
        "&page=",
        page)
    # get the data
    data = fromJSON(url)
    data_results = data[['results']] %>% as.data.frame() 
    # add the data to the tibble
    dat_all = bind_rows(dat_all, data_results)
    # check if we need to get more records
    if (nrow(dat_all) < data[['meta']]['count']){
        page = page +1
    } else { get_more = FALSE}
}


glimpse(dat_all)

Rows: 5,260
Columns: 44
$ analysis_type               <list> "metagenomics", "metagenomics", "metageno…
$ collection_date             <df[,2]> <data.frame[26 x 2]>
$ depth                       <df[,5]> <data.frame[26 x 5]>
$ elev                        <dbl> 586.1, 586.1, 586.1, 534.2, 534.2, 534.…
$ env_broad_scale             <df[,3]> <data.frame[26 x 3]>
$ env_local_scale             <df[,3]> <data.frame[26 x 3]>
$ env_medium                  <df[,3]> <data.frame[26 x 3]>
$ id                          <chr> "nmdc:bsm-11-06qrej20", "nmdc:bsm-11-ft…
$ name                        <chr> "ABBY_004-M-0.5-8-20170605", "ABBY_004-M-3…
$ ph                          <dbl> 5.51, 5.58, 5.53, 5.59, 4.70, 4.74, 4.9…
$ samp_collec_device          <chr> "corer", "corer", "corer", "corer", "br…
$ soil_horizon                <chr> "M horizon", "M horizon", "M horizon", …
$ temp                        <df[,4]> <data.frame[26 x 4]>
$ type                        <chr> "nmdc:Biosample", "nmdc:Biosample", "nmdc:…
$ water_content               <list> "0.465 g of water/g of dry soil", "0.346 g…
$ geo_loc_name                <df[,2]> <data.frame[26 x 2]>
$ biosample_categories        <list> "NEON", "NEON", "NEON", "NEON", "NEON", "N…
$ lat_lon                     <df[,4]> <data.frame[26 x 4]>
$ gold_biosample_identifiers  <list> "gold:Gb0255529", "gold:Gb0255529", "gold:…
$ ecosystem                   <chr> "Environmental", "Environmental", "Environ…
$ ecosystem_category          <chr> "Terrestrial", "Terrestrial", "Terrestr…
$ ecosystem_type              <chr> "Soil", "Soil", "Soil", "Soil", "Soil", "S…
$ ecosystem_subtype           <chr> "Unclassified", "Unclassified", "Unclassi…
$ associated_studies          <list> "nmdc:sty-11-34xj1150", "nmdc:sty-11-34…
$ env_package                 <df[,2]> <data.frame[26 x 2]>
$ specific_ecosystem          <chr> "Unclassified", "Unclassified", "Unclassi…
$ img_identifiers             <list> "img.taxon:3300040150", "img.taxon:3300…
$ ammonium_nitrogen           <df[,3]> <data.frame[26 x 3]>
$ org_carb                    <df[,3]> <data.frame[26 x 3]>
$ tot_nitro_content           <df[,3]> <data.frame[26 x 3]>
$ carb_nitro_ratio            <df[,3]> <data.frame[26 x 3]>
$ nitro                       <df[,3]> <data.frame[26 x 3]>
$ insdc_biosample_identifiers <list> <NULL>, <NULL>, <NULL>, <NULL>, <NULL>, <…
$ samp_taxon_id               <df[,3]> <data.frame[26 x 3]>
$ annual_precpt               <df[,4]> <data.frame[26 x 4]>
$ annual_temp                 <df[,4]> <data.frame[26 x 4]>
$ biotic_relationship         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ collection_time             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ growth_facil                <df[,2]> <data.frame[26 x 2]>
$ samp_name                   <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ samp_store_temp             <df[,4]> <data.frame[26 x 4]>
$ store_cond                  <df[,2]> <data.frame[26 x 2]>
$ alternative_names           <list> <NULL>, <NULL>, <NULL>, <NULL>, <NULL>, <N…
$ dna_isolate_meth            <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…

df <- dat_all %>%
    select(
      collection_date, soil_horizon, water_content, ph, geo_loc_name, lat_lon, env_medium
      ) %>%
    mutate(water_content = as.character(water_content)) %>%
    separate(water_content, sep = " ", into = c("water_content", "water_content_units"), extra = "merge") %>%
    mutate(water_content = as.numeric(water_content)) %>%
    unnest(
      cols = c(
          env_medium,
        collection_date, 
        geo_loc_name,
        lat_lon
        ), names_sep = "_") %>%
    unnest(
        cols = c(env_medium_term
        ), names_sep = "_") %>%
    rename(collection_date = collection_date_has_raw_value ,
           env_medium = env_medium_term_name,
           geo_loc = geo_loc_name_has_raw_value) %>%
    mutate(collection_date = as.Date(collection_date)) %>%
    suppressWarnings()
glimpse(df)

Rows: 5,260
Columns: 17
$ collection_date          <date> 2017-06-05, 2017-06-05, 2017-06-05, 2017-06-…
$ collection_date_type     <chr> "nmdc:TimestampValue", "nmdc:TimestampValue",…
$ soil_horizon             <chr> "M horizon", "M horizon", "M horizon", "M hor…
$ water_content            <dbl> 0.465, 0.346, 0.392, 0.461, 2.630, 2.876, 2.1…
$ water_content_units      <chr> "g of water/g of dry soil", "g of water/g of …
$ ph                       <dbl> 5.51, 5.58, 5.53, 5.59, 4.70, 4.74, 4.95, 4.2…
$ geo_loc                  <chr> "USA: Washington, Abby Road", "USA: Washingto…
$ geo_loc_name_type        <chr> "nmdc:TextValue", "nmdc:TextValue", "nmdc:Tex…
$ lat_lon_latitude         <dbl> 45.76858, 45.76858, 45.76858, 45.75405, 45.75…
$ lat_lon_longitude        <dbl> -122.2987, -122.2987, -122.2987, -122.2985, -…
$ lat_lon_type             <chr> "nmdc:GeolocationValue", "nmdc:GeolocationVal…
$ lat_lon_has_raw_value    <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ env_medium_term_id       <chr> "ENVO:00001998", "ENVO:00001998", "ENVO:00001…
$ env_medium               <chr> "soil", "soil", "soil", "soil", "soil", "soil…
$ env_medium_term_type     <chr> "nmdc:OntologyClass", "nmdc:OntologyClass", "…
$ env_medium_type          <chr> "nmdc:ControlledIdentifiedTermValue", "nmdc:C…
$ env_medium_has_raw_value <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…

df_sum <- df %>%
  group_by(
    env_medium
    ) %>%
  summarise(
    count = n(),
    fraction_with_ph = sum(!is.na(ph))/n(),
    fraction_with_water_content = sum(!is.na(water_content))/n(),
    fraction_with_collection_date = sum(!is.na(collection_date))/n(),
    ) %>%
  arrange(desc(count))
df_sum

# Prepare location df data
loc_sum_df <- df %>%
  filter(!(is.na(ph))) %>%
  group_by(
    geo_loc,
    env_medium
    ) %>%
  mutate(
    count_with_ph = n(),
    lat_med = median(lat_lon_latitude),
    long_med = median(lat_lon_longitude),
    ) %>%
  select(
    geo_loc, 
    env_medium,
    lat_med,
    long_med,
    count_with_ph
    ) %>%
  distinct()

#Plot summary data
my_theme <- theme_bw()
world <- map_data("world")
g2 <- suppressWarnings(ggplot() +
  geom_map(
    data = world, 
    map = world,
    aes(long, lat, map_id = region),
    color = "white", fill = "lightgray", size = 0.1
  )  +
    geom_point(
        data = loc_sum_df,
        aes(long_med, lat_med,
        size = count_with_ph)) +
    theme_void() +
    labs(size = "# of biosamples with \n pH measurements")+
    coord_cartesian(xlim = c(-165, -65), ylim = c(15, 72), expand = FALSE))

options(repr.plot.width = 7, repr.plot.height = 5, repr.plot.res = 200)
g2

# Prep dataframe with new column of factored sites
df2 <- df %>%
  mutate(geo_loc_grouped = geo_loc %>% 
           factor() %>% 
           fct_lump(n = 8)
         ) %>%
  filter(geo_loc_grouped != "Other")


# Plot data
g <- suppressWarnings(ggplot(data = na.omit(df2)) +
    geom_point(aes(x=collection_date, y = ph, color = soil_horizon)) +
    my_theme +
    scale_x_date()+
    labs(x = "Collection Date", y = "pH")+
    facet_wrap(facets = vars(geo_loc_grouped),
               labeller = label_wrap_gen(width=30),
              nrow = 2))
options(repr.plot.width = 10, repr.plot.height = 5)
g

ERROR while rich displaying an object: Error in `combine_vars()`:
! Faceting variables must have at least one value.

Traceback:
1. sapply(x, f, simplify = simplify)
2. lapply(X = X, FUN = FUN, ...)
3. FUN(X[[i]], ...)
4. tryCatch(withCallingHandlers({
 .     if (!mime %in% names(repr::mime2repr)) 
 .         stop("No repr_* for mimetype ", mime, " in repr::mime2repr")
 .     rpr <- repr::mime2repr[[mime]](obj)
 .     if (is.null(rpr)) 
 .         return(NULL)
 .     prepare_content(is.raw(rpr), rpr)
 . }, error = error_handler), error = outer_handler)
5. tryCatchList(expr, classes, parentenv, handlers)
6. tryCatchOne(expr, names, parentenv, handlers[[1L]])
7. doTryCatch(return(expr), name, parentenv, handler)
8. withCallingHandlers({
 .     if (!mime %in% names(repr::mime2repr)) 
 .         stop("No repr_* for mimetype ", mime, " in repr::mime2repr")
 .     rpr <- repr::mime2repr[[mime]](obj)
 .     if (is.null(rpr)) 
 .         return(NULL)
 .     prepare_content(is.raw(rpr), rpr)
 . }, error = error_handler)
9. repr::mime2repr[[mime]](obj)
10. repr_text.default(obj)
11. paste(capture.output(print(obj)), collapse = "\n")
12. capture.output(print(obj))
13. withVisible(...elt(i))
14. print(obj)
15. `print.ggplot2::ggplot`(obj)
16. ggplot_build(x)
17. `ggplot_build.ggplot2::ggplot`(x)
18. layout$setup(data, plot@data, plot@plot_env)
19. setup(..., self = self)
20. self$facet$compute_layout(data, self$facet_params)
21. compute_layout(..., self = self)
22. combine_vars(data, params$plot_env, vars, drop = params$drop)
23. cli::cli_abort("Faceting variables must have at least one value.")
24. rlang::abort(message, ..., call = call, use_cli_format = TRUE, 
  .     .frame = .frame)
25. signal_abort(cnd, .file)

# Plot data
g <- ggplot(data = na.omit(df2)) +
    geom_point(aes(x=collection_date, y = water_content, color = soil_horizon)) +
    my_theme +
    scale_x_date()+
    labs(x = "Collection Date", y = "Water Content in g of water/g of dry soil")+
    facet_wrap(facets = vars(geo_loc_grouped),
               labeller = label_wrap_gen(width=30),
              nrow = 2) 
g

ERROR while rich displaying an object: Error in `combine_vars()`:
! Faceting variables must have at least one value.

Traceback:
1. sapply(x, f, simplify = simplify)
2. lapply(X = X, FUN = FUN, ...)
3. FUN(X[[i]], ...)
4. tryCatch(withCallingHandlers({
 .     if (!mime %in% names(repr::mime2repr)) 
 .         stop("No repr_* for mimetype ", mime, " in repr::mime2repr")
 .     rpr <- repr::mime2repr[[mime]](obj)
 .     if (is.null(rpr)) 
 .         return(NULL)
 .     prepare_content(is.raw(rpr), rpr)
 . }, error = error_handler), error = outer_handler)
5. tryCatchList(expr, classes, parentenv, handlers)
6. tryCatchOne(expr, names, parentenv, handlers[[1L]])
7. doTryCatch(return(expr), name, parentenv, handler)
8. withCallingHandlers({
 .     if (!mime %in% names(repr::mime2repr)) 
 .         stop("No repr_* for mimetype ", mime, " in repr::mime2repr")
 .     rpr <- repr::mime2repr[[mime]](obj)
 .     if (is.null(rpr)) 
 .         return(NULL)
 .     prepare_content(is.raw(rpr), rpr)
 . }, error = error_handler)
9. repr::mime2repr[[mime]](obj)
10. repr_text.default(obj)
11. paste(capture.output(print(obj)), collapse = "\n")
12. capture.output(print(obj))
13. withVisible(...elt(i))
14. print(obj)
15. `print.ggplot2::ggplot`(obj)
16. ggplot_build(x)
17. `ggplot_build.ggplot2::ggplot`(x)
18. layout$setup(data, plot@data, plot@plot_env)
19. setup(..., self = self)
20. self$facet$compute_layout(data, self$facet_params)
21. compute_layout(..., self = self)
22. combine_vars(data, params$plot_env, vars, drop = params$drop)
23. cli::cli_abort("Faceting variables must have at least one value.")
24. rlang::abort(message, ..., call = call, use_cli_format = TRUE, 
  .     .frame = .frame)
25. signal_abort(cnd, .file)

Data exploration and visualization for the soil metagenomes (DP1.10107.001) NEON study¶

Get study IDs associated with NEON sites using NMDC runtime API¶

Using the study ids, pull out bio sample IDs¶

Clean up results for more usability¶

Summarize metadata by environmental medium¶

Plot locations of geo_loc scaled by number of samples with ph¶

Plot full time series of pH at the six sites with the most biosamples¶

Plot full time series of water content at the six sites with the most biosamples¶