Data exploration and visualization for the NEON soil metagenomes study (DP1.10107.001)¶

This notebook explores how soil pH changes over time at the 47 sites that are part of the NEON study. It also examines how the average water_content changes by season for each site.

In [3]:
import requests
import json
import pandas as pd
import folium
import altair as alt
from altair import Chart
from datetime import datetime
import nmdc_api_utilities

Get the Study ID for the NEON soil metagenome DP1.10107.001 project¶

Using the Python package nmdc_api_utilities, we can get the NMDC study ID based on the project name. More information regarding the nmdc_api_utilities package can be found in its documentation. We first create a client able to search across studies (using the "study_set" collection). Other collections of interest include biosamples (the "biosample_set" collection) and instruments ("instrument_set"). Then, we search for a study whose name includes our keyword of interest (DP1.10107.001), and we extract its study id.
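When building the `$regex` filter by hand (as in the commented-out `get_records` call below), serializing the filter document with json.dumps keeps the quoting correct. A small sketch; note that "." is a regex metacharacter, so a strictly literal match would call for re.escape:

```python
import json

keyword = "DP1.10107.001"
# MongoDB-style filter document matching any study whose name contains the keyword
filter_doc = {"name": {"$regex": keyword}}
# Serialize to the JSON string form expected by a filter query parameter
filter_str = json.dumps(filter_doc)
print(filter_str)
```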

In [4]:
keyword="DP1.10107.001"
## The client needs a target environment; "prod" is assumed here, since ENV is not defined elsewhere in this notebook
ENV = "prod"
## We import and instantiate a client that is able to search studies
from nmdc_api_utilities.study_search import StudySearch
Study_search_client = StudySearch(env=ENV)
## We know the name of the study contains "DP1.10107.001", plus other characters, so we match the keyword as part of the full name with "$regex".
## The function "get_records" would let us apply this filter as a MongoDB-style query:
# query_results = Study_search_client.get_records(filter=f'{{"name": {{"$regex":"{keyword}"}}}}')
## But we can also use "get_record_by_attribute", which likewise searches using a regular expression
print(f"Searching the collection of studies, looking for a study including {keyword} in its name")
query_results = Study_search_client.get_record_by_attribute("name",keyword) 
## We expect a single study to be returned, but we verify that indeed we got a single one in our result list.
n_results=len(query_results)
print(f"We got {n_results} study(ies) matching our query")
## We select our study, and then store its id in study_id
study = query_results[0]
study_id = study["id"]
print(f"The corresponding study_id is {study_id}")
Searching the collection of studies, looking for a study including DP1.10107.001 in its name
We got 1 study(ies) matching our query
The corresponding study_id is nmdc:sty-11-34xj1150

Get all the biosamples associated with the NEON soil metagenomes study by the NMDC study ID¶

We can use the NMDC study ID obtained above to search the biosample collection for all biosamples that are part_of this study (the same filtering the NMDC biosamples find endpoint offers in the data portal). We can limit the results to the fields we would like returned, such as ph, water_content, etc. Cursor pagination is used to request a large amount of information, since at most 2000 results per_page are allowed. Here the total number of biosamples associated with the NEON study is printed, and a list of all the results (all_results) is created.
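The cursor-pagination pattern mentioned above can be sketched independently of the API: keep fetching pages until no continuation cursor comes back. Here fetch_page is a hypothetical stand-in for the actual endpoint call, not part of nmdc_api_utilities:

```python
def paginate(fetch_page):
    """Collect all records from a cursor-paged source.

    fetch_page(cursor) -> (records, next_cursor) is a hypothetical callable;
    next_cursor is None once the last page has been served.
    """
    records, cursor = [], None
    while True:
        batch, cursor = fetch_page(cursor)
        records.extend(batch)
        if cursor is None:
            return records
```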

In [5]:
## As done for the study search, we instantiate a client to search biosamples, and from there look for all samples with the corresponding study_id
from nmdc_api_utilities.biosample_search import BiosampleSearch
Sample_search_client = BiosampleSearch(env=ENV)
print(f"Searching the collection of biosamples, looking for biosamples associated with study {study_id}")
## When running the search, we specify "max_page_size=500" and "all_pages=True" to get all results in a timely manner (the default would return only the first 20)
all_results = Sample_search_client.get_record_by_attribute("associated_studies",study_id,max_page_size=500,all_pages=True,exact_match=True) 
## This is a relatively big query, so it may take ~ 1 minute to return
print(f"We found {len(all_results)} samples linked to study {study_id}")
## This return is a list of sample attributes, which we will further explore below
Searching the collection of biosamples, looking for biosamples associated with study nmdc:sty-11-34xj1150
We found 4475 samples linked to study nmdc:sty-11-34xj1150

Geo-location and collection date exploration¶

Using the returned results, we can explore the number of locations, or sites, in the data using the geo_loc_name field (aka slot) and the lat_lon field, as well as the number of collection dates using the collection_date field. Here, we print the number of distinct coordinates, geo_locations, and dates.

In [6]:
# Find total number of NEON coordinates
# Find the total number of geo locations (equivalent to NEON sites)
# Find the total  number of collection dates
coordinates = set()
geo_locs = set()
dates = set()
all_full_results = []
for samp in all_results:
    # Check if samp has keys that correspond to primary metadata
    if set(['lat_lon', 'geo_loc_name', 'collection_date']).issubset(samp):
        lat_lon = f"{samp['lat_lon']['latitude']},{samp['lat_lon']['longitude']}"
        coordinates.add(lat_lon)
        geo_locs.add(samp["geo_loc_name"]["has_raw_value"])
        dates.add(samp["collection_date"]["has_raw_value"])
        all_full_results.append(samp)        
print(f"Number of NEON coordinates: {len(coordinates)}")
print(f"Number of geo locations (sites): {len(geo_locs)}")
print(f"Number of dates: {len(dates)}")
print(f"Total number of biosamples with sufficient metadata: {len(all_full_results)}")
Number of NEON coordinates: 495
Number of geo locations (sites): 47
Number of dates: 4242
Total number of biosamples with sufficient metadata: 4475

Print a returned result¶

Let's print the first returned result to see what the requested data looks like.

In [7]:
print(all_full_results[0])
{'analysis_type': ['metagenomics'], 'carb_nitro_ratio': {'has_numeric_value': 25.4, 'type': 'nmdc:QuantityValue', 'has_unit': '1'}, 'collection_date': {'has_raw_value': '2016-07-26T01:30Z', 'type': 'nmdc:TimestampValue'}, 'depth': {'has_maximum_numeric_value': 0.325, 'has_minimum_numeric_value': 0, 'has_unit': 'm', 'type': 'nmdc:QuantityValue'}, 'elev': 677.6, 'env_broad_scale': {'term': {'id': 'ENVO:00000446', 'name': 'terrestrial biome', 'type': 'nmdc:OntologyClass'}, 'type': 'nmdc:ControlledIdentifiedTermValue'}, 'env_local_scale': {'term': {'id': 'ENVO:01000861', 'name': 'area of dwarf scrub', 'type': 'nmdc:OntologyClass'}, 'type': 'nmdc:ControlledIdentifiedTermValue'}, 'env_medium': {'term': {'id': 'ENVO:00001998', 'name': 'soil', 'type': 'nmdc:OntologyClass'}, 'type': 'nmdc:ControlledIdentifiedTermValue'}, 'env_package': {'has_raw_value': 'soil', 'type': 'nmdc:TextValue'}, 'id': 'nmdc:bsm-11-002vgm56', 'name': 'HEAL_048-O-6.5-19.5-20160725', 'nitro': {'has_numeric_value': 1.02, 'has_unit': '%', 'type': 'nmdc:QuantityValue'}, 'org_carb': {'has_numeric_value': 25.94, 'has_unit': '%', 'type': 'nmdc:QuantityValue'}, 'ph': 6.04, 'samp_collec_device': 'corer', 'soil_horizon': 'O horizon', 'temp': {'has_numeric_value': 6.6, 'has_unit': 'Cel', 'type': 'nmdc:QuantityValue'}, 'type': 'nmdc:Biosample', 'water_content': ['2.667 g of water/g of dry soil'], 'geo_loc_name': {'has_raw_value': 'USA: Alaska, Healy', 'type': 'nmdc:TextValue'}, 'biosample_categories': ['NEON'], 'lat_lon': {'latitude': 63.875088, 'longitude': -149.210438, 'type': 'nmdc:GeolocationValue'}, 'gold_biosample_identifiers': ['gold:Gb0255739'], 'ecosystem': 'Environmental', 'ecosystem_category': 'Terrestrial', 'ecosystem_type': 'Soil', 'ecosystem_subtype': 'Unclassified', 'associated_studies': ['nmdc:sty-11-34xj1150'], 'specific_ecosystem': 'Unclassified', 'img_identifiers': ['img.taxon:3300039974']}
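As the printed record shows, some slots are plain scalars (ph, elev) while others are nmdc:QuantityValue dictionaries (temp, nitro, carb_nitro_ratio). A small convenience helper, hypothetical and not part of the NMDC API, smooths over that difference:

```python
def numeric(value):
    """Return a float from either a bare scalar or a QuantityValue-style dict."""
    if isinstance(value, dict):
        return float(value["has_numeric_value"])
    return float(value)

# Fields copied from the record printed above: ph is a scalar, temp is a dict
sample = {"ph": 6.04,
          "temp": {"has_numeric_value": 6.6, "has_unit": "Cel", "type": "nmdc:QuantityValue"}}
print(numeric(sample["ph"]), numeric(sample["temp"]))  # 6.04 6.6
```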

Convert results to a data frame and transform into desired format¶

We can use the pandas library to convert the requested results into a data frame. Looping through the results, we update the data types of the fields (e.g. convert collection_date to a date using the datetime library and drop the times) and build a dictionary whose keys are the column names and whose values come from the result being processed, appending each dictionary to an initially empty list (df_inp). That list then converts directly to a data frame.

In [8]:
# Convert results to dataframes
# Transform results to desired format and convert to a data frame
df_inp = []
water_content_units = "g of water/g of dry soil"
for biosamp in all_full_results:

    # Get only month, day, and year from collection_date (remove times)
    date = datetime.strptime(biosamp["collection_date"]["has_raw_value"],"%Y-%m-%dT%H:%MZ")
    date = date.strftime("%Y-%m-%d")
    
    # Strip the units out of water_content and convert to float
    if 'water_content' in biosamp:
        water_content = float("".join(biosamp["water_content"]).replace(water_content_units, ""))
    else:
        # Reset explicitly so a missing value does not carry over from the previous sample
        water_content = None

    # Convert ph to a float
    if "ph" in biosamp:
        ph = float(biosamp["ph"])
    else:
        ph = None

    rec = {"id": biosamp["id"],
           "collection_date": date,
           "soil_horizon": biosamp.get("soil_horizon"),
           "water_content": water_content,
           "ph": ph,
           "elev": float(biosamp["elev"]),
           "location": biosamp["geo_loc_name"]["has_raw_value"],
           "latitude": biosamp["lat_lon"]["latitude"],
           "longitude": biosamp["lat_lon"]["longitude"]}

    df_inp.append(rec)

df = pd.DataFrame(df_inp)

df 
Out[8]:
id collection_date soil_horizon water_content ph elev location latitude longitude
0 nmdc:bsm-11-002vgm56 2016-07-26 O horizon 2.667 6.04 677.6 USA: Alaska, Healy 63.875088 -149.210438
1 nmdc:bsm-11-00dkyf35 2019-03-13 M horizon 0.113 6.65 381.8 USA: California, San Joaquin Experimental Range 37.110011 -119.735218
2 nmdc:bsm-11-00hrxp98 2016-08-03 O horizon 0.992 3.90 199.7 USA: Massachusetts, Harvard Forest & Quabbin W... 42.427091 -72.229737
3 nmdc:bsm-11-00m15h97 2020-06-23 M horizon 0.032 7.07 1649.3 USA: Colorado, Central Plains Experimental Range 40.818371 -104.746715
4 nmdc:bsm-11-00yhef97 2016-07-26 M horizon 0.032 6.47 44.8 USA: Georgia, The Jones Center At Ichauway 31.189774 -84.465861
... ... ... ... ... ... ... ... ... ...
4470 nmdc:bsm-11-zy2p7j24 2018-07-17 M horizon 0.302 7.01 405.7 USA: Kansas, Konza Prairie Biological Station 39.103068 -96.563925
4471 nmdc:bsm-11-zyh2rm11 2020-07-28 M horizon 0.399 6.24 405.4 USA: Kansas, Konza Prairie Biological Station 39.102192 -96.561180
4472 nmdc:bsm-11-zyhk8g66 2017-06-21 M horizon 0.306 7.15 272.5 USA: Tennessee, Oak Ridge 35.957646 -84.261838
4473 nmdc:bsm-11-zzdpcm17 2020-06-03 M horizon 0.283 5.07 260.1 USA: Tennessee, Oak Ridge 35.965875 -84.230646
4474 nmdc:bsm-11-zzrx6210 2018-06-20 M horizon 0.082 5.30 120.8 USA: Alabama, Talladega National Forest 32.893534 -87.415333

4475 rows × 9 columns
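The water_content values arrive as single-element lists of strings such as '2.667 g of water/g of dry soil', and the parsing above assumes one fixed unit string. A more defensive parse, sketched with a regular expression and a hypothetical helper name, would survive unit variations:

```python
import re

def parse_water_content(values):
    """Extract the leading number from a list like ['2.667 g of water/g of dry soil'].

    Returns None when the value is missing or not parseable.
    """
    if not values:
        return None
    match = re.match(r"\s*([0-9]*\.?[0-9]+)", values[0])
    return float(match.group(1)) if match else None

print(parse_water_content(["2.667 g of water/g of dry soil"]))  # 2.667
```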

Find min/max and mid coordinates of the latitude and longitude coordinates¶

In order to create a map of the NEON sites, we will need to know the minimum and maximum latitude and longitude coordinates. We can calculate the midpoint by defining a function called find_square_midpoint and calling the function using the min/max coordinates. The mid-coordinates will allow us to set a middle point in our map.

In [9]:
# Find middle coordinates to inform map center
min_lat = df["latitude"].min()
max_lat = df["latitude"].max()

min_lon = df["longitude"].min()
max_lon = df["longitude"].max()

def find_square_midpoint(min_lat, max_lat, min_lon, max_lon):
    # calculate midpoint latitude
    mid_lat = (min_lat + max_lat) / 2

    # calculate midpoint longitude
    if abs(max_lon - min_lon) <= 180:
        mid_lon = (min_lon + max_lon) / 2
    else:
        # If the box crosses the 180 degree meridian, wrap the midpoint longitude
        mid_lon = ((max_lon + min_lon + 360) % 360) / 2 - 180

    return int(round(mid_lat, 0)), int(round(mid_lon, 0))

mid_coords = find_square_midpoint(min_lat, max_lat, min_lon, max_lon)
print(f"min latitude: {min_lat}")
print(f"max latitude: {max_lat}")
print(f"min longitude: {min_lon}")
print(f"max longitude: {max_lon}")

print(f"midpoint coordinates: {mid_coords}")
min latitude: 17.963818
max latitude: 71.285604
min longitude: -156.648849
max longitude: -66.824633
midpoint coordinates: (45, -112)
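The antimeridian branch of find_square_midpoint is easy to check with hypothetical coordinates straddling 180°: a naive average of -170° and 170° would give 0° (the wrong side of the globe), while the wrapped formula lands on the antimeridian itself (-180, equivalent to 180°):

```python
# Hypothetical bounding box straddling the 180 degree meridian
min_lat, max_lat = 10.0, 20.0
min_lon, max_lon = -170.0, 170.0

mid_lat = (min_lat + max_lat) / 2
if abs(max_lon - min_lon) <= 180:
    mid_lon = (min_lon + max_lon) / 2
else:
    # Wrap around 180 degrees instead of averaging across the dateline
    mid_lon = ((max_lon + min_lon + 360) % 360) / 2 - 180

print(mid_lat, mid_lon)  # 15.0 -180.0
```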

Create an interactive map of the NEON sites with scatter plots of ph vs. time¶

Using the folium python library to create a map and the altair library to create charts of pH vs. time, we can build a clickable map of the NEON sites whose markers pop up the pH visualizations. We loop over the NEON sites by grouping the data frame by location and creating a chart for each location within the loop. Note that the mean latitude and longitude of each site are used as the map marker coordinates.

In [10]:
## There is a warning coming from one of the sub-packages; suppressing it for clarity
import warnings
warnings.filterwarnings('ignore')

# Create map of NEON sites 
m = folium.Map(
    tiles="https://server.arcgisonline.com/ArcGIS/rest/services/World_Imagery/MapServer/tile/{z}/{y}/{x}",
    attr="Tiles &copy; Esri &mdash; Source: Esri, i-cubed, USDA, USGS, AEX, GeoEye, Getmapping, Aerogrid, IGN, IGP, UPR-EGP, and the GIS User Community",
    location=(mid_coords),
    zoom_start=3,
    control_scale=True)

# group data frames by site (e.g. location)
grouped = df.groupby("location")
result_dfs = {}
for name, group_df in grouped:
    result_dfs[name] = group_df.reset_index(drop=True)

# Add markers to map based on location name (site) - used the mean coordinates for each site 
for name, site_df in result_dfs.items():
    mean_lat = site_df["latitude"].mean()
    mean_lon = site_df["longitude"].mean()

    # Create scatter plot of pH vs. time, colored by soil horizon
    scatter = Chart(site_df).mark_circle().encode(x="collection_date:T", y="ph", color="soil_horizon:N")
    chart = scatter.properties(width=600, height=400, title=f'{name}: Change in soil pH over time')

    # Add charts as popup for each NEON site on the map
    vega_lite = folium.VegaLite(chart, width="100%", height="100%")
    marker = folium.Marker(
        location=[mean_lat, mean_lon],
        tooltip="click me!")
    popup = folium.Popup()
    vega_lite.add_to(popup)
    popup.add_to(marker)
    marker.add_to(m)

m
Out[10]:
Make this Notebook Trusted to load map: File -> Trust Notebook

Bar chart of the average water_content of each site and the season¶

To look at how the water_content of each NEON site changes with the seasons, we can add a month column to the data frame and define a function that derives a season column from the month number. Finally, using the altair library, we aggregate the water_content for each location using the mean, grouped by season.
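The month-to-season bucketing described above can also be vectorized with a dictionary and Series.map, which avoids a per-row apply on larger frames. A sketch on a toy frame:

```python
import pandas as pd

# Map each month number to its meteorological season
season_map = {12: "Winter", 1: "Winter", 2: "Winter",
              3: "Spring", 4: "Spring", 5: "Spring",
              6: "Summer", 7: "Summer", 8: "Summer",
              9: "Fall", 10: "Fall", 11: "Fall"}

toy = pd.DataFrame({"month": [1, 4, 7, 10]})
toy["season"] = toy["month"].map(season_map)
print(toy["season"].tolist())  # ['Winter', 'Spring', 'Summer', 'Fall']
```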

In [11]:
## There is a warning coming from one of the sub-packages; suppressing it for clarity
import warnings
warnings.filterwarnings('ignore')

# How does the average water_content change with the seasons per site?
df["collection_date"] = pd.to_datetime(df["collection_date"])

df["month"] = df["collection_date"].dt.month

def get_season(month):
    if month in [12, 1, 2]:
        return "Winter"
    elif month in [3, 4, 5]:
        return "Spring"
    elif month in [6, 7, 8]:
        return "Summer"
    else:
        return "Fall"

df["season"] = df["month"].apply(get_season)

bar = Chart(df).mark_bar().encode(
    x="location",
    y=alt.Y("mean(water_content)", title=f"mean water_content in {water_content_units}"),
    color="season",
    xOffset="season")
# The error bar mark aggregates the raw field itself, so pass water_content rather than its mean
error_bar = Chart(df).mark_errorbar().encode(x="location", xOffset="season", y=alt.Y("water_content"))
chart = bar.properties(width=1000, height=400, title="Change in average water_content of NEON sites by season") + error_bar
chart
Out[11]: