import requests
import pandas as pd
import plotly.express as px
import nmdc_api_utilities

from nmdc_api_utilities.study_search import StudySearch

# Look the study up by its title through a StudySearch client.
# NOTE(review): ENV is not defined in the visible part of this file - confirm it is set earlier.
ss_client = StudySearch(env=ENV)
study_json = ss_client.get_record_by_attribute(
    attribute_name="title",
    attribute_value="Bio-Scales",
)

# The result is indexed like a list of records; keep the id of the first match,
# which is later used to find the biosamples associated with the study.
study = study_json[0]["id"]
print(study)

nmdc:sty-11-r2h77870

from nmdc_api_utilities.biosample_search import BiosampleSearch

# Client used to query biosample records.
# NOTE(review): ENV is not defined in the visible part of this file - confirm it is set earlier.
bs_client = BiosampleSearch(env=ENV)

# Page size per request and the comma-separated biosample fields to return.
per_page = 2000
fields = "ph,calcium,magnesium,potassium,tot_nitro,manganese,zinc,ammonium_nitrogen,nitrate_nitrogen,nitrite_nitrogen,ecosystem_subtype,habitat"

# Fetch the biosamples whose associated_studies matches the study id;
# all_pages=True is set so that every matching biosample is retrieved.
biosample_json = bs_client.get_record_by_attribute(
    attribute_name="associated_studies",
    attribute_value=study,
    fields=fields,
    max_page_size=per_page,
    all_pages=True,
)
print(f"Total number of biosamples: {len(biosample_json)}")

# Print one specific biosample (nmdc:bsm-11-08mamh62) as an example record, if it is present.
example = next(
    (record for record in biosample_json if record["id"] == "nmdc:bsm-11-08mamh62"),
    None,
)
if example is not None:
    print(example)

Total number of biosamples: 416
{'id': 'nmdc:bsm-11-08mamh62', 'ecosystem_subtype': 'Botanical garden', 'habitat': 'Soil', 'ph': 6.64, 'calcium': {'has_raw_value': '1945.98 mg/kg', 'has_numeric_value': 1945.98, 'has_unit': 'mg/kg', 'type': 'nmdc:QuantityValue'}, 'magnesium': {'has_raw_value': '351.875 mg/kg', 'has_numeric_value': 351.875, 'has_unit': 'mg/kg', 'type': 'nmdc:QuantityValue'}, 'potassium': {'has_raw_value': '304.965 mg/kg', 'has_numeric_value': 304.965, 'has_unit': 'mg/kg', 'type': 'nmdc:QuantityValue'}, 'tot_nitro': {'has_raw_value': '0.242 Percent', 'has_numeric_value': 0.242, 'has_unit': '%', 'type': 'nmdc:QuantityValue'}, 'manganese': {'has_raw_value': '17.688 mg/kg', 'has_numeric_value': 17.688, 'has_unit': 'mg/kg', 'type': 'nmdc:QuantityValue'}, 'zinc': {'has_raw_value': '2.6576 mg/kg', 'has_numeric_value': 2.6576, 'has_unit': 'mg/kg', 'type': 'nmdc:QuantityValue'}, 'ammonium_nitrogen': {'has_raw_value': '1.8015 mg/kg', 'has_numeric_value': 1.8015, 'has_unit': 'mg/kg', 'type': 'nmdc:QuantityValue'}, 'nitrate_nitrogen': {'has_raw_value': '1.698 mg/kg', 'has_numeric_value': 1.698, 'has_unit': 'mg/kg', 'type': 'nmdc:QuantityValue'}, 'nitrite_nitrogen': {'has_raw_value': '0 mg/kg', 'has_numeric_value': 0.0, 'has_unit': 'mg/kg', 'type': 'nmdc:QuantityValue'}}

# Split the comma-separated field string used in the request into individual field names
fields_list = fields.split(',')

# For each requested field, count how many biosample records contain it
field_counts = {
    name: sum(1 for record in biosample_json if name in record)
    for name in fields_list
}

print(field_counts)

{'ph': 103, 'calcium': 103, 'magnesium': 103, 'potassium': 103, 'tot_nitro': 103, 'manganese': 103, 'zinc': 103, 'ammonium_nitrogen': 103, 'nitrate_nitrogen': 103, 'nitrite_nitrogen': 103, 'ecosystem_subtype': 416, 'habitat': 416}

# Keep only the biosamples that carry every requested field (none of them is missing)
filtered_results = [
    record
    for record in biosample_json
    if not any(name not in record for name in fields_list)
]
print(f"Total results after filtering for all fields: {len(filtered_results)}")
# Show the third remaining record as an example
print(filtered_results[2])

Total results after filtering for all fields: 103
{'id': 'nmdc:bsm-11-bdn1fa14', 'ecosystem_subtype': 'Botanical garden', 'habitat': 'Soil', 'ph': 6.23, 'calcium': {'has_raw_value': '2596 mg/kg', 'has_numeric_value': 2596.0, 'has_unit': 'mg/kg', 'type': 'nmdc:QuantityValue'}, 'magnesium': {'has_raw_value': '456.241 mg/kg', 'has_numeric_value': 456.241, 'has_unit': 'mg/kg', 'type': 'nmdc:QuantityValue'}, 'potassium': {'has_raw_value': '154.895 mg/kg', 'has_numeric_value': 154.895, 'has_unit': 'mg/kg', 'type': 'nmdc:QuantityValue'}, 'tot_nitro': {'has_raw_value': '0.172 Percent', 'has_numeric_value': 0.172, 'has_unit': '%', 'type': 'nmdc:QuantityValue'}, 'manganese': {'has_raw_value': '25.9704 mg/kg', 'has_numeric_value': 25.9704, 'has_unit': 'mg/kg', 'type': 'nmdc:QuantityValue'}, 'zinc': {'has_raw_value': '2.112 mg/kg', 'has_numeric_value': 2.112, 'has_unit': 'mg/kg', 'type': 'nmdc:QuantityValue'}, 'ammonium_nitrogen': {'has_raw_value': '4.844 mg/kg', 'has_numeric_value': 4.844, 'has_unit': 'mg/kg', 'type': 'nmdc:QuantityValue'}, 'nitrate_nitrogen': {'has_raw_value': '2.1 mg/kg', 'has_numeric_value': 2.1, 'has_unit': 'mg/kg', 'type': 'nmdc:QuantityValue'}, 'nitrite_nitrogen': {'has_raw_value': '0.4245 mg/kg', 'has_numeric_value': 0.4245, 'has_unit': 'mg/kg', 'type': 'nmdc:QuantityValue'}}

# Fields that are copied as-is (pH only gets a float conversion); every other
# requested field is read through its "has_numeric_value" / "has_unit" sub-fields
other_fields = ["ecosystem_subtype", "habitat", "id", "ph"]
raw_value_fields = [name for name in fields_list if name not in other_fields]

# One list of observed units per measurement field, filled in while building the rows
units = {name: [] for name in raw_value_fields}

# Build one flat row per biosample: measurements become floats and their units
# are collected separately in `units`
df_inp = []
for sample in filtered_results:
    row = {}
    for name in raw_value_fields:
        quantity = sample[name]
        row[name] = float(quantity["has_numeric_value"])
        units[name].append(quantity["has_unit"])
    for name in other_fields:
        value = sample[name]
        # pH is converted to float directly; the remaining fields are copied unchanged
        row[name] = float(value) if name == "ph" else value
    df_inp.append(row)

# Turn the list of row dictionaries into a data frame and display it
df = pd.DataFrame(df_inp)
df

# Collapse each field's list of per-sample units into the set of distinct units,
# which shows whether a measurement was reported in a single unit
units = {name: set(units[name]) for name in units}
print(f"Units for each applicable measurement: {units}")

Units for each applicable measurement: {'calcium': {'mg/kg'}, 'magnesium': {'mg/kg'}, 'potassium': {'mg/kg'}, 'tot_nitro': {'%'}, 'manganese': {'mg/kg'}, 'zinc': {'mg/kg'}, 'ammonium_nitrogen': {'mg/kg'}, 'nitrate_nitrogen': {'mg/kg'}, 'nitrite_nitrogen': {'mg/kg'}}

# Scatter plot of pH (y) against potassium (x) with an "ols" trendline
fig = px.scatter(
    df,
    x="potassium",
    y="ph",
    trendline="ols",
)
fig.show()

# Scatter plot of nitrate nitrogen (y) against ammonium nitrogen (x) on a log-scaled x axis
# NOTE(review): only the axis is log-scaled here; check whether the "ols" fit should also
# be done on log-transformed x (plotly's trendline_options) - confirm the intent.
fig = px.scatter(
    df,
    x="ammonium_nitrogen",
    y="nitrate_nitrogen",
    trendline="ols",
    log_x=True,
)
fig.show()

# Element symbols used as axis labels; the key order also defines the order of the
# panels in the scatter matrix below.
label_mapping = {"calcium": "Ca", "magnesium": "Mg", "manganese": "Mn", "zinc": "Zn", "potassium": "K"}

# Append the unit(s) recorded for each element to its symbol, e.g. "Ca (mg/kg)".
# The distinct units are joined explicitly rather than by rewriting the repr of a set,
# so the label stays readable when a field has more than one unit.
for elem, symbol in label_mapping.items():
    distinct_units = sorted({str(unit) for unit in units.get(elem, ())})
    if distinct_units:
        label_mapping[elem] = f"{symbol} ({', '.join(distinct_units)})"

# Plot each labelled element exactly once: deriving the dimensions from label_mapping
# keeps the two in sync and avoids listing a column twice.
fig = px.scatter_matrix(
    df,
    dimensions=list(label_mapping),
    title="Scatter matrix of Bioscales' biogeochemicals",
    labels=label_mapping,
    color="ecosystem_subtype",
    width=800,
    height=800,
)
# Hide the diagonal panels, which would plot each variable against itself
fig.update_traces(diagonal_visible=False)
fig.show()

	calcium	magnesium	potassium	tot_nitro	manganese	zinc	ammonium_nitrogen	nitrate_nitrogen	nitrite_nitrogen	ecosystem_subtype	habitat	id	ph
0	2774.35	578.148	168.200	0.598	48.0908	13.1545	14.5245	20.4010	0.0000	Botanical garden	Soil	nmdc:bsm-11-c9458s26	5.41
1	2511.03	502.777	315.919	0.131	26.6171	4.0774	2.6825	0.0000	0.0000	Botanical garden	Soil	nmdc:bsm-11-r7rgv593	6.95
2	2596.00	456.241	154.895	0.172	25.9704	2.1120	4.8440	2.1000	0.4245	Botanical garden	Soil	nmdc:bsm-11-bdn1fa14	6.23
3	1841.93	331.500	113.561	0.470	34.8639	5.2868	9.5715	12.8740	0.0000	Botanical garden	Soil	nmdc:bsm-11-ftte8s50	5.14
4	2320.83	458.940	489.767	0.326	20.1285	2.6380	2.3290	2.8040	0.0000	Botanical garden	Soil	nmdc:bsm-11-01teww33	6.81
...	...	...	...	...	...	...	...	...	...	...	...	...	...
98	1846.09	291.535	168.937	0.513	23.2667	5.1776	16.3295	14.3950	0.0000	Botanical garden	Soil	nmdc:bsm-11-nk3r8t63	4.84
99	2449.89	450.242	255.944	0.374	26.1187	4.4023	3.9165	2.0535	0.0000	Botanical garden	Soil	nmdc:bsm-11-2mb94m91	6.37
100	2059.81	356.661	192.340	0.619	26.1871	4.9097	14.0990	5.7905	0.0000	Botanical garden	Soil	nmdc:bsm-11-9k1chb38	5.03
101	1764.86	312.300	222.738	0.250	14.6228	3.4332	2.1955	0.8360	0.0000	Botanical garden	Soil	nmdc:bsm-11-ewmanm69	6.48
102	2032.06	305.976	268.098	0.417	43.5453	4.2006	10.4270	10.2510	0.0000	Botanical garden	Soil	nmdc:bsm-11-ygtdv867	5.00

Bio-Scales metadata biogeochemical exploration and visualization

Import Python libraries

Get the study ID for the Bio-Scales study

Get the biosamples associated with the Bio-Scales study

Explore and understand the results

Drop the rows missing biogeochemical fields

Transform results

Understand the units

Visualize potassium and pH

Scatter plot of ammonium nitrogen vs. nitrate nitrogen

Scatter matrix of all the biogeochemicals