Get NMDC Metadata and Data Objects¶
This notebook describes and provides example code to:
- Filter NMDC metadata to obtain IDs and fetch attributes, using API endpoints.
- Download collected metadata and data objects to files.
- Fetch and load collected metadata, or data object bytes, to in-memory Python objects.
Dependencies¶
The following modules, constants, and helper functions are used by one or more use case cells below, so be sure to run this cell first:
In [1]:
Copied!
from io import BytesIO
import json
from operator import itemgetter
from pathlib import Path
from pprint import pprint
import shutil
import subprocess
from tqdm.notebook import tqdm
from urllib.parse import parse_qsl, urlencode
import requests
from toolz import keyfilter, merge, concat
HOST = "https://api.microbiomedata.org"


def get_json(path, host=HOST, *, timeout=60, **kwargs):
    """GET ``host + path`` and return the decoded JSON body.

    Args:
        path: URL path (optionally with a query string) appended to ``host``.
        host: Base URL of the API; defaults to ``HOST``.
        timeout: Seconds to wait on the server before giving up. Pass ``None``
            to wait indefinitely, which is what ``requests`` does by default.
        **kwargs: Forwarded unchanged to ``requests.get`` (e.g. ``params``).

    Returns:
        The response body as parsed by ``Response.json()``.

    Raises:
        requests.HTTPError: if the response carries an error status.
        requests.Timeout: if the server does not answer within ``timeout``.
    """
    # requests applies no timeout unless asked to, so without this a stalled
    # connection would block the notebook cell forever.
    response = requests.get(host + path, timeout=timeout, **kwargs)
    response.raise_for_status()
    return response.json()
def pick(allowlist, d):
    """Return a new dict with only the items of ``d`` whose key is in ``allowlist``.

    Items keep the order they have in ``d``; keys in ``allowlist`` that are
    absent from ``d`` are ignored.
    """
    return {key: value for key, value in d.items() if key in allowlist}
# Accessors for the "meta" and "results" keys of a response dict.
meta, results = (itemgetter(key) for key in ("meta", "results"))
from io import BytesIO
import json
from operator import itemgetter
from pathlib import Path
from pprint import pprint
import shutil
import subprocess
from tqdm.notebook import tqdm
from urllib.parse import parse_qsl, urlencode
import requests
from toolz import keyfilter, merge, concat
HOST = "https://api.microbiomedata.org"


def get_json(path, host=HOST, *, timeout=60, **kwargs):
    """GET ``host + path`` and return the decoded JSON body.

    Args:
        path: URL path (optionally with a query string) appended to ``host``.
        host: Base URL of the API; defaults to ``HOST``.
        timeout: Seconds to wait on the server before giving up. Pass ``None``
            to wait indefinitely, which is what ``requests`` does by default.
        **kwargs: Forwarded unchanged to ``requests.get`` (e.g. ``params``).

    Returns:
        The response body as parsed by ``Response.json()``.

    Raises:
        requests.HTTPError: if the response carries an error status.
        requests.Timeout: if the server does not answer within ``timeout``.
    """
    # requests applies no timeout unless asked to, so without this a stalled
    # connection would block the notebook cell forever.
    response = requests.get(host + path, timeout=timeout, **kwargs)
    response.raise_for_status()
    return response.json()
def pick(allowlist, d):
    """Return a new dict with only the items of ``d`` whose key is in ``allowlist``.

    Items keep the order they have in ``d``; keys in ``allowlist`` that are
    absent from ``d`` are ignored.
    """
    return {key: value for key, value in d.items() if key in allowlist}
# Accessors for the "meta" and "results" keys of a response dict.
meta, results = (itemgetter(key) for key in ("meta", "results"))
Filter/fetch metadata¶
Use Case: fetch metadata directly associated with an ID with unknown type¶
In [2]:
Copied!
# Look up a document by its ID alone, without naming the collection that holds it.
get_json(path="/nmdcschema/ids/nmdc:f2a40483485c45baaf30160d0ca2ac40")
# Look up a document by its ID alone, without naming the collection that holds it.
get_json(path="/nmdcschema/ids/nmdc:f2a40483485c45baaf30160d0ca2ac40")
Out[2]:
{'has_output': ['nmdc:bb7a9edac41c31f6d36c34f6bfa7491a'], 'started_at_time': '2021-01-21T23:31:33Z', 'execution_resource': 'EMSL-RZR', 'has_input': ['emsl:output_747989'], 'was_informed_by': 'emsl:747989', 'git_url': 'https://github.com/microbiomedata/enviroMS', 'id': 'nmdc:f2a40483485c45baaf30160d0ca2ac40', 'used': '12T_FTICR_B', 'type': 'nmdc:NomAnalysisActivity', 'ended_at_time': '2021-01-21T23:31:33Z'}
Use case: fetch metadata for an ID from a known nmdc:Database collection.¶
In [3]:
Copied!
# Look up a document by ID within the biosample_set collection.
get_json(path="/nmdcschema/biosample_set/gold:Gb0115217")
# Look up a document by ID within the biosample_set collection.
get_json(path="/nmdcschema/biosample_set/gold:Gb0115217")
Out[3]:
{'env_local_scale': {'has_raw_value': 'ENVO:01000621'}, 'collection_date': {'has_raw_value': '2014-09-23'}, 'add_date': '2015-05-28', 'geo_loc_name': {'has_raw_value': 'USA: Columbia River, Washington'}, 'location': 'groundwater-surface water interaction zone in Washington, USA', 'mod_date': '2021-06-17', 'description': 'Sterilized sand packs were incubated back in the ground and collected at time point T2.', 'depth': {'has_raw_value': '0.5', 'has_numeric_value': 0.5, 'has_unit': 'meter'}, 'part_of': ['gold:Gs0114663'], 'ncbi_taxonomy_name': 'sediment metagenome', 'GOLD_sample_identifiers': ['gold:Gb0115217'], 'ecosystem_category': 'Artificial ecosystem', 'ecosystem_type': 'Sand microcosm', 'env_broad_scale': {'has_raw_value': 'ENVO:01000253'}, 'sample_collection_site': 'sand microcosm', 'id': 'gold:Gb0115217', 'identifier': 'GW-RW T2_23-Sept-14', 'ecosystem_subtype': 'Unclassified', 'depth2': {'has_raw_value': '1.0', 'has_numeric_value': 1, 'has_unit': 'meter'}, 'specific_ecosystem': 'Unclassified', 'INSDC_biosample_identifiers': ['biosample:SAMN06343863'], 'community': 'microbial communities', 'name': 'Sand microcosm microbial communities from a hyporheic zone in Columbia River, Washington, USA - GW-RW T2_23-Sept-14', 'alternative_identifiers': ['img.taxon:3300042741'], 'lat_lon': {'has_raw_value': '46.37228379 -119.2717467', 'latitude': 46.37228379, 'longitude': -119.2717467}, 'habitat': 'sand microcosm', 'ecosystem': 'Engineered', 'env_medium': {'has_raw_value': 'ENVO:01000017'}, 'type': 'nmdc:Biosample'}
Use Case: filter metadata from a known nmdc:Database collection using the MongoDB Query Language.¶
In [4]:
Copied!
def get_json_mql(path, filter_):
    """Fetch ``path`` with ``filter_`` sent as a JSON-encoded ``filter`` query parameter.

    Args:
        path: URL path handed to ``get_json``.
        filter_: JSON-serialisable filter document.

    Returns:
        Whatever ``get_json`` returns for the request.
    """
    query = {"filter": json.dumps(filter_)}
    return get_json(path, params=query)
def resources_count(json_response):
    """Return how many entries sit under the response's "resources" key."""
    resources = json_response["resources"]
    return len(resources)
# Count the biosample documents returned for the filter ecosystem == "Engineered".
engineered_biosamples = get_json_mql(
    "/nmdcschema/biosample_set",
    {"ecosystem": "Engineered"},
)
resources_count(engineered_biosamples)
def get_json_mql(path, filter_):
    """Fetch ``path`` with ``filter_`` sent as a JSON-encoded ``filter`` query parameter.

    Args:
        path: URL path handed to ``get_json``.
        filter_: JSON-serialisable filter document.

    Returns:
        Whatever ``get_json`` returns for the request.
    """
    query = {"filter": json.dumps(filter_)}
    return get_json(path, params=query)
def resources_count(json_response):
    """Return how many entries sit under the response's "resources" key."""
    resources = json_response["resources"]
    return len(resources)
# Count the biosample documents returned for the filter ecosystem == "Engineered".
engineered_biosamples = get_json_mql(
    "/nmdcschema/biosample_set",
    {"ecosystem": "Engineered"},
)
resources_count(engineered_biosamples)
Out[4]:
19
Use Case: filter metadata from studies, biosamples, data_objects, or any activities collection using a readable URL with a Solr-like query language.¶
In [5]:
Copied!
def id_and_ecosystem_fields(doc):
    """Project ``doc`` down to its "id" plus every field whose name starts with "ecosystem"."""
    ecosystem_fields = [name for name in doc if name.startswith("ecosystem")]
    return pick(["id", *ecosystem_fields], doc)
# Studies: filter on a single field.
print("\nStudies filter:\n")
json_response = get_json("/studies?filter=ecosystem_type:Soil")
pprint(meta(json_response))
pprint(list(map(id_and_ecosystem_fields, results(json_response))))

# Data objects: text search on the description, sorted by file size descending.
print("\nData Objects filter and sort:\n")
data_objects_query = "&".join([
    "filter=description.search:GFF",
    "sort=file_size_bytes:desc",
])
json_response = get_json("/data_objects?" + data_objects_query)
pprint(meta(json_response))
data_object_fields = ["description", "file_size_bytes", "id", "url"]
# Only the first five results of the page are shown.
pprint([pick(data_object_fields, r) for r in results(json_response)[:5]])

# Activities: two comma-separated filter clauses, sorted by end time descending.
print("\nActivities filter and sort:\n")
activities_filter = ",".join([
    "started_at_time:>2022-01-01",
    "execution_resource.search:NERSC",
])
json_response = get_json(
    f"/activities?filter={activities_filter}&sort=ended_at_time:desc"
)
pprint(meta(json_response))
activity_fields = [
    "id",
    "started_at_time",
    "ended_at_time",
    "execution_resource",
    "type",
]
pprint([pick(activity_fields, r) for r in results(json_response)[:5]])
def id_and_ecosystem_fields(doc):
    """Project ``doc`` down to its "id" plus every field whose name starts with "ecosystem"."""
    ecosystem_fields = [name for name in doc if name.startswith("ecosystem")]
    return pick(["id", *ecosystem_fields], doc)
# Studies: filter on a single field.
print("\nStudies filter:\n")
json_response = get_json("/studies?filter=ecosystem_type:Soil")
pprint(meta(json_response))
pprint(list(map(id_and_ecosystem_fields, results(json_response))))

# Data objects: text search on the description, sorted by file size descending.
print("\nData Objects filter and sort:\n")
data_objects_query = "&".join([
    "filter=description.search:GFF",
    "sort=file_size_bytes:desc",
])
json_response = get_json("/data_objects?" + data_objects_query)
pprint(meta(json_response))
data_object_fields = ["description", "file_size_bytes", "id", "url"]
# Only the first five results of the page are shown.
pprint([pick(data_object_fields, r) for r in results(json_response)[:5]])

# Activities: two comma-separated filter clauses, sorted by end time descending.
print("\nActivities filter and sort:\n")
activities_filter = ",".join([
    "started_at_time:>2022-01-01",
    "execution_resource.search:NERSC",
])
json_response = get_json(
    f"/activities?filter={activities_filter}&sort=ended_at_time:desc"
)
pprint(meta(json_response))
activity_fields = [
    "id",
    "started_at_time",
    "ended_at_time",
    "execution_resource",
    "type",
]
pprint([pick(activity_fields, r) for r in results(json_response)[:5]])
Studies filter: {'count': 3, 'db_response_time_ms': 12, 'mongo_filter_dict': {'ecosystem_type': 'Soil'}, 'mongo_sort_list': None, 'page': 1, 'per_page': 25} [{'ecosystem': 'Environmental', 'ecosystem_category': 'Terrestrial', 'ecosystem_subtype': 'Unclassified', 'ecosystem_type': 'Soil', 'id': 'gold:Gs0128850'}, {'ecosystem': 'Environmental', 'ecosystem_category': 'Terrestrial', 'ecosystem_subtype': 'Meadow', 'ecosystem_type': 'Soil', 'id': 'gold:Gs0135149'}, {'ecosystem': 'Environmental', 'ecosystem_category': 'Terrestrial', 'ecosystem_subtype': 'Unclassified', 'ecosystem_type': 'Soil', 'id': 'gold:Gs0154044'}] Data Objects filter and sort: {'count': 11556, 'db_response_time_ms': 362, 'mongo_filter_dict': {'description': {'$regex': 'GFF'}}, 'mongo_sort_list': [['file_size_bytes', -1]], 'page': 1, 'per_page': 25} [{'description': 'Prodigal GFF file for gold:Gp0208583', 'file_size_bytes': 3013162589, 'id': 'nmdc:ef4512eba3c1bc0a3c99d1ee7a78270b', 'url': 'https://data.microbiomedata.org/data/nmdc:mga06f4615/annotation/nmdc_mga06f4615_prodigal.gff'}, {'description': 'Prodigal GFF file for gold:Gp0116338', 'file_size_bytes': 2898136903, 'id': 'nmdc:14917611a1d1fe3b4f5dc97ae9ffcb86', 'url': 'https://data.microbiomedata.org/data/nmdc:mga0m894/annotation/nmdc_mga0m894_prodigal.gff'}, {'description': 'Prodigal GFF file for gold:Gp0208578', 'file_size_bytes': 2837716826, 'id': 'nmdc:a206b25e328af837ea708d6a7acb4e9f', 'url': 'https://data.microbiomedata.org/data/nmdc:mga0x8zm48/annotation/nmdc_mga0x8zm48_prodigal.gff'}, {'description': 'Prodigal GFF file for gold:Gp0116337', 'file_size_bytes': 2789133841, 'id': 'nmdc:0fba5e13ecb1c51f9298e89d1bfca222', 'url': 'https://data.microbiomedata.org/data/nmdc:mga0kt39/annotation/nmdc_mga0kt39_prodigal.gff'}, {'description': 'Prodigal GFF file for gold:Gp0208581', 'file_size_bytes': 2728019630, 'id': 'nmdc:274b9d4e3f71858c94e129a93f5a1232', 'url': 
'https://data.microbiomedata.org/data/nmdc:mga0qfj577/annotation/nmdc_mga0qfj577_prodigal.gff'}] Activities filter and sort: {'count': 2054, 'db_response_time_ms': 2591, 'mongo_filter_dict': {'execution_resource': {'$regex': 'NERSC'}, 'started_at_time': {'$gt': '2022-01-01'}}, 'page': 1, 'per_page': 25} [{'ended_at_time': '2022-05-31T12:31:18-07:00', 'execution_resource': 'NERSC-Cori', 'id': 'nmdc:683c4a7adaae08cf5456f7b80bb6f4d3', 'started_at_time': '2022-05-31T12:31:18-07:00', 'type': 'nmdc:MetatranscriptomeAssembly'}, {'ended_at_time': '2022-05-31T12:29:49-07:00', 'execution_resource': 'NERSC-Cori', 'id': 'nmdc:6305a511f040e8bef679b8a2e439329e', 'started_at_time': '2022-05-31T12:29:49-07:00', 'type': 'nmdc:MetatranscriptomeAssembly'}, {'ended_at_time': '2022-05-31T12:29:02-07:00', 'execution_resource': 'NERSC-Cori', 'id': 'nmdc:17b505f7781a3f0e932e8f39f4190068', 'started_at_time': '2022-05-31T12:29:02-07:00', 'type': 'nmdc:MetatranscriptomeAssembly'}, {'ended_at_time': '2022-05-31T12:28:04-07:00', 'execution_resource': 'NERSC-Cori', 'id': 'nmdc:b502fd974951d11591564592ecff731c', 'started_at_time': '2022-05-31T12:28:04-07:00', 'type': 'nmdc:MetatranscriptomeAssembly'}, {'ended_at_time': '2022-05-31T12:26:24-07:00', 'execution_resource': 'NERSC-Cori', 'id': 'nmdc:839560f9650622f232c262d8cf7a9db9', 'started_at_time': '2022-05-31T12:26:24-07:00', 'type': 'nmdc:MetatranscriptomeAssembly'}]
Download (meta)data¶
Use case: download metadata of all biosamples for study.¶
In [6]:
Copied!
def write_jsonlines_file(path, all_results):
    """Write ``all_results`` to ``path`` as JSON Lines (one JSON document per line).

    Args:
        path: Destination file; it is created or truncated.
        all_results: Iterable of JSON-serialisable documents.
    """
    with open(path, "w") as out:
        # Serialise everything before writing, so a document that cannot be
        # encoded never leaves a partially written file behind.
        serialised = "".join(f"{json.dumps(doc)}\n" for doc in all_results)
        out.write(serialised)
# Page through every biosample of the study. The first request passes the
# cursor "*"; each response's meta supplies the cursor for the next request,
# and the loop ends once that cursor is None.
cursor = "*"
all_results = []
while cursor is not None:
    json_response = get_json(
        f"/biosamples?filter=part_of:gold:Gs0110119&cursor={cursor}"
    )
    m, rs = meta(json_response), results(json_response)
    cursor = m['next_cursor']
    print("fetched", len(rs), f"results out of {m['count']} total")
    all_results.extend(rs)

# Expand "~" once so the same concrete path is used for writing and reading.
path = Path("~/biosamples_part_of_gold:Gs0110119.jsonl").expanduser()
write_jsonlines_file(path, all_results)

# Show the first record as raw bytes. Reading the file directly avoids
# interpolating the path into a shell command and does not depend on a `head`
# binary or on shell tilde expansion.
with open(path, "rb") as f:
    first_line = f.readline()
first_line
def write_jsonlines_file(path, all_results):
    """Write ``all_results`` to ``path`` as JSON Lines (one JSON document per line).

    Args:
        path: Destination file; it is created or truncated.
        all_results: Iterable of JSON-serialisable documents.
    """
    with open(path, "w") as out:
        # Serialise everything before writing, so a document that cannot be
        # encoded never leaves a partially written file behind.
        serialised = "".join(f"{json.dumps(doc)}\n" for doc in all_results)
        out.write(serialised)
# Page through every biosample of the study. The first request passes the
# cursor "*"; each response's meta supplies the cursor for the next request,
# and the loop ends once that cursor is None.
cursor = "*"
all_results = []
while cursor is not None:
    json_response = get_json(
        f"/biosamples?filter=part_of:gold:Gs0110119&cursor={cursor}"
    )
    m, rs = meta(json_response), results(json_response)
    cursor = m['next_cursor']
    print("fetched", len(rs), f"results out of {m['count']} total")
    all_results.extend(rs)

# Expand "~" once so the same concrete path is used for writing and reading.
path = Path("~/biosamples_part_of_gold:Gs0110119.jsonl").expanduser()
write_jsonlines_file(path, all_results)

# Show the first record as raw bytes. Reading the file directly avoids
# interpolating the path into a shell command and does not depend on a `head`
# binary or on shell tilde expansion.
with open(path, "rb") as f:
    first_line = f.readline()
first_line
fetched 25 results out of 60 total fetched 25 results out of 60 total fetched 10 results out of 60 total
Out[6]:
b'{"GOLD_sample_identifiers": ["gold:Gb0110680"], "INSDC_biosample_identifiers": ["BIOSAMPLE:SAMN08902828"], "add_date": "2015-02-26", "collection_date": {"has_raw_value": "2014-09-03"}, "depth": {"has_maximum_numeric_value": 0.2, "has_minimum_numeric_value": 0.1, "has_numeric_value": 0.1, "has_raw_value": "0.1 to 0.2 meters", "has_unit": "metre"}, "depth2": {"has_numeric_value": 0.2, "has_raw_value": "0.2 meters", "has_unit": "metre"}, "description": "Grasslands soil microbial communities from the Angelo Coastal Reserve, plot 2. There is a duplicate submission for this entry in NCBI. The NCBI identifiers for a duplicate are PRJNA449266 and SAMN08902828", "ecosystem": "Environmental", "ecosystem_category": "Terrestrial", "ecosystem_subtype": "Grasslands", "ecosystem_type": "Soil", "elev": {"has_numeric_value": 432, "has_raw_value": "432 meters", "has_unit": "metre"}, "env_broad_scale": {"has_raw_value": "grassland biome [ENVO:01000177]", "term": {"id": "ENVO:01000177", "name": "grassland biome"}}, "env_local_scale": {"has_raw_value": "biosphere reserve [ENVO:00000376]", "term": {"id": "ENVO:00000376", "name": "biosphere reserve"}}, "env_medium": {"has_raw_value": "grassland soil [ENVO:00005750]", "term": {"id": "ENVO:00005750", "name": "grassland soil"}}, "geo_loc_name": {"has_raw_value": "USA: California: Angelo Coastal Reserve"}, "habitat": "Grasslands soil", "id": "gold:Gb0110680", "identifier": "14_0903_02_20cm", "lat_lon": {"has_raw_value": "39.7392 -123.6308", "latitude": 39.7392, "longitude": -123.6308}, "location": "USA: California: Angelo Coastal Reserve", "mod_date": "2022-08-02", "name": "Grasslands soil microbial communities from the Angelo Coastal Reserve, California, USA - 14_0903_02_20cm", "ncbi_taxonomy_name": "soil metagenome", "part_of": ["gold:Gs0110119"], "sample_collection_site": "grassland soil", "specific_ecosystem": "Unclassified"}\n'
Use case: download all data objects for biosample¶
In [7]:
Copied!
def download_file(url, directory="~/", *, timeout=60):
    """Stream ``url`` to a file inside ``directory`` and return the file's name.

    Args:
        url: Address of the file; the text after its last "/" becomes the
            local file name.
        directory: Destination directory ("~" is expanded). A trailing slash
            is optional.
        timeout: Seconds to wait on the server before giving up; ``None``
            waits indefinitely.

    Returns:
        The local file name (without the directory part).

    Raises:
        requests.HTTPError: if the server answers with an error status. No
            file is created in that case.
    """
    local_filename = url.split('/')[-1]
    # Join as path components so a directory given without a trailing slash
    # does not get fused with the file name.
    target = Path(directory).expanduser() / local_filename
    with requests.get(url, stream=True, timeout=timeout) as r:
        # Fail before opening the file, so an error page is never saved as data.
        r.raise_for_status()
        with open(target, 'wb') as f:
            # iter_content undoes gzip/deflate transfer encodings, which
            # reading r.raw directly does not.
            for chunk in r.iter_content(chunk_size=64 * 1024):
                f.write(chunk)
    return local_filename
id_biosample = "igsn:IEWFS000A"

# Omics-processing activities that list the biosample as an input.
rs_ompro = results(get_json(f"/activities?filter=type:nmdc:OmicsProcessing,has_input:{id_biosample}"))
ompro_ids = [d["id"] for d in rs_ompro]

for id_ompro in tqdm(ompro_ids):
    # Activities informed by this omics-processing activity.
    rs_act = results(get_json(f"/activities?filter=was_informed_by:{id_ompro}"))
    # Extract (has_output, type) for every activity up front, before any download starts.
    outputs_and_types = list(map(itemgetter("has_output", "type"), rs_act))
    for data_object_ids, activity_type in outputs_and_types:
        for data_object_id in data_object_ids:
            # Resolve the data object's record to learn its download URL.
            matches = results(get_json(f"/data_objects?filter=id:{data_object_id}"))
            do = matches[0]
            message = (
                f'downloading biosample {id_biosample} > omics processing activity {id_ompro} '
                f'> {activity_type} activity > data object {data_object_id} from {do["url"]}...'
            )
            print(message)
            download_file(do["url"])
def download_file(url, directory="~/", *, timeout=60):
    """Stream ``url`` to a file inside ``directory`` and return the file's name.

    Args:
        url: Address of the file; the text after its last "/" becomes the
            local file name.
        directory: Destination directory ("~" is expanded). A trailing slash
            is optional.
        timeout: Seconds to wait on the server before giving up; ``None``
            waits indefinitely.

    Returns:
        The local file name (without the directory part).

    Raises:
        requests.HTTPError: if the server answers with an error status. No
            file is created in that case.
    """
    local_filename = url.split('/')[-1]
    # Join as path components so a directory given without a trailing slash
    # does not get fused with the file name.
    target = Path(directory).expanduser() / local_filename
    with requests.get(url, stream=True, timeout=timeout) as r:
        # Fail before opening the file, so an error page is never saved as data.
        r.raise_for_status()
        with open(target, 'wb') as f:
            # iter_content undoes gzip/deflate transfer encodings, which
            # reading r.raw directly does not.
            for chunk in r.iter_content(chunk_size=64 * 1024):
                f.write(chunk)
    return local_filename
id_biosample = "igsn:IEWFS000A"

# Omics-processing activities that list the biosample as an input.
rs_ompro = results(get_json(f"/activities?filter=type:nmdc:OmicsProcessing,has_input:{id_biosample}"))
ompro_ids = [d["id"] for d in rs_ompro]

for id_ompro in tqdm(ompro_ids):
    # Activities informed by this omics-processing activity.
    rs_act = results(get_json(f"/activities?filter=was_informed_by:{id_ompro}"))
    # Extract (has_output, type) for every activity up front, before any download starts.
    outputs_and_types = list(map(itemgetter("has_output", "type"), rs_act))
    for data_object_ids, activity_type in outputs_and_types:
        for data_object_id in data_object_ids:
            # Resolve the data object's record to learn its download URL.
            matches = results(get_json(f"/data_objects?filter=id:{data_object_id}"))
            do = matches[0]
            message = (
                f'downloading biosample {id_biosample} > omics processing activity {id_ompro} '
                f'> {activity_type} activity > data object {data_object_id} from {do["url"]}...'
            )
            print(message)
            download_file(do["url"])
0%| | 0/22 [00:00<?, ?it/s]
downloading biosample igsn:IEWFS000A > omics processing activity emsl:705701 > nmdc:NomAnalysisActivity activity > data object nmdc:2a779b0132303d5999c6f7c99915fd34 from https://nmdcdemo.emsl.pnnl.gov/nom/results/Brodie_134A_CHCl3_15Oct18_IAT_p1_1_01_35922.csv... downloading biosample igsn:IEWFS000A > omics processing activity emsl:705702 > nmdc:NomAnalysisActivity activity > data object nmdc:9f0d52cc46d247b8d2ba12d5842b9fb6 from https://nmdcdemo.emsl.pnnl.gov/nom/results/Brodie_134A_H2O_15Oct18_IAT_p1_1_01_35893.csv... downloading biosample igsn:IEWFS000A > omics processing activity emsl:705703 > nmdc:NomAnalysisActivity activity > data object nmdc:e3449444d03be27addeaca224ce9a3a3 from https://nmdcdemo.emsl.pnnl.gov/nom/results/Brodie_134A_MeOH_15Oct18_IAT_p1_1_01_35910.csv... downloading biosample igsn:IEWFS000A > omics processing activity emsl:705704 > nmdc:NomAnalysisActivity activity > data object nmdc:ed4d444af3672b33bf43a1d5b6dd1ca9 from https://nmdcdemo.emsl.pnnl.gov/nom/results/Brodie_134B_CHCl3_15Oct18_IAT_p1_1_01_35927.csv... downloading biosample igsn:IEWFS000A > omics processing activity emsl:705705 > nmdc:NomAnalysisActivity activity > data object nmdc:4296e45e09241a4ac76202d4f1f40458 from https://nmdcdemo.emsl.pnnl.gov/nom/results/Brodie_134B_H2O_15Oct18_IAT_p1_1_01_35898.csv... downloading biosample igsn:IEWFS000A > omics processing activity emsl:705706 > nmdc:NomAnalysisActivity activity > data object nmdc:031d764e78bd55a9247ee11fcf407587 from https://nmdcdemo.emsl.pnnl.gov/nom/results/Brodie_134B_H2O_SPE_15Oct18_IAT_p05_1_01_35903.csv... downloading biosample igsn:IEWFS000A > omics processing activity emsl:705707 > nmdc:NomAnalysisActivity activity > data object nmdc:f2f58fc563ac5836762826b0e6db6f48 from https://nmdcdemo.emsl.pnnl.gov/nom/results/Brodie_134B_MeOH_15Oct18_IAT_p1_1_01_35915.csv... 
downloading biosample igsn:IEWFS000A > omics processing activity emsl:713600 > nmdc:NomAnalysisActivity activity > data object nmdc:d7bda0fca4304e2699eed4be527c81de from https://nmdcdemo.emsl.pnnl.gov/nom/results/Brodie_134_r3_H2O_30Nov18_IATp1_1_01_37902.csv... downloading biosample igsn:IEWFS000A > omics processing activity emsl:715134 > nmdc:NomAnalysisActivity activity > data object nmdc:f905c863ed0369cc3f83a5cbc46561d4 from https://nmdcdemo.emsl.pnnl.gov/nom/results/Brodie_134_H2O_3_IATp1_07Dec18_1_01_38232.csv... downloading biosample igsn:IEWFS000A > omics processing activity emsl:717358 > nmdc:NomAnalysisActivity activity > data object nmdc:2d795aad78f1ddba9e3f7add02fc259f from https://nmdcdemo.emsl.pnnl.gov/nom/results/Brodie_134_H2O_1_12Dec18_IATp1_1_01_38495.csv... downloading biosample igsn:IEWFS000A > omics processing activity emsl:717359 > nmdc:NomAnalysisActivity activity > data object nmdc:ce403bfa29fdce24d6047570f0336c48 from https://nmdcdemo.emsl.pnnl.gov/nom/results/Brodie_134_H2O_2_12Dec18_IATp1_1_01_38496.csv... downloading biosample igsn:IEWFS000A > omics processing activity emsl:717360 > nmdc:NomAnalysisActivity activity > data object nmdc:119d7c652e0c29e32c5066cc987b17ff from https://nmdcdemo.emsl.pnnl.gov/nom/results/Brodie_134_H2O_3_12Dec18_IATp1_1_01_38497.csv... downloading biosample igsn:IEWFS000A > omics processing activity emsl:717365 > nmdc:NomAnalysisActivity activity > data object nmdc:4b649d353b2c2385ab042682ba516d14 from https://nmdcdemo.emsl.pnnl.gov/nom/results/Brodie_134_H2O_1_13Dec18_IATp15_1_01_38565.csv...
Load (meta)data objects to in-memory Python objects¶
Use case: load metadata of all biosamples for study.¶
In [8]:
Copied!
# Collect every page of the study's biosamples into memory. The first request
# passes the cursor "*"; each response's meta supplies the next cursor, and a
# cursor of None marks the last page.
all_results = []
cursor = "*"
while True:
    json_response = get_json(
        f"/biosamples?filter=part_of:gold:Gs0110119&cursor={cursor}"
    )
    m = meta(json_response)
    rs = results(json_response)
    cursor = m['next_cursor']
    print("fetched", len(rs), f"results out of {m['count']} total")
    all_results.extend(rs)
    if cursor is None:
        break

# Preview the id and coordinates of the first five biosamples.
preview_fields = ["id", "lat_lon"]
pprint([pick(preview_fields, r) for r in all_results[:5]])
# Collect every page of the study's biosamples into memory. The first request
# passes the cursor "*"; each response's meta supplies the next cursor, and a
# cursor of None marks the last page.
all_results = []
cursor = "*"
while True:
    json_response = get_json(
        f"/biosamples?filter=part_of:gold:Gs0110119&cursor={cursor}"
    )
    m = meta(json_response)
    rs = results(json_response)
    cursor = m['next_cursor']
    print("fetched", len(rs), f"results out of {m['count']} total")
    all_results.extend(rs)
    if cursor is None:
        break

# Preview the id and coordinates of the first five biosamples.
preview_fields = ["id", "lat_lon"]
pprint([pick(preview_fields, r) for r in all_results[:5]])
fetched 25 results out of 60 total fetched 25 results out of 60 total fetched 10 results out of 60 total [{'id': 'gold:Gb0110680', 'lat_lon': {'has_raw_value': '39.7392 -123.6308', 'latitude': 39.7392, 'longitude': -123.6308}}, {'id': 'gold:Gb0110681', 'lat_lon': {'has_raw_value': '39.7392 -123.6308', 'latitude': 39.7392, 'longitude': -123.6308}}, {'id': 'gold:Gb0110682', 'lat_lon': {'has_raw_value': '39.7392 -123.6308', 'latitude': 39.7392, 'longitude': -123.6308}}, {'id': 'gold:Gb0110683', 'lat_lon': {'has_raw_value': '39.7392 -123.6308', 'latitude': 39.7392, 'longitude': -123.6308}}, {'id': 'gold:Gb0110684', 'lat_lon': {'has_raw_value': '39.7392 -123.6308', 'latitude': 39.7392, 'longitude': -123.6308}}]
Use case: load data object¶
In [9]:
Copied!
def load_bytes(url, *, timeout=60):
    """Fetch ``url`` and return the response body as ``bytes``.

    Args:
        url: Address of the object to load.
        timeout: Seconds to wait on the server before giving up; ``None``
            waits indefinitely.

    Returns:
        The body with any gzip/deflate transfer encoding already undone.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    with requests.get(url, timeout=timeout) as r:
        # Without this an error page would be returned as if it were the data.
        r.raise_for_status()
        # .content decodes transfer encodings; copying r.raw would hand back
        # still-compressed bytes when the server compresses the response.
        return r.content
# Read the data object's record to find its download URL, fetch the bytes,
# and print them as UTF-8 text.
data_object = get_json("/nmdcschema/data_object_set/nmdc:4b649d353b2c2385ab042682ba516d14")
b = load_bytes(data_object["url"])
print(b.decode('utf-8'))
def load_bytes(url, *, timeout=60):
    """Fetch ``url`` and return the response body as ``bytes``.

    Args:
        url: Address of the object to load.
        timeout: Seconds to wait on the server before giving up; ``None``
            waits indefinitely.

    Returns:
        The body with any gzip/deflate transfer encoding already undone.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    with requests.get(url, timeout=timeout) as r:
        # Without this an error page would be returned as if it were the data.
        r.raise_for_status()
        # .content decodes transfer encodings; copying r.raw would hand back
        # still-compressed bytes when the server compresses the response.
        return r.content
# Read the data object's record to find its download URL, fetch the bytes,
# and print them as UTF-8 text.
data_object = get_json("/nmdcschema/data_object_set/nmdc:4b649d353b2c2385ab042682ba516d14")
b = load_bytes(data_object["url"])
print(b.decode('utf-8'))
Index,m/z,Calibrated m/z,Calculated m/z,Peak Height,Resolving Power,S/N,Ion Charge,m/z Error (ppm),m/z Error Score,Isotopologue Similarity,Confidence Score,DBE,H/C,O/C,Heteroatom Class,Ion Type,Is Isotopologue,Mono Isotopic Index,Molecular Formula,C,H,O,S,N 3,888.4887543119316,888.4883146532127,888.4878482725782,3085144.7556615113,115985.22008880168,8.926135177919196,-1,-0.5249150401183273,0.21637236558431258,0.0,0.12982341935058755,21.0,1.290909090909091,0.12727272727272726,N1 S1 O7,de-protonated,0,,C55 H71 O7 S1 N1,55,71,7,1,1 4,885.3918562712611,885.3914165190189,885.3914239992979,3271768.2235989384,116390.47009957765,9.466085953040722,-1,0.008448555979629386,0.999603533623901,0.0,0.5997621201743406,16.0,1.3478260869565217,0.3695652173913043,O17,de-protonated,0,,C46 H62 O17,46,62,17,, 6,877.03548957106,877.0350496100808,877.035237742867,3413803.0920212218,93999.77607505002,9.877030182866147,-1,0.2145099513797522,0.7744236389320375,0.0,0.4646541833592225,32.0,0.5238095238095238,0.47619047619047616,S1 O20,de-protonated,0,,C42 H22 O20 S1,42,22,20,1, 7,871.5868128663766,871.5863728035725,871.5857118408668,2525810.108505442,118234.47009250808,7.307832937466206,-1,-0.758345044867053,0.04096943704460999,0.0,0.024581662226765992,23.0,1.2903225806451613,0.016129032258064516,S1 O1,de-protonated,0,,C62 H80 O1 S1,62,80,1,1, 8,869.7066517807076,869.7062116890468,869.7062208994556,2533183.851969944,118489.97010116033,7.329167116619603,-1,0.010590252923018975,0.999377119309806,0.0,0.5996262715858836,7.0,1.7818181818181817,0.09090909090909091,S1 O5,de-protonated,0,,C55 H98 O5 S1,55,98,5,1, 11,862.1881537094312,862.18771353458,862.18698735984,2500237.533314158,95618.37607241525,7.233844751794849,-1,-0.8422473901817746,0.019428704336239354,0.0,0.011657222601743612,19.0,1.0789473684210527,0.5263157894736842,N1 S1 O20,de-protonated,0,,C38 H41 O20 S1 N1,38,41,20,1,1 
12,841.990360022908,841.9899198796957,841.9907299023058,2370457.287293944,97912.17607420433,6.858356367570994,-1,0.9620326939315207,0.005847710725539058,0.0,0.003508626435323435,35.0,0.325,0.525,N1 O21,de-protonated,0,,C40 H13 O21 N1,40,13,21,,1 13,838.2477426554628,838.2473025589466,838.2480119249798,2449834.394824783,122936.47008928134,7.088015216009881,-1,0.846248393230588,0.018713029385901064,0.0,0.011227817631540639,33.0,0.7884615384615384,0.15384615384615385,N1 S1 O8,de-protonated,0,,C52 H41 O8 S1 N1,52,41,8,1,1 14,830.2707204394443,830.2702804850616,830.2699291394654,2775378.1338916654,82745.31339509624,8.029898871842809,-1,-0.423170325523773,0.3697786516541105,0.0,0.2218671909924663,17.0,1.225,0.4,N1 S1 O16,de-protonated,0,,C40 H49 O16 S1 N1,40,49,16,1,1 15,820.3453445049055,820.3449048083498,820.3455960048715,2743261.455279875,100495.57607049527,7.936976873862471,-1,0.8425674801075833,0.019370581727126596,0.0,0.011622349036275957,5.0,1.7878787878787878,0.6666666666666666,N1 O22,de-protonated,0,,C33 H59 O22 N1,33,59,22,,1 17,799.3685325405778,799.3680936843464,799.367398784561,3118598.9451273754,103132.77607188243,9.022926946568637,-1,-0.8693121415769531,0.015020289920773781,0.0,0.009012173952464269,23.0,1.12,0.14,S1 O7,de-protonated,0,,C50 H56 O7 S1,50,56,7,1, 18,780.4733701715519,780.4729324155969,780.4725921312933,2463579.1177946297,105629.57606904797,7.127782314453726,-1,-0.43599776224277864,0.3478172613374125,0.0,0.2086903568024475,8.0,1.6904761904761905,0.23809523809523808,N1 S1 O10,de-protonated,0,,C42 H71 O10 S1 N1,42,71,10,1,1 22,653.5153865174419,653.5149645932129,653.5150488223496,2773339.3701903233,157687.97009893385,8.024000192255532,-1,0.12888630016707295,0.9118433198656568,0.0,0.547105991919394,8.0,1.6666666666666667,0.11904761904761904,O5,de-protonated,0,,C42 H70 O5,42,70,5,, 
35,450.26572507976397,450.2653590871265,450.2649821433228,21202835.25855894,228867.72008897978,61.345378794865574,-1,-0.8371599361487543,0.010186573708662465,0.766236582456902,0.31260657720795826,11.0,1.3214285714285714,0.14285714285714285,N1 O4,de-protonated,0,,C28 H37 O4 N1,28,37,4,,1 37,283.2645638441444,283.2642719869493,283.2642538675084,4273299.506825067,291039.1760681066,12.36377936032282,-1,-0.06396656369622783,0.9775246355422157,0.0,0.5865147813253294,1.0,2.0,0.1111111111111111,O2,de-protonated,0,,C18 H36 O2,18,36,2,, 38,273.0618794463733,273.06159294231406,273.06159087568057,2432007.4278552043,377392.22020885994,7.036437112035749,-1,-0.007568378593162039,0.9996818264361175,0.0,0.5998090958616704,5.0,1.2727272727272727,0.7272727272727273,O8,de-protonated,0,,C11 H14 O8,11,14,8,, 39,255.23321441074,255.2329374887667,255.23295373855242,6820572.259571557,323002.9760412817,19.73370937230056,-1,0.06366648772467076,0.9777326510838139,0.0,0.5866395906502884,1.0,2.0,0.125,O2,de-protonated,0,,C16 H32 O2,16,32,2,, 0,893.9807192795649,893.9802798081946,,2925279.2994384016,92217.97607563925,8.463602368102553,-1,,,,,,,,unassigned,,,,,,,,, 1,891.2401722007025,891.239732632407,,3414246.805595051,92501.5760653175,9.87831396293287,-1,,,,,,,,unassigned,,,,,,,,, 2,889.1174652085989,889.1170255699338,,2955400.261042048,77268.8133940652,8.550750231900409,-1,,,,,,,,unassigned,,,,,,,,, 5,884.5134572608324,884.5130174836548,,5254152.640152923,116506.4700934881,15.201645441550752,-1,,,,,,,,unassigned,,,,,,,,, 9,864.2012204090391,864.2007802514113,,3030984.065495402,119244.47008147289,8.769434056889162,-1,,,,,,,,unassigned,,,,,,,,, 10,862.8953391929115,862.89489902369,,3055128.0467927232,119424.97008838368,8.83928894470198,-1,,,,,,,,unassigned,,,,,,,,, 16,803.9459525978525,803.9455135240407,,5601357.72374313,170909.29346210908,16.20620106407045,-1,,,,,,,,unassigned,,,,,,,,, 
19,752.2242722248513,752.223836720794,,3273358.7560336525,136995.72010069533,9.470687781687843,-1,,,,,,,,unassigned,,,,,,,,, 20,750.7742477178517,750.7738123490101,,2904412.8971422175,137259.97009147579,8.40323037835044,-1,,,,,,,,unassigned,,,,,,,,, 21,653.5198929566078,653.519471031556,,3485404.0251306985,210248.96017618917,10.084190513553223,-1,,,,,,,,unassigned,,,,,,,,, 23,619.4881452383327,619.4877300537637,,6028785.5745970225,95057.26862897968,17.442862251046506,-1,,,,,,,,unassigned,,,,,,,,, 24,619.4845026976315,619.4840875138403,,6626103.056320465,221800.29347879274,19.171058821471128,-1,,,,,,,,unassigned,,,,,,,,, 25,618.4823634993785,618.4819485300792,,67607054.28433818,166619.97011002325,195.60498884712098,-1,,,,,,,,unassigned,,,,,,,,, 26,605.4715778756023,605.4711657741412,,7625159.61822322,226933.62680355,22.06159220910761,-1,,,,,,,,unassigned,,,,,,,,, 27,605.4681309396186,605.4677188389377,,6698551.287281573,97257.26863065724,19.380670607684152,-1,,,,,,,,unassigned,,,,,,,,, 28,604.4669880341766,604.4665761605637,,90288493.75827102,170483.22009523786,261.2283585131924,-1,,,,,,,,unassigned,,,,,,,,, 29,591.4558841306729,591.4554752911807,,3570435.8944568695,232311.29346066722,10.330210075080675,-1,,,,,,,,unassigned,,,,,,,,, 30,591.4525206690176,591.4521118303297,,4196033.12055477,232312.62681350167,12.140227383614969,-1,,,,,,,,unassigned,,,,,,,,, 31,591.4362071597824,591.4357983249955,,4325198.308595796,99565.84004350418,12.513936243343444,-1,,,,,,,,unassigned,,,,,,,,, 32,590.4511390833583,590.450730484585,,33051017.328779608,174529.72008540362,95.62528562170476,-1,,,,,,,,unassigned,,,,,,,,, 33,574.4557295997837,574.4553249571502,,57270445.86539885,179389.47009937343,165.6984621406013,-1,,,,,,,,unassigned,,,,,,,,, 34,509.29372586837684,509.2933397523243,,3016102.9919072534,115624.41149005273,8.726379197243881,-1,,,,,,,,unassigned,,,,,,,,, 
36,311.1007284631918,311.10042248349583,,2602658.391103224,220831.98006834995,7.530175230287296,-1,,,,,,,,unassigned,,,,,,,,,
In [ ]:
Copied!