Proteomic Data Aggregation and Visualization¶

This notebook demonstrates aggregation of proteomic data via the National Microbiome Data Collaborative (NMDC)'s Runtime API. It uses the nmdc_api_utilities package (as of March 2025) for this exploration: nmdc_api_utilities objects make NMDC API requests easier, and its utility functions help with data processing tasks. More information about the package can be found here. The notebook highlights how the NMDC's schema can be used to overcome some of the numerous challenges associated with this type of aggregation. Please note that this notebook is intended for individuals with experience performing mass spectrometry-based proteomic analyses, and that various parameter and processing choices were made for this example use case; they are not broadly applicable and should be adjusted as needed.

Notebook Steps:

  1. Assess background information and collect datasets for an example study of riverbed sediment along the Columbia River

  2. Apply a spectral probability filter across the data that optimizes the number of identifications for an FDR of 0.05

  3. Collapse to unique peptides and normalize quantification

  4. Extract functional gene annotations for proteins

  5. Generate annotation and protein mappings for peptides using the "Razor" strategy

  6. Perform protein rollup using the "Razor" results and summarize into an aggregated table of relative protein abundance

Define a function to split a list into chunks¶

Since we will need to use lists of ids to query collections in the API, we need to limit the number of ids placed in a single query. This function splits a list into chunks; chunk_size defaults to 100 but can be adjusted.

In [3]:
# Define a function to split ids into chunks
def split_list(input_list, chunk_size=100):
    result = []
    
    for i in range(0, len(input_list), chunk_size):
        result.append(input_list[i:i + chunk_size])
        
    return result
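For instance, with a lowered chunk_size the behavior is easy to see (the function is redefined here so the snippet stands alone):

```python
# Same chunking logic as above, shown on a small list with chunk_size=2
def split_list(input_list, chunk_size=100):
    result = []
    for i in range(0, len(input_list), chunk_size):
        result.append(input_list[i:i + chunk_size])
    return result

print(split_list(['id1', 'id2', 'id3', 'id4', 'id5'], chunk_size=2))
# [['id1', 'id2'], ['id3', 'id4'], ['id5']]
```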

Import libraries and Python scripts containing the functions necessary to run this notebook. 'aggregation_functions.py' (also in this folder) includes spectral probability filtering and protein mapping functions. nmdc_api_utilities includes functions for API traversal of the collections endpoint.

In [5]:
import requests
from io import StringIO
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import sys
from IPython.display import Image, display
import nmdc_api_utilities

if 'google.colab' in sys.modules:
    #module in this folder with specific protein aggregation functions
    !wget https://raw.githubusercontent.com/microbiomedata/nmdc_notebooks/refs/heads/main/proteomic_aggregation/python/aggregation_functions.py
    import aggregation_functions as agg_func

    #pmartR logo
    !wget https://raw.githubusercontent.com/microbiomedata/nmdc_notebooks/refs/heads/main/proteomic_aggregation/pmartR_logo_final.jpg

else:
    #module in this folder with specific protein aggregation functions
    import aggregation_functions as agg_func

1) Assess background information and collect data for an example study of riverbed sediment along the Columbia River¶

Review the example study on the NMDC data portal. Use the study id embedded in the url (nmdc:sty-11-aygzgv51) to collect all related data objects via the nmdc_api_utilities package and reformat the JSON output into a pandas dataframe. These data objects reference both input files (e.g. raw, gff) and output files (e.g. metaproteomic results) to the NMDC workflows. To get the data objects, we will create a DataObjectSearch object and use the get_data_objects_for_studies() function.

In [6]:
pd.set_option("display.max_rows", 6)
from nmdc_api_utilities.data_object_search import DataObjectSearch
ENV = "prod"  # NMDC API environment ("prod" = production); assumed, since ENV is not defined elsewhere in this notebook
do_client = DataObjectSearch(env=ENV)
#get all data objects associated with this study id using the NMDC find endpoint
data = do_client.get_data_objects_for_studies(study_id='nmdc:sty-11-aygzgv51')
data = pd.DataFrame(data)

#reformat data into dataframe (keeping biosample id)
data_objects=[]
for index, row in data.iterrows():
    bio_id = row['biosample_id']
    row_out = pd.json_normalize(row['data_objects'])
    row_out['biosample_id'] = bio_id
    data_objects.append(row_out)

data_objects = pd.concat(data_objects)[['id','name','file_size_bytes','data_object_type','md5_checksum','url','biosample_id','in_manifest']]
display(data_objects)

del data, index, row, row_out, bio_id
id name file_size_bytes data_object_type md5_checksum url biosample_id in_manifest
0 nmdc:dobj-11-gxq95j86 nmdc_wfmgan-11-bnzfrb88.1_prodigal.gff 90723471 Prodigal Annotation GFF 47f381a56527a186939f638fa0486d5e https://data.microbiomedata.org/data/nmdc:ompr... nmdc:bsm-13-0jw5n594 NaN
1 nmdc:dobj-11-b8aph505 nmdc_wfmgan-11-bnzfrb88.1_imgap.info 470 Annotation Info File 2854a85740fdd20dd61bbe5c9897cc24 https://data.microbiomedata.org/data/nmdc:ompr... nmdc:bsm-13-0jw5n594 NaN
2 nmdc:dobj-11-td9k2w62 511406_Froze_Core_2015_N1_0_10_13_Lip_POS_28Ju... 12668 Configuration toml 6c5596d6f9f0446f1a0a628ff25a860a https://nmdcdemo.emsl.pnnl.gov/lipidomics/steg... nmdc:bsm-13-0jw5n594 NaN
... ... ... ... ... ... ... ... ...
7 nmdc:dobj-13-kay9ez18 Unground_SBR_Spring_2014_FC_N3_20-30_MeOHExt_2... 74228 Direct Infusion FT-ICR MS Analysis Results afc41e08ea7aba78e8fe3d3489db64c0 https://nmdcdemo.emsl.pnnl.gov/nom/results/Ung... nmdc:bsm-13-zq681s85 NaN
8 nmdc:dobj-13-c6rdvy62 output: Unground_SBR_Spring_2014_FC_N3_20-30_H... 30556452 Direct Infusion FT ICR-MS Raw Data NaN NaN nmdc:bsm-13-zq681s85 NaN
9 nmdc:dobj-13-xq8sqj89 Unground_SBR_Spring_2014_FC_N3_20-30_H2Oext_22... 10636 Direct Infusion FT-ICR MS Analysis Results a47e870bdf10041849e36f66ab34177a https://nmdcdemo.emsl.pnnl.gov/nom/results/Ung... nmdc:bsm-13-zq681s85 NaN

3515 rows × 8 columns

Subset the data objects to 'Unfiltered Metaproteomics Results'. These files contain the proteomic workflow outputs that will be used for proteomic aggregation.

In [7]:
proteomic_output_df = data_objects[data_objects['data_object_type']=='Unfiltered Metaproteomics Results'].reset_index(drop=True).rename(columns={'id':'processed_DO_id'})
#how many result files exist for each biosample?
display(proteomic_output_df.groupby('biosample_id')['data_object_type'].value_counts().reset_index())
biosample_id data_object_type count
0 nmdc:bsm-13-0jw5n594 Unfiltered Metaproteomics Results 2
1 nmdc:bsm-13-13145k83 Unfiltered Metaproteomics Results 2
2 nmdc:bsm-13-1p0tct86 Unfiltered Metaproteomics Results 2
... ... ... ...
28 nmdc:bsm-13-xvf06c81 Unfiltered Metaproteomics Results 2
29 nmdc:bsm-13-yjp2dx65 Unfiltered Metaproteomics Results 2
30 nmdc:bsm-13-zms1jq91 Unfiltered Metaproteomics Results 2

31 rows × 3 columns

Since there is more than one result file for each biosample, choose a workflow version to use in the analysis.

In [8]:
#Instantiate instances for nmdc_api_utilities
from nmdc_api_utilities.workflow_execution_search import WorkflowExecutionSearch
from nmdc_api_utilities.data_processing import DataProcessing
# since we are querying the workflow collection, we need to create an instance of it
we_client = WorkflowExecutionSearch(env=ENV)
# create an instance of the data processing object
dp_client = DataProcessing()

# create a list of lists of processed proteomic result ids to query the workflow execution objects
chunked_list = split_list(proteomic_output_df['processed_DO_id'].unique().tolist())
version_output = []
# loop through the chunked list of ids
for chunk in chunked_list:
    # create the filter - query the WorkflowExecutionSearch object looking for records
    filter_list = dp_client._string_mongo_list(chunk)
    filter = f'{{"has_output": {{"$in": {filter_list}}}}}'
    # get the results
    version_output += we_client.get_record_by_filter(filter=filter,fields="has_output,version", max_page_size=100, all_pages=True)


#normalize the json columns and merge
proteomic_output_df = (
    pd.json_normalize(version_output)
    .rename(columns={'id': 'workflow_id', 'version': 'workflow_version'})
    .explode('has_output')
    .drop_duplicates()
    .merge(proteomic_output_df, left_on='has_output', right_on='processed_DO_id')
    .drop('has_output', axis=1)
)

proteomic_output_df = proteomic_output_df[proteomic_output_df['workflow_version'].str.contains('v1.2.1',na=False)]

proteomic_output_df
Out[8]:
workflow_id workflow_version processed_DO_id name file_size_bytes data_object_type md5_checksum url biosample_id in_manifest
3 nmdc:wfmp-11-j2tktj20.1 v1.2.1 nmdc:dobj-11-2k6s1505 nmdc_dobj-11-ms76kj12_nmdc_dobj-11-c5av0320_ms... 5471346 Unfiltered Metaproteomics Results 1badc622f20cf869077284966bb2810f https://nmdcdemo.emsl.pnnl.gov/proteomics/resu... nmdc:bsm-13-8e1rjf10 [nmdc:manif-11-7796sg87]
4 nmdc:wfmp-11-3z777070.1 v1.2.1 nmdc:dobj-11-2szyc560 nmdc_dobj-11-tezmee57_nmdc_dobj-11-w2c1ee09_ms... 8117239 Unfiltered Metaproteomics Results fd8a212ad33a8162857d0ca164760333 https://nmdcdemo.emsl.pnnl.gov/proteomics/resu... nmdc:bsm-13-vqk1y255 [nmdc:manif-11-7796sg87]
5 nmdc:wfmp-11-7h4m6t36.1 v1.2.1 nmdc:dobj-11-301yh759 nmdc_dobj-11-zd25ft61_nmdc_dobj-11-vek6s281_ms... 10756563 Unfiltered Metaproteomics Results c82e8026a2d974cb4f9a75b6eef5c62b https://nmdcdemo.emsl.pnnl.gov/proteomics/resu... nmdc:bsm-13-yjp2dx65 [nmdc:manif-11-7796sg87]
... ... ... ... ... ... ... ... ... ... ...
59 nmdc:wfmp-11-k5qcay19.1 v1.2.1 nmdc:dobj-11-z8hc0j30 nmdc_dobj-11-hp2vmj63_nmdc_dobj-11-716e7y35_ms... 8376109 Unfiltered Metaproteomics Results 8785b9538d977a50fd35d11bfcb037ce https://nmdcdemo.emsl.pnnl.gov/proteomics/resu... nmdc:bsm-13-2fw5j754 [nmdc:manif-11-7796sg87]
60 nmdc:wfmp-11-f6sfn088.1 v1.2.1 nmdc:dobj-11-zhz66007 nmdc_dobj-11-mzxj8743_nmdc_dobj-11-pxd5ez06_ms... 9983187 Unfiltered Metaproteomics Results d596283fd19d30d8708afab0a8677e67 https://nmdcdemo.emsl.pnnl.gov/proteomics/resu... nmdc:bsm-13-as680w10 [nmdc:manif-11-7796sg87]
61 nmdc:wfmp-11-xm389s70.1 v1.2.1 nmdc:dobj-11-zn9eyg46 nmdc_dobj-11-6n49rm30_nmdc_dobj-11-nb9f8z34_ms... 2321178 Unfiltered Metaproteomics Results 8eb29a562832f1a8862a5a643e297389 https://nmdcdemo.emsl.pnnl.gov/proteomics/resu... nmdc:bsm-13-sepcxd06 [nmdc:manif-11-7796sg87]

31 rows × 10 columns

There are various requirements that enable mass spectrometry runs to be aggregated and analyzed together. For example, runs need to be performed in succession, on the same instrument. The NMDC schema can make it easier to find these proteomic results by linking them via a slot called in_manifest.

Look at the in_manifest id on these proteomic outputs to confirm that all runs are in the same manifest record, and pull that record. If that manifest record's manifest_category value is 'instrument_run', then it confirms that these are LC-MS/MS runs that were performed in succession on the same instrument. Proteomic outputs from different manifest records should not be aggregated.

Using the nmdc_api_utilities package, we create a ManifestSearch object, create the needed filter, and use the get_record_by_filter() function to get records.
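As an aside, the f-string filters used in these cells can also be built with json.dumps, which handles quoting automatically; a sketch of that alternative (the id below is this study's manifest id):

```python
import json

# Build the same MongoDB-style "$in" filter string without manual formatting
chunk = ["nmdc:manif-11-7796sg87"]
filter_str = json.dumps({"id": {"$in": chunk}})
print(filter_str)  # {"id": {"$in": ["nmdc:manif-11-7796sg87"]}}
```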

In [9]:
from nmdc_api_utilities.manifest_search import ManifestSearch
#extract in_manifest ids for this study (in this case a single one available)
manifest_id = proteomic_output_df.explode('in_manifest')['in_manifest'].unique()
# since we are querying the manifest collection, we need to create an instance of it
m_client = ManifestSearch(env=ENV)
# create a list of lists of ids to query the ManifestSearch object
chunked_list = split_list(proteomic_output_df['in_manifest'].explode().unique().tolist())
manifest = []
# loop through the chunked list of ids
for chunk in chunked_list:
    # create the filter - query the ManifestSearch object looking for records where the id field is in the chunk of ids from proteomic_output_df
    filter_list = dp_client._string_mongo_list(chunk)
    filter = f'{{"id": {{"$in": {filter_list}}}}}'
    # get the results
    manifest += m_client.get_record_by_filter(filter=filter, max_page_size=100, all_pages=True)

display(manifest)
[{'id': 'nmdc:manif-11-7796sg87',
  'manifest_category': 'instrument_run',
  'type': 'nmdc:Manifest',
  'description': 'collection of metaproteomic analyses from the same instrument run nmdc:sty-11-aygzgv51'}]

Look at an example of the information in 'Unfiltered Metaproteomics Results', which contains peptide identification and relative abundance information.

In [10]:
#example unfiltered results 
unfilt_results = proteomic_output_df.iloc[0]["url"]
print(unfilt_results)
display(agg_func.tsv_extract(unfilt_results))

del unfilt_results
https://nmdcdemo.emsl.pnnl.gov/proteomics/results/nmdc_dobj-11-ms76kj12_nmdc_dobj-11-c5av0320_msgfplus_syn_PlusSICStats.txt
ResultID Scan FragMethod SpecIndex Charge PrecursorMZ DelM DelM_PPM MH Peptide ... PeakMaxIntensity PeakSignalToNoiseRatio FWHMInScans PeakArea ParentIonIntensity ParentIonMZ StatMomentsArea PeakScanStart PeakScanEnd PeakWidthMinutes
0 1 36947 HCD 1 3 968.47449 -0.00469 -1.61740 2902.410370 K.NYSPYYNTIDDLKDQIVDLTVGNNK.T ... 6619700.0 106.00 62 503970000.0 4468100.0 968.47 453600000.0 36866 37232 0
1 2 30576 HCD 2 3 1017.16516 -0.01640 -5.38151 3048.494105 K.VLYDAEISQIHQSVTDTNVILSMDNSR.N ... 2595300.0 343.30 115 227780000.0 2481100.0 1017.17 206570000.0 30497 30787 0
2 3 21671 HCD 3 3 1050.48071 0.02323 7.38083 3148.400992 K.NADSTLHTVTSGTAEGGESGTVFDSSYMAAGK.T ... 2393100.0 165.90 14 51712000.0 1454200.0 1050.48 53478000.0 21598 21730 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
22014 22015 10397 HCD 19177 2 535.74500 -0.00116 -1.08134 1069.480463 K.DMMELFQR.A ... 1673900.0 37.75 213 246200000.0 2129400.0 535.74 230580000.0 10205 10427 0
22015 22016 7224 HCD 19179 3 525.91705 -0.01298 -8.23982 1575.749593 K.VEETEKGEEQLER.L ... 2701400.0 20.38 104 291450000.0 2969100.0 526.25 241570000.0 7097 7395 0
22016 22017 32524 HCD 19178 2 1130.05615 -0.00976 -4.32084 2259.114693 -.SDIPGQDCKMIFGHLDELLK.I ... 736733.0 24.86 59 32040000.0 812098.0 1130.06 27600000.0 32481 32598 0

22017 rows × 36 columns

Extract information from all 31 proteomic results via the function iterate_file_extract() in agg_func, and put them into a single dataframe, where each scan in each dataset has the unique identifier SpecID. Clean the prefix and suffix off of each peptide sequence. Since these data were processed using a target-decoy approach, determine the type of protein being matched to each peptide: contaminant, reverse (false positive match to the reversed amino acid sequence of a protein), or forward (match to the true, forward amino acid sequence of a protein). The presence of forward and reverse matches enables FDR estimation in the next step.
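The actual helpers live in aggregation_functions.py; the logic they implement can be sketched minimally as below, with the 'Contaminant_' and 'XXX_' prefixes inferred from the result tables rather than taken from the source:

```python
def sequence_noprefsuff(peptide):
    """Strip flanking residues, e.g. 'K.NYSPYY...NNK.T' -> 'NYSPYY...NNK'.
    Rejoining the middle parts preserves any '.' inside modification masses."""
    parts = peptide.split(".")
    return ".".join(parts[1:-1]) if len(parts) >= 3 else peptide

def findproteinname(protein):
    """Classify a protein match as contaminant, reversed (decoy), or forward."""
    if protein.startswith("Contaminant_"):
        return None          # contaminants are excluded from FDR estimation
    if protein.startswith("XXX_"):
        return "Reversed"    # decoy: reversed amino acid sequence
    return "Forward"

print(findproteinname("XXX_nmdc:wfmgan-11-hyze3t58.1_29324_190_333"))  # Reversed
```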

In [11]:
unfilt_res = agg_func.iterate_file_extract(identifier_col='processed_DO_id',
                                url_col='url',
                                extract_cols=['Charge','Scan','Peptide','Protein','MSGFDB_SpecEValue','StatMomentsArea'],
                                pd_df=proteomic_output_df,
                                filter_col = None,
                                filter_values = None,
                                file_type='tsv'
                                )


#create identifier for each scan in each dataset
unfilt_res["SpecID"] = unfilt_res.apply(lambda row: str(row["id_col"]) + "_" + str(row["Scan"]), axis=1)

#clean off the prefix and suffix from the sequence but keep any mods
unfilt_res["Peptide Sequence with Mods"] = unfilt_res["Peptide"].apply(agg_func.sequence_noprefsuff)
del unfilt_res['Peptide']

#determine protein type (contaminant, reverse, forward)
unfilt_res["Protein_Type"] = unfilt_res["Protein"].apply(agg_func.findproteinname)

unfilt_res
Out[11]:
Charge Scan Protein MSGFDB_SpecEValue StatMomentsArea id_col SpecID Peptide Sequence with Mods Protein_Type
0 3 36947 Contaminant_K1C9_HUMAN 1.454800e-31 453600000.0 nmdc:dobj-11-2k6s1505 nmdc:dobj-11-2k6s1505_36947 NYSPYYNTIDDLKDQIVDLTVGNNK None
1 3 30576 Contaminant_K22E_HUMAN 7.111100e-31 206570000.0 nmdc:dobj-11-2k6s1505 nmdc:dobj-11-2k6s1505_30576 VLYDAEISQIHQSVTDTNVILSMDNSR None
2 3 21671 nmdc:wfmgan-11-a71gkg84.1_003113_596_1117 1.396200e-28 53478000.0 nmdc:dobj-11-2k6s1505 nmdc:dobj-11-2k6s1505_21671 NADSTLHTVTSGTAEGGESGTVFDSSYMAAGK Forward
... ... ... ... ... ... ... ... ... ...
945188 2 14621 XXX_nmdc:wfmgan-11-hyze3t58.1_29324_190_333 4.997800e-07 336710000.0 nmdc:dobj-11-zn9eyg46 nmdc:dobj-11-zn9eyg46_14621 FSFFYDMR Reversed
945189 2 24865 nmdc:wfmgan-11-hyze3t58.1_08744_41_232 4.998600e-07 363810000.0 nmdc:dobj-11-zn9eyg46 nmdc:dobj-11-zn9eyg46_24865 RTMTEYGHK Forward
945190 2 15161 nmdc:wfmgan-11-hyze3t58.1_72540_2_109 4.999300e-07 25782000.0 nmdc:dobj-11-zn9eyg46 nmdc:dobj-11-zn9eyg46_15161 WFFIYK Forward

945191 rows × 9 columns

2) Apply a spectral probability filter across the data that optimizes the number of identifications for an FDR of 0.05¶

A challenge associated with aggregating mass spectrometry data is that there are always false identifications, which can be mitigated by imposing a spectral probability filter on the data being analyzed. The same spectral probability filter needs to be applied across datasets when they are being compared. The filter value itself is chosen by weighing the number of 'true' identifications retained against the proximity of the data to a chosen false discovery rate (FDR) (usually 0.05 or 0.01). NMDC's metaproteomic workflow provides 'true' and 'false' identifications for FDR estimation in the 'Unfiltered Metaproteomics Results' files.
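With a target-decoy search, the spectral FDR at a candidate filter is estimated from the counts of forward (F) and reverse (R) matches passing it, as 2R/(F+R); a small sketch of that calculation (the counts are made up):

```python
def spectral_fdr(n_forward, n_reverse):
    """Estimate spectral FDR from forward/reverse PSM counts as 2R / (F + R)."""
    if n_forward == 0 and n_reverse == 0:
        return 1.0  # nothing passes the filter; treat as fully uncertain
    return (2 * n_reverse) / (n_forward + n_reverse)

print(spectral_fdr(9500, 250))  # ~0.0513, just above a 0.05 target
```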

Create a dataframe of peptide identifications (ignoring protein mapping). Filter identifications to the peptide sequence with the smallest SpecEValue for each SpecID, so there is a single, highest probability identification for each scan.

In [12]:
edata = unfilt_res[['SpecID','Peptide Sequence with Mods','MSGFDB_SpecEValue','Protein_Type','StatMomentsArea']].drop_duplicates()  # important to remove any duplicated rows here!

#for each SpecID, select the peptide spectrum match with the smallest MSGFDB_SpecEValue (.idxmin() takes the first entry when there's multiple matches)
idx = edata.groupby(['SpecID'])['MSGFDB_SpecEValue'].idxmin()
edata = edata.loc[idx].reset_index(drop=True)
del idx

display(edata)

assert len(edata['SpecID'].unique())==edata.shape[0], "still more than one identification per scan"
SpecID Peptide Sequence with Mods MSGFDB_SpecEValue Protein_Type StatMomentsArea
0 nmdc:dobj-11-2k6s1505_10005 PPDERERSEEAEKRDEERDRVRDELLAGAEEGEPR 4.994300e-07 Forward 1.238500e+09
1 nmdc:dobj-11-2k6s1505_10008 LRAGSEPR 1.172600e-07 Forward 1.234600e+08
2 nmdc:dobj-11-2k6s1505_10009 WAKEIENQK 2.473900e-07 Forward 5.731600e+07
... ... ... ... ... ...
797680 nmdc:dobj-11-zn9eyg46_9976 CRDKWYQMAEGK 4.110300e-07 Reversed 6.141800e+06
797681 nmdc:dobj-11-zn9eyg46_9979 WWQQINK 4.622300e-08 Reversed 4.143000e+07
797682 nmdc:dobj-11-zn9eyg46_9980 RFEECERECK 1.126600e-07 Forward 2.284500e+06

797683 rows × 5 columns

Create separate dataframes of forward and reverse peptide spectrum matches.

In [13]:
forward_peptides = edata[edata["Protein_Type"] == "Forward"].copy().reset_index(drop=True)
del forward_peptides["Protein_Type"]
display(forward_peptides)
SpecID Peptide Sequence with Mods MSGFDB_SpecEValue StatMomentsArea
0 nmdc:dobj-11-2k6s1505_10005 PPDERERSEEAEKRDEERDRVRDELLAGAEEGEPR 4.994300e-07 1.238500e+09
1 nmdc:dobj-11-2k6s1505_10008 LRAGSEPR 1.172600e-07 1.234600e+08
2 nmdc:dobj-11-2k6s1505_10009 WAKEIENQK 2.473900e-07 5.731600e+07
... ... ... ... ...
401082 nmdc:dobj-11-zn9eyg46_9959 DAVSGRHHAQGR 6.589700e-09 1.566600e+07
401083 nmdc:dobj-11-zn9eyg46_9967 FYRFSGR 1.080500e-07 3.418000e+08
401084 nmdc:dobj-11-zn9eyg46_9980 RFEECERECK 1.126600e-07 2.284500e+06

401085 rows × 4 columns

In [14]:
reversed_peptides = edata[edata["Protein_Type"] == "Reversed"].copy().reset_index(drop=True)
del reversed_peptides["Protein_Type"]
del edata
display(reversed_peptides)
SpecID Peptide Sequence with Mods MSGFDB_SpecEValue StatMomentsArea
0 nmdc:dobj-11-2k6s1505_10013 EFVDIISYMENENHSDIEYPLLYKWDSKSTVINR 1.021500e-07 7.371400e+08
1 nmdc:dobj-11-2k6s1505_10019 QWHPNFLR 3.448500e-07 3.220300e+09
2 nmdc:dobj-11-2k6s1505_10026 LAEREGGAR 1.451200e-08 9.604800e+08
... ... ... ... ...
383749 nmdc:dobj-11-zn9eyg46_9969 TDTATHWHK 1.879300e-07 1.960000e+08
383750 nmdc:dobj-11-zn9eyg46_9976 CRDKWYQMAEGK 4.110300e-07 6.141800e+06
383751 nmdc:dobj-11-zn9eyg46_9979 WWQQINK 4.622300e-08 4.143000e+07

383752 rows × 4 columns

Use the function optimize_specFilt() in agg_func to find a log10 spectral probability filter that weighs the number of forward peptides retained against the proximity of the dataset to a 0.05 spectral FDR. Visualize the impact of the spectral probability filter by plotting the number of forward and reverse peptides retained.

The main plot below is a histogram of forward and reverse peptides across all spectral probability values. The inset within this plot depicts a subset of the smallest spectral probability values, with the red bar before the dashed line representing the estimated number of false identifications that will be included in this analysis.
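optimize_specFilt is defined in aggregation_functions.py; the idea behind it can be sketched as choosing the log10 cutoff that keeps the most forward peptide spectrum matches while the estimated FDR stays at or below the target, here via a simple grid search rather than the function's actual optimizer:

```python
import numpy as np

def fdr_and_count(fwd_evalues, rev_evalues, log10_cut):
    """Spectral FDR (2R/(F+R)) and forward PSM count below a log10 cutoff."""
    f = int(np.sum(fwd_evalues < 10 ** log10_cut))
    r = int(np.sum(rev_evalues < 10 ** log10_cut))
    fdr = 1.0 if (f + r) == 0 else (2 * r) / (f + r)
    return fdr, f

def grid_search_cutoff(fwd_evalues, rev_evalues, target_fdr=0.05):
    """Return (log10 cutoff, forward count, FDR) keeping the most forward
    PSMs while the estimated FDR stays at or below target_fdr."""
    best = None
    for cut in np.linspace(-15, -5, 201):
        fdr, f = fdr_and_count(fwd_evalues, rev_evalues, cut)
        if fdr <= target_fdr and (best is None or f > best[1]):
            best = (cut, f, fdr)
    return best
```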

In [15]:
#initial guess at a log10 spectral probability filter value
initial_specprob_filter = -15

#perform optimization
optimization = agg_func.optimize_specFilt(initial_specprob_filter,forward_peptides,reversed_peptides)

#visualize filter
fitted_params=optimization.x
fig, ax_main = plt.subplots()
ax_inset = plt.axes([0.45, 0.45, 0.35, 0.35])


# Main plot
# forward peptides
hist, bins = np.histogram(forward_peptides["MSGFDB_SpecEValue"], bins=50)
ax_main.bar(bins[:-1], hist, width=np.diff(bins), align='edge', color='g', label='forward')

# reverse peptides
hist, bins = np.histogram(reversed_peptides["MSGFDB_SpecEValue"], bins=50)
ax_main.bar(bins[:-1], hist, width=np.diff(bins), align='edge',color='r', alpha=0.7, label='reverse')

# filter cutoff
ax_main.axvline(x=10 ** fitted_params[0], color="black", label = 'filter cutoff', linestyle="--")
ax_main.set_ylabel('number of peptide spectrum matches')
ax_main.set_xlabel('MSGFDB_SpecEValue')
ax_main.ticklabel_format(style='plain', axis='x')
ax_main.set_title(f'impact of spectral probability filter {10 ** fitted_params[0]}')
ax_main.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=2)


# Inset plot
# forward peptides
hist, bins = np.histogram(forward_peptides[forward_peptides["MSGFDB_SpecEValue"] < 0.000000002]["MSGFDB_SpecEValue"], bins=18)
ax_inset.bar(bins[:-1], hist, width=np.diff(bins), align='edge', color='g')

# reverse peptides
hist, bins = np.histogram(reversed_peptides[reversed_peptides["MSGFDB_SpecEValue"] < 0.000000002]["MSGFDB_SpecEValue"], bins=18)
ax_inset.bar(bins[:-1], hist, width=np.diff(bins), align='edge',color='r', alpha=0.7)

# filter cutoff
ax_inset.axvline(x=10 ** fitted_params[0], color="black", label = 'filter cutoff', linestyle="--")
ax_inset.set_xlabel('MSGFDB_SpecEValue')

plt.show()

Apply the filter to the dataset and recalculate peptide and spectral FDR.

In [16]:
#Filter the data according to the filter and recalculate FDR
forward_peptides = forward_peptides[
    (forward_peptides["MSGFDB_SpecEValue"] < 10 ** optimization.x[0])
].copy()

reversed_peptides = reversed_peptides[
    (reversed_peptides["MSGFDB_SpecEValue"] < 10 ** optimization.x[0])
].copy()

# Calculate FDR
f_spec = (forward_peptides["SpecID"].unique().size)
r_spec = reversed_peptides["SpecID"].unique().size
if (f_spec == 0) & (r_spec == 0):
    fdr_spec = 1
else:
    fdr_spec = (2*r_spec) / (f_spec + r_spec)

f_pep = forward_peptides["Peptide Sequence with Mods"].unique().size
r_pep = reversed_peptides["Peptide Sequence with Mods"].unique().size
if (f_pep == 0) & (r_pep == 0):
    fdr_pep = 1
else:
    fdr_pep = (r_pep) / (f_pep + r_pep)

del f_spec, r_spec, f_pep, r_pep

print("Spectral FDR:",fdr_spec,"\nPeptide FDR:",fdr_pep)
Spectral FDR: 0.049787398531117126 
Peptide FDR: 0.05560704355885079

3) Collapse to unique peptides and normalize their relative abundance¶

At this point in the analysis, the data has been filtered to only high probability peptide identifications, but more than one scan within a dataset can have the same peptide identification. This can be due to a peptide eluting into the mass spectrometer over the course of multiple scans or eluting in multiple charge states. Sum the relative abundance for peptide sequences detected more than once in a dataset, leaving a total relative abundance value for each peptide in each dataset.

In [17]:
#extract data set id
forward_peptides['processed_DO_id'] = forward_peptides['SpecID'].str.split('_').str[0]

#drop SpecID and spectral probability columns since no longer relevant
forward_peptides.drop(['SpecID','MSGFDB_SpecEValue'],axis=1, inplace=True)

#for each peptide sequence with mods, sum the abundances for all scans/identifications
forward_peptides = forward_peptides.groupby(['processed_DO_id','Peptide Sequence with Mods'])['StatMomentsArea'].sum().to_frame().reset_index()

display(forward_peptides)
processed_DO_id Peptide Sequence with Mods StatMomentsArea
0 nmdc:dobj-11-2k6s1505 AALAANSELMNK 115710000.0
1 nmdc:dobj-11-2k6s1505 AFTVDEMR 59240000.0
2 nmdc:dobj-11-2k6s1505 AGGSITSSGGVERNDVDYSAVLTK 5785600.0
... ... ... ...
9757 nmdc:dobj-11-zn9eyg46 VAELEATTAR 521284000.0
9758 nmdc:dobj-11-zn9eyg46 VAELEATTVR 20899000.0
9759 nmdc:dobj-11-zn9eyg46 WLWNYYQYYK 582031.0

9760 rows × 3 columns

Visualize the untransformed and un-normalized relative abundances.

In [18]:
#plot untransformed abundance values
untransformed_abundances_fig, ax = plt.subplots(figsize=(8,6))
sns.boxplot(x='processed_DO_id',y='StatMomentsArea',data=forward_peptides)
plt.ticklabel_format(style='plain', axis='y')
plt.xticks([])
plt.xlabel('sample')
plt.ylabel('Relative Peptide Abundance (Un-Normalized)')
Out[18]:
Text(0, 0.5, 'Relative Peptide Abundance (Un-Normalized)')

Apply log2 transformation and median normalize peptide abundances.

In [19]:
#log2 transformation
forward_peptides['StatMomentsAreaLog2'] = np.log2(forward_peptides['StatMomentsArea'])

# Calculate group-wise (sample-wise) medians
group_medians = forward_peptides.groupby('processed_DO_id')['StatMomentsAreaLog2'].median()

# Calculate data-wide median
all_data_median = forward_peptides['StatMomentsAreaLog2'].median()

# Subtract each sample's median from its values (vectorized via map)
forward_peptides['StatMomentsAreaLog+Norm'] = (
    forward_peptides['StatMomentsAreaLog2']
    - forward_peptides['processed_DO_id'].map(group_medians)
)

#add back in the data-wide median value to avoid negative abundances
forward_peptides['StatMomentsAreaLog+Norm'] = forward_peptides['StatMomentsAreaLog+Norm'] + all_data_median

transformed_abundances_fig, ax = plt.subplots(figsize=(8,6))
sns.boxplot(x='processed_DO_id',y='StatMomentsAreaLog+Norm',data=forward_peptides)
plt.xticks([])
plt.xlabel('sample')
plt.ylabel('Relative Peptide Abundance (Normalized)')

del ax, group_medians, all_data_median, forward_peptides['StatMomentsArea'], forward_peptides['StatMomentsAreaLog2'], reversed_peptides

4) Extract functional gene annotations for proteins¶

Collect peptide to protein mapping information for the passing peptide sequences.

In [20]:
peptide_protein_mapping = (
    unfilt_res[unfilt_res['Peptide Sequence with Mods'].isin(forward_peptides['Peptide Sequence with Mods'])]
    [['Peptide Sequence with Mods', 'Protein']]
    .drop_duplicates()
    .reset_index(drop=True)
)
peptide_protein_mapping
Out[20]:
Peptide Sequence with Mods Protein
0 NADSTLHTVTSGTAEGGESGTVFDSSYMAAGK nmdc:wfmgan-11-a71gkg84.1_003113_596_1117
1 ETNPTLADIEQDSKVKDAIEK nmdc:wfmgan-11-a71gkg84.1_046034_35_346
2 GETYVNVHTEKNPNGEIR nmdc:wfmgan-11-a71gkg84.1_000318_745_1245
... ... ...
16091 EDRTPIVEIMNR nmdc:wfmgan-11-hyze3t58.1_63946_3_284
16092 EARFYSLYDKVWR nmdc:wfmgan-11-hyze3t58.1_06697_29_526
16093 FVRIEDETFNR nmdc:wfmgan-11-hyze3t58.1_00200_3_428

16094 rows × 2 columns

Annotation information for these proteins can be found in 'Functional Annotation GFF' files.

Since the data_objects dataframe contains all objects associated with our study id, it also contains the relevant 'Functional Annotation GFF' files. Subset this dataframe to GFF files associated with the 31 biosample ids that have a proteomic output in proteomic_output_df.
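For reference, a 'Functional Annotation GFF' row stores its annotations in a semicolon-delimited ninth column of key=value pairs, which agg_func.gff_extract_features spreads into dataframe columns; a minimal sketch of that parsing step (the attribute string below is illustrative):

```python
def parse_gff_attributes(attr_field):
    """Split a GFF column-9 attribute string into a dict of key=value pairs."""
    pairs = (item.split("=", 1) for item in attr_field.strip(";").split(";"))
    return {key: value for key, value in pairs}

attrs = parse_gff_attributes(
    "ID=nmdc:wfmgan-11-bnzfrb88.1_000001_2910_3746;"
    "product=biotin carboxyl carrier protein;product_source=COG0511"
)
print(attrs["product"])  # biotin carboxyl carrier protein
```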

In [21]:
annotation_input_df = data_objects[
    (data_objects['data_object_type'] == 'Functional Annotation GFF')
    & (data_objects['biosample_id'].isin(proteomic_output_df['biosample_id'].unique().tolist()))
].reset_index(drop=True)
annotation_input_df = annotation_input_df[['id','file_size_bytes','data_object_type','md5_checksum','url','biosample_id']]
display(annotation_input_df)
id file_size_bytes data_object_type md5_checksum url biosample_id
0 nmdc:dobj-11-fdngms90 77411808 Functional Annotation GFF e4344ebdca641bd24709f89b8ff83dd3 https://data.microbiomedata.org/data/nmdc:ompr... nmdc:bsm-13-0jw5n594
1 nmdc:dobj-11-xxffzc65 66314886 Functional Annotation GFF 9051bc023e5b14373dc016e6a3a0943b https://data.microbiomedata.org/data/nmdc:ompr... nmdc:bsm-13-13145k83
2 nmdc:dobj-11-c24s6186 114196066 Functional Annotation GFF 3049a60a48baca9133461a7cde827b6f https://data.microbiomedata.org/data/nmdc:ompr... nmdc:bsm-13-1p0tct86
... ... ... ... ... ... ...
28 nmdc:dobj-11-tv82b372 60997438 Functional Annotation GFF cfd4e4c3c17d4d2020f364555836c655 https://data.microbiomedata.org/data/nmdc:ompr... nmdc:bsm-13-xvf06c81
29 nmdc:dobj-11-92qp1w39 76608329 Functional Annotation GFF cda53e77cb70328498abc29685baf61c https://data.microbiomedata.org/data/nmdc:ompr... nmdc:bsm-13-yjp2dx65
30 nmdc:dobj-11-2f3gzn94 80194278 Functional Annotation GFF 8e69768342ba2c92cd7feac23a8d933f https://data.microbiomedata.org/data/nmdc:ompr... nmdc:bsm-13-zms1jq91

31 rows × 6 columns

Preview the 'Functional Annotation GFF' files and determine a subset of gene annotation information that should be pulled from all 31 files.

In [22]:
#example annotation file
annotation_file=annotation_input_df.iloc[0]["url"]
print(annotation_file)
display(agg_func.gff_extract_features(annotation_file))

del annotation_file
https://data.microbiomedata.org/data/nmdc:omprc-13-e4rebn56/nmdc:wfmgan-11-bnzfrb88.1/nmdc_wfmgan-11-bnzfrb88.1_functional_annotation.gff
ID translation_table partial start_type product product_source cath_funfam cog pfam superfamily ... bound_moiety ncRNA_class intron_start intron_end number_of_repeats average_repeat_length median_repeat_length average_spacer_length median_spacer_length Parent
0 nmdc:wfmgan-11-bnzfrb88.1_000001_3_2729 11 5' Edge diaminopimelate decarboxylase/amino acid trans... COG0019/COG0531 2.40.37.10,3.20.20.10 COG0019,COG0531 PF00278,PF02784,PF13520 51419 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 nmdc:wfmgan-11-bnzfrb88.1_000001_2750_2869 11 NaN ATG hypothetical protein Hypo-rule applied NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 nmdc:wfmgan-11-bnzfrb88.1_000001_2910_3746 11 NaN ATG biotin carboxyl carrier protein COG0511 2.70.70.10 COG0511 PF01551 51230 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
272943 nmdc:wfmgan-11-bnzfrb88.1_208965_2_199 11 5',3' Edge F-type H+/Na+-transporting ATPase subunit alpha KO:K02111 1.20.150.20 COG0056 PF00306 47917 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
272944 nmdc:wfmgan-11-bnzfrb88.1_208966_2_199 11 5',3' Edge S-adenosyl-L-methionine hydrolase (adenosine-f... KO:K22205 3.40.50.10790 COG1912 PF01887 102522 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
272945 nmdc:wfmgan-11-bnzfrb88.1_208967_69_200 11 3' ATG DNA-binding NarL/FixJ family response regulator COG2197 3.40.50.2300 COG2197 NaN 52172 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

272946 rows × 33 columns

Extract information from all 31 annotation files (this takes a while to run) and merge with peptide_protein_mapping so there is a final table of peptide-protein-annotation mapping (annotation_mapping).

In [23]:
# Get gene annotation information for proteins
genemapping = annotation_input_df[["id", "url"]].drop_duplicates().reset_index(drop=True)

genemapping = agg_func.iterate_file_extract(
    pd_df=genemapping,
    identifier_col='id',
    url_col='url',
    extract_cols=['ID', 'product', 'product_source'],
    filter_col='ID',
    filter_values=peptide_protein_mapping['Protein'].unique().tolist(),
    file_type='gff'
)

# Merge with protein mapping information. Drop ID (equivalent to Protein) and
# id_col (the dataset id, unnecessary here since peptide-to-protein mapping isn't dataset specific)
annotation_mapping = genemapping.merge(peptide_protein_mapping, left_on='ID', right_on='Protein').drop(['ID', 'id_col'], axis=1)

annotation_mapping
Out[23]:
product product_source Peptide Sequence with Mods Protein
0 DNA-directed RNA polymerase subunit beta KO:K03043 ALMGSNMQR nmdc:wfmgan-11-bnzfrb88.1_000010_2256_6224
1 heme-degrading monooxygenase HmoA COG2329 ENADAYSRGAYPDVLK nmdc:wfmgan-11-bnzfrb88.1_000125_3919_4230
2 uncharacterized protein KO:K09927 RGTWWDWAPAK nmdc:wfmgan-11-bnzfrb88.1_000220_2203_3417
... ... ... ... ...
16091 hypothetical protein Hypo-rule applied EAPITALDHK nmdc:wfmgan-11-21ms6426.1_215294_58_276
16092 hypothetical protein Hypo-rule applied LWAVDLATGSK nmdc:wfmgan-11-21ms6426.1_216224_1_207
16093 hypothetical protein Hypo-rule applied TLFLDVGSR nmdc:wfmgan-11-21ms6426.1_216224_1_207

16094 rows × 4 columns

5) Generate annotation and protein mappings for peptides using "Razor" strategy¶

Identify the razor protein. The "razor" approach limits the assignment of degenerate peptides (i.e., peptides that map to more than one forward protein) to a single most likely protein. The name references the principle of Occam's razor, also known as the law of parsimony.

The rules are as follows:

  • If a peptide is unique to a protein, then that protein is the razor
  • Else, if a peptide belongs to more than one protein, but one of those proteins has a unique peptide, then that protein is the razor
  • Else, if a peptide belongs to more than one protein and one of those proteins has the maximal number of peptides, then that protein is the razor
  • Else, if a peptide belongs to more than one protein and more than one of those proteins has the maximal number of peptides, then collapse the proteins and gene annotations into single strings
  • Else, if a peptide belongs to more than one protein and more than one of those proteins has a unique peptide, then the peptide is removed from analysis because its mapping is inconclusive
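A much simplified sketch of this assignment logic is shown below; it covers only the unique-peptide, maximal-peptide-count, and tie-collapsing rules, and the helper name and toy mapping are hypothetical (the real logic lives in agg_func.razorprotein).

```python
# Simplified sketch of razor assignment (hypothetical helper).
# Covers three of the rules: unique peptides, maximal peptide count,
# and collapsing ties into a single comma-joined string.
from collections import Counter

def assign_razor(peptide_to_proteins: dict) -> dict:
    # Count how many peptides each protein matched across the whole mapping
    protein_counts = Counter(p for prots in peptide_to_proteins.values() for p in prots)
    razor = {}
    for peptide, proteins in peptide_to_proteins.items():
        if len(proteins) == 1:
            razor[peptide] = proteins[0]  # unique peptide: that protein is the razor
        else:
            best = max(protein_counts[p] for p in proteins)
            winners = sorted(p for p in proteins if protein_counts[p] == best)
            # single protein with the most peptides wins; ties are collapsed
            razor[peptide] = ", ".join(winners)
    return razor

mapping = {
    "PEPTIDEA": ["prot1"],
    "PEPTIDEB": ["prot1", "prot2"],  # prot1 has more peptides overall
}
print(assign_razor(mapping))
# {'PEPTIDEA': 'prot1', 'PEPTIDEB': 'prot1'}
```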

Use annotation_mapping as the input to the function razorprotein() from the agg_func script. This will return protein and gene annotation information for each peptide, according to the above rules.

In [24]:
razormapping = agg_func.razorprotein(annotation_mapping)
razormapping
Out[24]:
product product_source Peptide Sequence with Mods razor
0 dodecin KO:K09165 AAAAAIETASR nmdc:wfmgan-11-p3dkkz54.1_002072_639_854
1 dodecin, dodecin, dodecin, dodecin, dodecin KO:K09165, KO:K09165, KO:K09165, KO:K09165, KO... AAAAAVETASR nmdc:wfmgan-11-6pq4bd13.1_044073_3_326, nmdc:w...
2 subtilisin family serine protease COG1404 AAAEPFHTAR nmdc:wfmgan-11-21ms6426.1_002328_1282_2322
... ... ... ... ...
4828 hypothetical protein Hypo-rule applied YYADISAWLVDVR nmdc:wfmgan-11-21ms6426.1_128887_2_331
4829 hypothetical protein Hypo-rule applied YYGFNWGR nmdc:wfmgan-11-dvvh0237.1_120316_3_251
4830 uncharacterized protein KO:K06996 YYSELFGWQVHEVMPTYGLVHTEAGGK nmdc:wfmgan-11-v53k8v91.1_045759_189_458

4831 rows × 4 columns

6) Perform protein rollup and summarize into a final aggregated table of relative protein abundance¶

Combine razor information with relative abundance values.

In [25]:
# Merge must be 'right' because some peptides with indeterminate mappings are removed by the mapping functions
forward_peptides = forward_peptides.merge(razormapping, how='right', on=['Peptide Sequence with Mods'])
del annotation_mapping, peptide_protein_mapping, genemapping

forward_peptides
Out[25]:
processed_DO_id Peptide Sequence with Mods StatMomentsAreaLog+Norm product product_source razor
0 nmdc:dobj-11-vjmd5h28 AAAAAIETASR 30.820153 dodecin KO:K09165 nmdc:wfmgan-11-p3dkkz54.1_002072_639_854
1 nmdc:dobj-11-tk9hc081 AAAAAVETASR 29.374877 dodecin, dodecin, dodecin, dodecin, dodecin KO:K09165, KO:K09165, KO:K09165, KO:K09165, KO... nmdc:wfmgan-11-6pq4bd13.1_044073_3_326, nmdc:w...
2 nmdc:dobj-11-sx7cyr58 AAAEPFHTAR 28.081336 subtilisin family serine protease COG1404 nmdc:wfmgan-11-21ms6426.1_002328_1282_2322
... ... ... ... ... ... ...
8599 nmdc:dobj-11-sx7cyr58 YYADISAWLVDVR 27.222319 hypothetical protein Hypo-rule applied nmdc:wfmgan-11-21ms6426.1_128887_2_331
8600 nmdc:dobj-11-xm1yjv87 YYGFNWGR 23.220936 hypothetical protein Hypo-rule applied nmdc:wfmgan-11-dvvh0237.1_120316_3_251
8601 nmdc:dobj-11-7psagy79 YYSELFGWQVHEVMPTYGLVHTEAGGK 27.015979 uncharacterized protein KO:K06996 nmdc:wfmgan-11-v53k8v91.1_045759_189_458

8602 rows × 6 columns

De-log the peptide abundances, sum the abundances for each razor protein, and log2-transform the rolled-up protein abundances.

In [26]:
# De-log abundance
forward_peptides['StatMomentsAreaNorm'] = 2**forward_peptides['StatMomentsAreaLog+Norm']
forward_peptides.drop('StatMomentsAreaLog+Norm', axis=1, inplace=True)

# Sum abundance for each razor protein in each dataset
prot_info = forward_peptides.columns[~forward_peptides.columns.isin(['StatMomentsAreaNorm', 'Peptide Sequence with Mods'])].tolist()
protein_abundances = forward_peptides.groupby(prot_info)['StatMomentsAreaNorm'].sum().reset_index()

# Re-log2-transform abundance
protein_abundances['StatMomentsAreaLog+Norm'] = np.log2(protein_abundances['StatMomentsAreaNorm'])
protein_abundances.drop('StatMomentsAreaNorm', axis=1, inplace=True)

del prot_info, forward_peptides

protein_abundances
Out[26]:
processed_DO_id product product_source razor StatMomentsAreaLog+Norm
0 nmdc:dobj-11-2k6s1505 DNA-binding protein HU-beta, DNA-binding prote... KO:K03530, KO:K03530 nmdc:wfmgan-11-8zxjs882.1_040719_104_331, nmdc... 25.187616
1 nmdc:dobj-11-2k6s1505 DNA-binding transcriptional LysR family regula... COG0583, COG0583, COG0583, COG0583 nmdc:wfmgan-11-21ms6426.1_010178_1_894, nmdc:w... 26.332428
2 nmdc:dobj-11-2k6s1505 DNA-directed RNA polymerase subunit beta' KO:K03046 nmdc:wfmgan-11-yjeh0m30.1_003176_3_1979 27.042324
... ... ... ... ... ...
6450 nmdc:dobj-11-zn9eyg46 peptidoglycan/xylan/chitin deacetylase (PgdA/C... COG0726, COG0726, COG0726 nmdc:wfmgan-11-6pq4bd13.1_001294_1_411, nmdc:w... 25.123978
6451 nmdc:dobj-11-zn9eyg46 photosystem II CP43 chlorophyll apoprotein, ph... KO:K02705, KO:K02705, KO:K02705 nmdc:wfmgan-11-hyze3t58.1_07201_3_464, nmdc:wf... 28.599485
6452 nmdc:dobj-11-zn9eyg46 predicted porin COG3203 nmdc:wfmgan-11-hyze3t58.1_02099_286_783 29.142025

6453 rows × 5 columns
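As a toy check of the rollup arithmetic above (column names and values are hypothetical): two peptides at log2 abundances 3 and 4 sum to 2³ + 2⁴ = 24 in linear space, so the rolled-up protein abundance is log2(24) ≈ 4.585.

```python
import numpy as np
import pandas as pd

# Toy check of the rollup: de-log, sum per razor protein, re-log (values hypothetical)
peptides = pd.DataFrame({
    "razor": ["protA", "protA", "protB"],
    "log2_abund": [3.0, 4.0, 5.0],
})
peptides["linear"] = 2 ** peptides["log2_abund"]    # 8, 16, 32
rolled = peptides.groupby("razor")["linear"].sum()  # protA: 24, protB: 32
rolled_log2 = np.log2(rolled)
print(rolled_log2.round(3))
# protA ≈ 4.585, protB = 5.0
```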

Final aggregated table of relative protein abundance¶

Reformat these results into a proteomic table, where each row indicates a protein and each column indicates a sample/dataset. The values within are log2-transformed, median-normalized relative abundance values. This table, or the long-form version above, can be used in further proteomic analyses.

In [27]:
# Pivot to a wide crosstab: one row per razor protein, one column per dataset
aggregated_proteomic_output = protein_abundances.pivot(index='razor', columns='processed_DO_id', values='StatMomentsAreaLog+Norm')
aggregated_proteomic_output.columns.name = None

aggregated_proteomic_output
Out[27]:
nmdc:dobj-11-2k6s1505 nmdc:dobj-11-2szyc560 nmdc:dobj-11-301yh759 nmdc:dobj-11-3378mg86 nmdc:dobj-11-3ak2bc31 nmdc:dobj-11-4bwzhs42 nmdc:dobj-11-7psagy79 nmdc:dobj-11-7r5yea49 nmdc:dobj-11-7x21d450 nmdc:dobj-11-bxyvx506 ... nmdc:dobj-11-tk9hc081 nmdc:dobj-11-vjmd5h28 nmdc:dobj-11-wcn69x37 nmdc:dobj-11-xm1yjv87 nmdc:dobj-11-y2aefn18 nmdc:dobj-11-yrbs4v96 nmdc:dobj-11-z47wnf07 nmdc:dobj-11-z8hc0j30 nmdc:dobj-11-zhz66007 nmdc:dobj-11-zn9eyg46
razor
nmdc:wfmgan-11-21ms6426.1_000036_702_1004 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
nmdc:wfmgan-11-21ms6426.1_000053_3_563, nmdc:wfmgan-11-p3dkkz54.1_008013_251_1072 NaN NaN NaN NaN NaN NaN 30.499817 NaN NaN NaN ... 29.083049 32.948118 NaN NaN NaN NaN NaN NaN NaN NaN
nmdc:wfmgan-11-21ms6426.1_000057_4146_5369, nmdc:wfmgan-11-p3dkkz54.1_002120_389_1612, nmdc:wfmgan-11-z0raab34.1_009908_3_737 NaN 30.067875 28.391404 28.935731 NaN NaN 30.458277 NaN NaN 28.7232 ... NaN 30.467756 26.462791 NaN NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
nmdc:wfmgan-11-z0raab34.1_148395_2_283 NaN 31.373664 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
nmdc:wfmgan-11-z0raab34.1_160266_3_278 NaN 27.750197 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
nmdc:wfmgan-11-z0raab34.1_167280_3_278 NaN 24.717223 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

3442 rows × 31 columns

The generated protein table can be used as input to the software pmartR, which performs statistical analyses such as ANOVA and independence of missing data (IMD) tests. In this case, the aggregated proteomics table (aggregated_proteomic_output) would be equivalent to pmartR's e_data and the peptide-to-protein-to-gene mappings (razormapping) would be equivalent to pmartR's e_meta.


pmartR requires sample metadata to parameterize analyses and interpret the data. For this example dataset, an API call will capture the biosample metadata (sample_metadata) that would be equivalent to pmartR's f_data.

Gather biosample metadata via the nmdc_api_utilities BiosampleSearch object, using its get_record_by_filter() function with the biosample_ids associated with each output in proteomic_output_df.

In [28]:
from nmdc_api_utilities.biosample_search import BiosampleSearch
# Since we are querying the biosample collection, we need to create an instance of it
bs_client = BiosampleSearch(env=ENV)

# Create a list of lists of ids to query the BiosampleSearch object
chunked_list = split_list(proteomic_output_df['biosample_id'].unique().tolist())
biosamples = []
# Loop through the chunked list of ids
for chunk in chunked_list:
    # Build the filter: match records whose id field is in this chunk of ids from proteomic_output_df
    filter_list = dp_client._string_mongo_list(chunk)
    filter_str = f'{{"id": {{"$in": {filter_list}}}}}'
    # Get the results
    biosamples += bs_client.get_record_by_filter(filter=filter_str, fields="id,depth.has_numeric_value", max_page_size=100, all_pages=True)

#normalize the json columns
biosamples = pd.json_normalize(biosamples).drop_duplicates().rename(columns={'id':'biosample_id'})

#merge biosample and processed data object information
sample_metadata = biosamples.merge(proteomic_output_df[['processed_DO_id','biosample_id']],on='biosample_id')
del biosamples

sample_metadata
Out[28]:
biosample_id depth.has_numeric_value processed_DO_id
0 nmdc:bsm-13-0jw5n594 0.0 nmdc:dobj-11-pxnhdj84
1 nmdc:bsm-13-13145k83 0.2 nmdc:dobj-11-xm1yjv87
2 nmdc:bsm-13-1p0tct86 0.2 nmdc:dobj-11-bxyvx506
... ... ... ...
28 nmdc:bsm-13-xvf06c81 0.4 nmdc:dobj-11-m808mj05
29 nmdc:bsm-13-yjp2dx65 0.1 nmdc:dobj-11-301yh759
30 nmdc:bsm-13-zms1jq91 0.5 nmdc:dobj-11-sx7cyr58

31 rows × 3 columns
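With the e_data, e_meta, and f_data equivalents assembled, a minimal sketch of exporting them to CSV for an R/pmartR session might look like the following; the filenames and the toy stand-in tables are hypothetical, and in the notebook the real aggregated_proteomic_output, razormapping, and sample_metadata tables would be written instead.

```python
import pandas as pd

# Toy stand-ins for the notebook's three tables (values hypothetical)
e_data = pd.DataFrame({"razor": ["protA"], "nmdc:dobj-11-2k6s1505": [25.2]}).set_index("razor")
e_meta = pd.DataFrame({"Peptide Sequence with Mods": ["AAAAAIETASR"], "razor": ["protA"]})
f_data = pd.DataFrame({"processed_DO_id": ["nmdc:dobj-11-2k6s1505"], "depth.has_numeric_value": [0.0]})

# Write each table to CSV for loading into an R/pmartR session
e_data.to_csv("e_data.csv")                # protein x sample abundance matrix
e_meta.to_csv("e_meta.csv", index=False)   # peptide -> protein -> annotation mapping
f_data.to_csv("f_data.csv", index=False)   # sample metadata
```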