# Fetch every data object registered for the study, keep only the columns
# used downstream, and flatten the in_manifest column to plain character
# strings so it can be compared and displayed like any other column.
data_objects <- get_data_objects_for_study("nmdc:sty-11-aygzgv51") %>%
  select(id, name, data_object_type, url, biosample_id, in_manifest) %>%
  mutate(in_manifest = as.character(in_manifest))

# Restrict to the unfiltered metaproteomics result files, ordered by biosample.
# The data object id is renamed so it is not confused with other ids later on.
proteomic_output_df <- data_objects %>%
  filter(data_object_type == "Unfiltered Metaproteomics Results") %>%
  arrange(biosample_id) %>%
  dplyr::rename(processed_dobj_id = id)

head(proteomic_output_df)

# Look up the workflow execution records whose outputs are the processed
# data objects selected above.
version_output <- get_results_by_id(
  collection = "workflow_execution_set",
  match_id_field = "has_output",
  id_list = proteomic_output_df$processed_dobj_id,
  fields = "id,has_output,version,has_input"
)

proteomic_output_df <- version_output %>%
  # has_input is not needed for this join; it remains available on
  # version_output for the later Functional Annotation lookup
  select(-has_input) %>%
  # One row per (workflow, output data object)
  unnest(has_output) %>%
  dplyr::rename(workflow_id = id, processed_dobj_id = has_output) %>%
  # Attach the workflow record to each processed data object
  right_join(proteomic_output_df, by = join_by(processed_dobj_id)) %>%
  # Keep only results produced by workflow version v1.2.1
  # (rows without a matching workflow have an NA version and are dropped here)
  filter(version == "v1.2.1") %>%
  # The workflow columns were only needed for the version filter
  dplyr::select(-workflow_id, -version)

head(proteomic_output_df)

# Distinct manifest IDs referenced by the records in proteomic_output_df
manifest_id <- proteomic_output_df$in_manifest %>%
  unique()
manifest_id

# Fetch the manifest record(s) for those IDs; an empty `fields` string is
# passed through to get_results_by_id unchanged
manifest <- get_results_by_id(
  collection = "manifest_set",
  match_id_field = "id",
  id_list = manifest_id,
  fields = ""
)
manifest

# Preview the first results file: show which URL is read, then its first rows
paste("Reading file from", proteomic_output_df$url[1])

proteomic_output_df$url[1] %>%
  read_tsv(show_col_types = FALSE, progress = FALSE) %>%
  head()

# Pull the columns of interest out of every results file and derive the
# per-row identifiers and labels used in the rest of the analysis.
unfiltered_results <- iterate_file_extract(
  input_df = proteomic_output_df,
  identifier_col = "processed_dobj_id",
  url_col = "url",
  extract_cols = c(
    "Charge", "Scan", "Peptide", "Protein",
    "MSGFDB_SpecEValue", "StatMomentsArea"
  ),
  file_type = "tsv"
) %>%
  mutate(
    # Identifier for each scan within each dataset
    SpecID = paste(id, Scan, sep = "_"),

    # Peptide sequence without its flanking prefix/suffix, modifications kept
    Peptide_Sequence_with_Mods = trim_peptide_sequence(Peptide),

    # Protein type: contaminants are labelled "None", decoys (XXX_ prefix)
    # "Reversed", everything else "Forward"
    Protein_Type = case_when(
      str_detect(Protein, "Contaminant") ~ "None",
      str_detect(Protein, "^XXX_") ~ "Reversed",
      TRUE ~ "Forward"
    )
  )

head(unfiltered_results)

# Build the expression data: one row per scan (SpecID), keeping the
# peptide-spectrum match with the smallest MSGFDB_SpecEValue.
edata <- unfiltered_results %>%
  distinct(
    SpecID, Peptide_Sequence_with_Mods, MSGFDB_SpecEValue,
    Protein_Type, StatMomentsArea, id
  ) %>%
  dplyr::rename(processed_dobj_id = id) %>%
  group_by(SpecID) %>%
  # with_ties = FALSE guarantees exactly one row per scan even on equal scores
  slice_min(order_by = MSGFDB_SpecEValue, n = 1, with_ties = FALSE) %>%
  ungroup()

head(edata)

# Sanity check: every scan must now carry a single peptide identification
stopifnot(
  "Still more than one identification per scan" =
    length(edata$SpecID) == length(unique(edata$SpecID))
)

# Split the identifications by protein type; the label column is dropped
# because it is constant within each subset.
forward_peptides <- edata %>%
  filter(Protein_Type == "Forward") %>%
  select(-Protein_Type)

head(forward_peptides)

reversed_peptides <- edata %>%
  filter(Protein_Type == "Reversed") %>%
  select(-Protein_Type)

head(reversed_peptides)

# Take an initial guess at a log10 spectral probability filter value
initial_specprob_filter <- -15

# Optimize the filter starting from the initial guess, using the forward and
# reversed identifications; the optimized value is read from the `par`
# element of the result returned by optimize_spec_filt.
optimized_filter <- optimize_spec_filt(
  initial_specprob_filter,
  forward_peptides,
  reversed_peptides
)$par

paste("Optimal log10 filter value:", optimized_filter)

# Stack forward and reversed identifications; the names given to bind_rows
# become the values of the `direction` column, which is then made a factor.
peps_for_plot <- bind_rows(
  forward = forward_peptides,
  reverse = reversed_peptides,
  .id = "direction"
) %>%
  mutate(direction = factor(direction))

# Full-range histogram of spectral E-values with the optimized cutoff marked
main_plot <- ggplot(peps_for_plot) +
  geom_histogram(
    aes(x = MSGFDB_SpecEValue, fill = direction),
    bins = 50, alpha = 0.5, position = "identity"
  ) +
  geom_vline(xintercept = 10 ^ optimized_filter) +
  scale_fill_manual(values = c("forward" = "seagreen", "reverse" = "orangered")) +
  ylab("Number of peptide-spectrum matches") +
  ggtitle("Impact of spectral probability filter")

# Inset: the same histogram restricted to E-values below 2e-9, without
# legend or axis titles so it fits inside the main plot
zoom_data <- filter(peps_for_plot, MSGFDB_SpecEValue < 2e-9)
zoom_plot <- ggplot(zoom_data) +
  geom_histogram(
    aes(x = MSGFDB_SpecEValue, fill = direction),
    bins = 30, alpha = 0.5, position = "identity"
  ) +
  geom_vline(xintercept = 10 ^ optimized_filter) +
  scale_fill_manual(values = c("forward" = "seagreen", "reverse" = "orangered")) +
  theme(
    legend.position = "none",
    axis.title.x = element_blank(),
    axis.title.y = element_blank()
  )

# Viewport that positions the inset within the main plot's panel
vp <- viewport(width = 0.4, height = 0.45, x = 0.6, y = 0.65)

main_plot
print(zoom_plot, vp = vp)

# Keep only identifications that pass the optimized spectral probability
# filter (optimized_filter is on the log10 scale, hence 10 ^ optimized_filter)
forward_peptides <- filter(forward_peptides, MSGFDB_SpecEValue < 10 ^ optimized_filter)

reversed_peptides <- filter(reversed_peptides, MSGFDB_SpecEValue < 10 ^ optimized_filter)

# Calculate spectral FDR from the number of unique scans on each side.
# When nothing passes the filter the FDR is defined as 1 to avoid 0 / 0.
# NOTE(review): the spectral FDR doubles the reversed count while the peptide
# FDR below does not -- confirm this asymmetry is intended.
f_spec <- length(unique(forward_peptides$SpecID))
r_spec <- length(unique(reversed_peptides$SpecID))

fdr_spec <- if (f_spec == 0 && r_spec == 0) {
  1
} else {
  (2 * r_spec) / (f_spec + r_spec)
}

# Calculate peptide FDR from the number of unique peptide sequences
f_pep <- length(unique(forward_peptides$Peptide_Sequence_with_Mods))
r_pep <- length(unique(reversed_peptides$Peptide_Sequence_with_Mods))

fdr_pep <- if (f_pep == 0 && r_pep == 0) {
  1
} else {
  r_pep / (f_pep + r_pep)
}

paste("Spectral FDR:", fdr_spec)
paste("Peptide FDR:", fdr_pep)

# Collapse to one row per (data object, peptide): the abundance of a peptide
# in a data object is the sum of StatMomentsArea over all of its scans.
# distinct() keeps only the three listed columns, which also discards the
# scan-level SpecID and MSGFDB_SpecEValue columns.
forward_peptides <- forward_peptides %>%
  mutate(
    StatMomentsArea = sum(StatMomentsArea),
    .by = c(processed_dobj_id, Peptide_Sequence_with_Mods)
  ) %>%
  distinct(processed_dobj_id, Peptide_Sequence_with_Mods, StatMomentsArea)

head(forward_peptides)

# Distribution of (not yet normalized) peptide abundances per data object;
# x-axis labels are hidden because the IDs are long
ggplot(forward_peptides, aes(x = processed_dobj_id, y = StatMomentsArea)) +
  geom_boxplot() +
  labs(
    x = "Samples",
    y = "Relative Peptide Abundance (Not Normalized)",
    title = "Peptide relative abundances by sample"
  ) +
  theme(axis.text.x = element_blank())

# Log2-transform the abundances and record each data object's median
forward_peptides <- forward_peptides %>%
  mutate(StatMomentsAreaLog2 = log2(StatMomentsArea)) %>%
  mutate(
    group_medians = median(StatMomentsAreaLog2),
    .by = processed_dobj_id
  ) %>%
  distinct()

# Median over the whole (log2) dataset
all_data_median <- median(forward_peptides$StatMomentsAreaLog2)

# Median-center each data object, then shift everything by the data-wide
# median so the normalized values do not end up negative
forward_peptides <- forward_peptides %>%
  mutate(
    StatMomentsAreaLogNorm = StatMomentsAreaLog2 - group_medians + all_data_median
  )

# Distribution of normalized peptide abundances per data object
ggplot(forward_peptides, aes(x = processed_dobj_id, y = StatMomentsAreaLogNorm)) +
  geom_boxplot() +
  labs(
    x = "Samples",
    y = "Relative Peptide Abundance (Normalized)",
    title = "Peptide relative abundances by sample"
  ) +
  theme(axis.text.x = element_blank())

# Unique peptide-to-protein pairs, limited to peptides that survived
# filtering and are present in forward_peptides
retained_peptides <- forward_peptides$Peptide_Sequence_with_Mods
peptide_protein_mapping <- unfiltered_results %>%
  filter(Peptide_Sequence_with_Mods %in% retained_peptides) %>%
  distinct(Peptide_Sequence_with_Mods, Protein)

head(peptide_protein_mapping)

# IDs of every data object used as input by the retrieved workflow records
workflow_inputs <- version_output %>%
  select(-c(has_output, version)) %>%
  unnest(has_input) %>%
  .$has_input

# Functional Annotation GFF files that belong to the biosamples under study
# and were used as input to one of those workflows
annotation_input_df <- data_objects %>%
  filter(
    data_object_type == "Functional Annotation GFF",
    biosample_id %in% proteomic_output_df$biosample_id,
    id %in% workflow_inputs
  ) %>%
  distinct(biosample_id, id, data_object_type, url)

head(annotation_input_df)

# Preview the features extracted from the second annotation file
paste("Reading from", annotation_input_df$url[2])

annotation_input_df$url[2] %>%
  gff_extract_features() %>%
  head()

# Extract ID / product / product_source from every distinct annotation file,
# keeping only the features whose ID is one of the identified proteins
gene_mapping <- iterate_file_extract(
  input_df = distinct(annotation_input_df, id, url),
  identifier_col = "id",
  url_col = "url",
  extract_cols = c("ID", "product", "product_source"),
  filter_col = "ID",
  filter_values = unique(peptide_protein_mapping$Protein),
  file_type = "gff"
)

# Attach the annotations to the peptide-protein pairs. The data object ID is
# dropped by distinct() because the mappings are not dataset specific.
annotation_mapping <- gene_mapping %>%
  inner_join(peptide_protein_mapping, by = join_by(ID == Protein)) %>%
  dplyr::rename(Protein = ID) %>%
  distinct(Peptide_Sequence_with_Mods, Protein, product, product_source)

head(annotation_mapping)

# Add the counts consumed by the razor logic function.
# Rows of annotation_mapping are already unique (it went through distinct()),
# so counting rows counts peptide-protein pairs.
annotation_mapping <- annotation_mapping %>%
  # Number of proteins each peptide maps to
  mutate(prot_count = n(), .by = Peptide_Sequence_with_Mods) %>%
  # Per protein: how many of its peptides are shared with other proteins
  # (redundant) and how many map to it alone (unique)
  mutate(
    redundant_pep_count = sum(prot_count > 1),
    unique_pep_count = sum(prot_count == 1),
    .by = Protein
  )

head(annotation_mapping)

# Razor protein assignments in long format
razor_mapping <- get_razor_protein(annotation_mapping)

# Roll up to one row per peptide: when a peptide has several razor results,
# their proteins, products and product sources are concatenated with ", "
razor_mapping <- razor_mapping %>%
  mutate(
    Razor_Protein = paste(Razor_Protein, collapse = ", "),
    product = paste(product, collapse = ", "),
    product_source = paste(product_source, collapse = ", "),
    .by = Peptide
  ) %>%
  distinct()

head(razor_mapping)

# Attach the razor protein and its annotation to each peptide abundance row
forward_peptides <- forward_peptides %>%
  right_join(
    razor_mapping,
    by = join_by(Peptide_Sequence_with_Mods == Peptide)
  ) %>%
  distinct()

head(forward_peptides)

# Protein rollup: peptide abundances are summed per razor protein on the
# linear scale, then the sums are log2-transformed again.
protein_abundances <- forward_peptides %>%
  # Undo the log2 transform of the normalized peptide abundances
  mutate(StatMomentsAreaNorm = 2 ^ StatMomentsAreaLogNorm) %>%
  # Total abundance of each razor protein within each data object
  mutate(
    StatMomentsAreaNormSum = sum(StatMomentsAreaNorm),
    .by = c(processed_dobj_id, Razor_Protein)
  ) %>%
  # One row per (data object, razor protein) with its annotation
  distinct(
    processed_dobj_id, product, product_source,
    Razor_Protein, StatMomentsAreaNormSum
  ) %>%
  # Final result: log2 of the summed abundance (StatMomentsAreaLogNormSum)
  mutate(StatMomentsAreaLogNormSum = log2(StatMomentsAreaNormSum)) %>%
  select(-StatMomentsAreaNormSum)

head(protein_abundances)

# Wide table: one row per razor protein, one column per data object
aggregated_proteomic_output <- protein_abundances %>%
  select(processed_dobj_id, Razor_Protein, StatMomentsAreaLogNormSum) %>%
  pivot_wider(
    names_from = "processed_dobj_id",
    values_from = "StatMomentsAreaLogNormSum"
  )

head(aggregated_proteomic_output)

# Retrieve the depth value recorded for each biosample
biosample_metadata <- get_results_by_id(
  collection = "biosample_set",
  match_id_field = "id",
  id_list = proteomic_output_df$biosample_id,
  fields = "id,depth.has_numeric_value"
) %>%
  # Flatten the nested depth field returned in the JSON response
  unnest(depth) %>%
  dplyr::rename(biosample_id = id, depth_m = has_numeric_value) %>%
  # Add data object IDs to connect biosample metadata to processed results
  left_join(
    select(proteomic_output_df, processed_dobj_id, biosample_id),
    by = join_by("biosample_id")
  )

head(biosample_metadata)

processed_dobj_id	name	data_object_type	url	biosample_id	in_manifest
<chr>	<chr>	<chr>	<chr>	<chr>	<chr>
nmdc:dobj-11-7wk9es16	nmdc_dobj-11-4kwfhk83_kaiko_msgfplus_syn_PlusSICStats.txt	Unfiltered Metaproteomics Results	https://nmdcdemo.emsl.pnnl.gov/proteomics/results/nmdc_dobj-11-4kwfhk83_kaiko_msgfplus_syn_PlusSICStats.txt	nmdc:bsm-13-0jw5n594	NULL
nmdc:dobj-11-ct23bs36	nmdc_dobj-11-4kwfhk83_nmdc_dobj-11-haabxj14_msgfplus_syn_PlusSICStats.txt	Unfiltered Metaproteomics Results	https://nmdcdemo.emsl.pnnl.gov/proteomics/results/2/nmdc_dobj-11-4kwfhk83_nmdc_dobj-11-haabxj14_msgfplus_syn_PlusSICStats.txt	nmdc:bsm-13-0jw5n594	NULL
nmdc:dobj-11-pxnhdj84	nmdc_dobj-11-4kwfhk83_nmdc_dobj-11-haabxj14_msgfplus_syn_PlusSICStats.txt	Unfiltered Metaproteomics Results	https://nmdcdemo.emsl.pnnl.gov/proteomics/results/nmdc_dobj-11-4kwfhk83_nmdc_dobj-11-haabxj14_msgfplus_syn_PlusSICStats.txt	nmdc:bsm-13-0jw5n594	nmdc:manif-11-7796sg87
nmdc:dobj-11-mg2dnn24	nmdc_dobj-11-k9e5nm54_nmdc_dobj-11-tt8ykk73_msgfplus_syn_PlusSICStats.txt	Unfiltered Metaproteomics Results	https://nmdcdemo.emsl.pnnl.gov/proteomics/results/2/nmdc_dobj-11-k9e5nm54_nmdc_dobj-11-tt8ykk73_msgfplus_syn_PlusSICStats.txt	nmdc:bsm-13-13145k83	NULL
nmdc:dobj-11-xm1yjv87	nmdc_dobj-11-k9e5nm54_nmdc_dobj-11-tt8ykk73_msgfplus_syn_PlusSICStats.txt	Unfiltered Metaproteomics Results	https://nmdcdemo.emsl.pnnl.gov/proteomics/results/nmdc_dobj-11-k9e5nm54_nmdc_dobj-11-tt8ykk73_msgfplus_syn_PlusSICStats.txt	nmdc:bsm-13-13145k83	nmdc:manif-11-7796sg87
nmdc:dobj-11-1rynqk87	nmdc_dobj-11-k9e5nm54_kaiko_msgfplus_syn_PlusSICStats.txt	Unfiltered Metaproteomics Results	https://nmdcdemo.emsl.pnnl.gov/proteomics/results/nmdc_dobj-11-k9e5nm54_kaiko_msgfplus_syn_PlusSICStats.txt	nmdc:bsm-13-13145k83	NULL

processed_dobj_id	name	data_object_type	url	biosample_id	in_manifest
<chr>	<chr>	<chr>	<chr>	<chr>	<chr>
nmdc:dobj-11-2k6s1505	nmdc_dobj-11-ms76kj12_nmdc_dobj-11-c5av0320_msgfplus_syn_PlusSICStats.txt	Unfiltered Metaproteomics Results	https://nmdcdemo.emsl.pnnl.gov/proteomics/results/nmdc_dobj-11-ms76kj12_nmdc_dobj-11-c5av0320_msgfplus_syn_PlusSICStats.txt	nmdc:bsm-13-8e1rjf10	nmdc:manif-11-7796sg87
nmdc:dobj-11-3ak2bc31	nmdc_dobj-11-steksn39_nmdc_dobj-11-d3ahex66_msgfplus_syn_PlusSICStats.txt	Unfiltered Metaproteomics Results	https://nmdcdemo.emsl.pnnl.gov/proteomics/results/nmdc_dobj-11-steksn39_nmdc_dobj-11-d3ahex66_msgfplus_syn_PlusSICStats.txt	nmdc:bsm-13-7qxjvr77	nmdc:manif-11-7796sg87
nmdc:dobj-11-4bwzhs42	nmdc_dobj-11-vs45wb84_nmdc_dobj-11-r3650k33_msgfplus_syn_PlusSICStats.txt	Unfiltered Metaproteomics Results	https://nmdcdemo.emsl.pnnl.gov/proteomics/results/nmdc_dobj-11-vs45wb84_nmdc_dobj-11-r3650k33_msgfplus_syn_PlusSICStats.txt	nmdc:bsm-13-an5fyr03	nmdc:manif-11-7796sg87
nmdc:dobj-11-7psagy79	nmdc_dobj-11-erc74h90_nmdc_dobj-11-sekqwq68_msgfplus_syn_PlusSICStats.txt	Unfiltered Metaproteomics Results	https://nmdcdemo.emsl.pnnl.gov/proteomics/results/nmdc_dobj-11-erc74h90_nmdc_dobj-11-sekqwq68_msgfplus_syn_PlusSICStats.txt	nmdc:bsm-13-3pvn5d70	nmdc:manif-11-7796sg87
nmdc:dobj-11-7x21d450	nmdc_dobj-11-wtsjqv17_nmdc_dobj-11-2hb27y88_msgfplus_syn_PlusSICStats.txt	Unfiltered Metaproteomics Results	https://nmdcdemo.emsl.pnnl.gov/proteomics/results/nmdc_dobj-11-wtsjqv17_nmdc_dobj-11-2hb27y88_msgfplus_syn_PlusSICStats.txt	nmdc:bsm-13-937ebx72	nmdc:manif-11-7796sg87
nmdc:dobj-11-bxyvx506	nmdc_dobj-11-xvmb4058_nmdc_dobj-11-bvya7a43_msgfplus_syn_PlusSICStats.txt	Unfiltered Metaproteomics Results	https://nmdcdemo.emsl.pnnl.gov/proteomics/results/nmdc_dobj-11-xvmb4058_nmdc_dobj-11-bvya7a43_msgfplus_syn_PlusSICStats.txt	nmdc:bsm-13-1p0tct86	nmdc:manif-11-7796sg87

id	manifest_category	type	description
<chr>	<chr>	<chr>	<chr>
nmdc:manif-11-7796sg87	instrument_run	nmdc:Manifest	collection of metaproteomic analyses from the same instrument run nmdc:sty-11-aygzgv51

ResultID	Scan	FragMethod	SpecIndex	Charge	PrecursorMZ	DelM	DelM_PPM	MH	Peptide	⋯	PeakMaxIntensity	PeakSignalToNoiseRatio	FWHMInScans	PeakArea	ParentIonIntensity	ParentIonMZ	StatMomentsArea	PeakScanStart	PeakScanEnd	PeakWidthMinutes
<dbl>	<dbl>	<chr>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<chr>	⋯	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
1	36947	HCD	1	3	968.4745	-0.00469	-1.61740	2902.410	K.NYSPYYNTIDDLKDQIVDLTVGNNK.T	⋯	6619700	106.000	62	503970000	4468100	968.47	453600000	36866	37232	0
2	30576	HCD	2	3	1017.1652	-0.01640	-5.38151	3048.494	K.VLYDAEISQIHQSVTDTNVILSMDNSR.N	⋯	2595300	343.300	115	227780000	2481100	1017.17	206570000	30497	30787	0
3	21671	HCD	3	3	1050.4807	0.02323	7.38083	3148.401	K.NADSTLHTVTSGTAEGGESGTVFDSSYMAAGK.T	⋯	2393100	165.900	14	51712000	1454200	1050.48	53478000	21598	21730	0
4	30168	HCD	4	2	1097.4955	-0.01073	-4.89393	2193.994	K.SAYPGQITSNMFCAGYLEGGK.D	⋯	10190000	203.600	146	1796600000	1814600	1097.50	1745300000	29842	30311	0
5	29918	HCD	5	2	1098.0001	-0.00482	-2.19575	2193.994	K.SAYPGQITSNMFCAGYLEGGK.D	⋯	12435000	420.900	150	2135100000	3193700	1098.00	2086400000	29842	30333	0
6	19208	HCD	6	3	856.4273	-0.01420	-5.53640	2566.278	R.SKEEAEALYHSKYEELQVTVGR.H	⋯	1158500	9.597	68	107740000	1045400	856.43	82117000	19134	19330	0

Charge	Scan	Peptide	Protein	MSGFDB_SpecEValue	StatMomentsArea	id	SpecID	Peptide_Sequence_with_Mods	Protein_Type
<dbl>	<dbl>	<chr>	<chr>	<dbl>	<dbl>	<chr>	<chr>	<chr>	<chr>
3	36947	K.NYSPYYNTIDDLKDQIVDLTVGNNK.T	Contaminant_K1C9_HUMAN	1.4548e-31	453600000	nmdc:dobj-11-2k6s1505	nmdc:dobj-11-2k6s1505_36947	NYSPYYNTIDDLKDQIVDLTVGNNK	None
3	30576	K.VLYDAEISQIHQSVTDTNVILSMDNSR.N	Contaminant_K22E_HUMAN	7.1111e-31	206570000	nmdc:dobj-11-2k6s1505	nmdc:dobj-11-2k6s1505_30576	VLYDAEISQIHQSVTDTNVILSMDNSR	None
3	21671	K.NADSTLHTVTSGTAEGGESGTVFDSSYMAAGK.T	nmdc:wfmgan-11-a71gkg84.1_003113_596_1117	1.3962e-28	53478000	nmdc:dobj-11-2k6s1505	nmdc:dobj-11-2k6s1505_21671	NADSTLHTVTSGTAEGGESGTVFDSSYMAAGK	Forward
2	30168	K.SAYPGQITSNMFCAGYLEGGK.D	Contaminant_TRYP_BOVIN	1.8971e-27	1745300000	nmdc:dobj-11-2k6s1505	nmdc:dobj-11-2k6s1505_30168	SAYPGQITSNMFCAGYLEGGK	None
2	29918	K.SAYPGQITSNMFCAGYLEGGK.D	Contaminant_TRYP_BOVIN	5.2953e-27	2086400000	nmdc:dobj-11-2k6s1505	nmdc:dobj-11-2k6s1505_29918	SAYPGQITSNMFCAGYLEGGK	None
3	19208	R.SKEEAEALYHSKYEELQVTVGR.H	Contaminant_K22E_HUMAN	5.2963e-27	82117000	nmdc:dobj-11-2k6s1505	nmdc:dobj-11-2k6s1505_19208	SKEEAEALYHSKYEELQVTVGR	None

Proteomic Data Aggregation and Visualization

1) Assess background information and collect data for an example study of riverbed sediment along the Columbia River

2) Apply a spectral probability filter across the data that optimizes the number of identifications for an FDR of 0.05

3) Collapse to unique peptides and normalize their relative abundance

4) Extract functional gene annotations for proteins

5) Generate annotation and protein mappings for peptides using "Razor" strategy

6) Perform protein rollup and summarize into a final aggregated table of relative protein abundance

Final aggregated table of relative protein abundance

A tibble: 6 × 6
SpecID	Peptide_Sequence_with_Mods	MSGFDB_SpecEValue	Protein_Type	StatMomentsArea	processed_dobj_id
<chr>	<chr>	<dbl>	<chr>	<dbl>	<chr>
nmdc:dobj-11-2k6s1505_10005	PPDERERSEEAEKRDEERDRVRDELLAGAEEGEPR	4.9943e-07	Forward	1238500000	nmdc:dobj-11-2k6s1505
nmdc:dobj-11-2k6s1505_10008	LRAGSEPR	1.1726e-07	Forward	123460000	nmdc:dobj-11-2k6s1505
nmdc:dobj-11-2k6s1505_10009	WAKEIENQK	2.4739e-07	Forward	57316000	nmdc:dobj-11-2k6s1505
nmdc:dobj-11-2k6s1505_10011	CWRYLISN	4.1349e-07	Forward	6632300000	nmdc:dobj-11-2k6s1505
nmdc:dobj-11-2k6s1505_10013	EFVDIISYMENENHSDIEYPLLYKWDSKSTVINR	1.0215e-07	Reversed	737140000	nmdc:dobj-11-2k6s1505
nmdc:dobj-11-2k6s1505_10014	WLIKELDDTK	3.5021e-07	Forward	82751000	nmdc:dobj-11-2k6s1505

A tibble: 6 × 5
SpecID	Peptide_Sequence_with_Mods	MSGFDB_SpecEValue	StatMomentsArea	processed_dobj_id
<chr>	<chr>	<dbl>	<dbl>	<chr>
nmdc:dobj-11-2k6s1505_10005	PPDERERSEEAEKRDEERDRVRDELLAGAEEGEPR	4.9943e-07	1238500000	nmdc:dobj-11-2k6s1505
nmdc:dobj-11-2k6s1505_10008	LRAGSEPR	1.1726e-07	123460000	nmdc:dobj-11-2k6s1505
nmdc:dobj-11-2k6s1505_10009	WAKEIENQK	2.4739e-07	57316000	nmdc:dobj-11-2k6s1505
nmdc:dobj-11-2k6s1505_10011	CWRYLISN	4.1349e-07	6632300000	nmdc:dobj-11-2k6s1505
nmdc:dobj-11-2k6s1505_10014	WLIKELDDTK	3.5021e-07	82751000	nmdc:dobj-11-2k6s1505
nmdc:dobj-11-2k6s1505_10022	IDGIDDVK	1.4727e-07	230150000	nmdc:dobj-11-2k6s1505

A tibble: 6 × 5
SpecID	Peptide_Sequence_with_Mods	MSGFDB_SpecEValue	StatMomentsArea	processed_dobj_id
<chr>	<chr>	<dbl>	<dbl>	<chr>
nmdc:dobj-11-2k6s1505_10013	EFVDIISYMENENHSDIEYPLLYKWDSKSTVINR	1.0215e-07	737140000	nmdc:dobj-11-2k6s1505
nmdc:dobj-11-2k6s1505_10019	QWHPNFLR	3.4485e-07	3220300000	nmdc:dobj-11-2k6s1505
nmdc:dobj-11-2k6s1505_10026	LAEREGGAR	1.4512e-08	960480000	nmdc:dobj-11-2k6s1505
nmdc:dobj-11-2k6s1505_10028	ISTYIDEK	6.8431e-09	125230000	nmdc:dobj-11-2k6s1505
nmdc:dobj-11-2k6s1505_10029	AIEEFFNMHCAFFFSVR	2.4556e-07	65250000	nmdc:dobj-11-2k6s1505
nmdc:dobj-11-2k6s1505_10030	NFFGGMPRGK	3.3443e-07	54785000	nmdc:dobj-11-2k6s1505

A tibble: 6 × 3
processed_dobj_id	Peptide_Sequence_with_Mods	StatMomentsArea
<chr>	<chr>	<dbl>
nmdc:dobj-11-2k6s1505	VYLGAETTR	323402000
nmdc:dobj-11-2k6s1505	DTLPHTVTSGTGPTDPNSAK	52891000
nmdc:dobj-11-2k6s1505	SQVSEGSSLADGVK	1259380000
nmdc:dobj-11-2k6s1505	TLSDYNIQK	250540000
nmdc:dobj-11-2k6s1505	SALQNAASIAK	155670000
nmdc:dobj-11-2k6s1505	IETGELAGYK	43413000