import math
import re
import tempfile
import warnings
from collections import Counter
from io import StringIO
from pathlib import Path
from urllib.parse import parse_qsl, urlencode

import gffpandas.gffpandas as gffpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from scipy.stats import hypergeom, false_discovery_control

def get_reports_biosample(biosample_id):
    """
    Get the protein reports, GFF and FASTA data objects for the given biosample.
    Result is a dictionary with a key for the GFF, FASTA and Protein Reports.

    We first find the studyid associated with the biosample, fetch the data objects associated with the studyid,
    and then subset those to the specific biosample given.

    Keyword arguments:
    biosample_id -- the NMDC biosample ID of interest. Biosample MUST have an associated protein report!

    Returns:
    A dictionary with the keys 'protein_reports' (list of data-object dictionaries),
    'FASTA' (a single data-object dictionary) and 'GFF' (the single functional-annotation data-object dictionary).

    Raises:
    requests.HTTPError -- if the biosample request returns an error status.
    KeyError -- if the study has no protein-report entry for this biosample.
    ValueError -- if the biosample has no associated study, or does not have exactly one functional annotation GFF.

    """

    biosample_id_ = biosample_id.replace(':', '%3A')
    url = f'https://api.microbiomedata.org/biosamples/{biosample_id_}'
    resp = requests.get(url, timeout=120)
    # Fail loudly on 4xx/5xx instead of trying to index into an error payload.
    resp.raise_for_status()
    studyids = resp.json()['associated_studies']
    if not studyids:
        raise ValueError(f'Biosample {biosample_id} has no associated studies.')
    if len(studyids) > 1:
        print("More than one study associated with biosample.")
        print(studyids)
        print(f"Selecting {studyids[0]}")
    studyid = studyids[0]

    study_results = get_reports_studyid(studyid)
    key = f'biosample_{biosample_id}'
    if key not in study_results:
        # get_reports_studyid only keeps biosamples that have at least one protein report.
        raise KeyError(f'No protein report found for biosample {biosample_id} in study {studyid}.')
    all_results = study_results[key]

    # Several GFF-type objects can be attached; keep only the functional annotation.
    gffs = [x for x in all_results['GFF'] if 'Functional Annotation for ' in x.get('description', '')]
    if len(gffs) != 1:
        raise ValueError(f'Expected exactly one functional annotation GFF for biosample {biosample_id}, found {len(gffs)}.')
    all_results['FASTA'] = all_results['FASTA'][0]
    all_results['GFF'] = gffs[0]
    return all_results



def get_reports_studyid(studyid):

    """
    Get all protein report data objects for all biosamples from the given studyid.
    Result is a dictionary with a key (`biosample_<id>`) for every biosample that has a protein report.
    Each value holds three lists of data-object dictionaries: 'protein_reports', 'FASTA' and 'GFF'.

    Keyword arguments:
    studyid -- the NMDC study ID of interest.

    Raises:
    requests.HTTPError -- if the study request returns an error status.
    Whatever `check_pr_fasta` raises when a biosample's protein reports and FASTA are inconsistent.

    """

    studyid = studyid.replace(':', '%3A')
    url = f'https://api.microbiomedata.org/data_objects/study/{studyid}'
    resp = requests.get(url, timeout=120)
    # Fail loudly on 4xx/5xx instead of iterating over an error payload.
    resp.raise_for_status()

    all_results = dict()
    for biosample in resp.json():
        new_entry = {'protein_reports': [], 'FASTA': [], 'GFF': []}
        for dobj in biosample['data_objects']:
            dobj_type = dobj.get('data_object_type')
            if dobj_type is None:
                continue
            if dobj_type == 'Protein Report':
                new_entry['protein_reports'].append(dobj)
            elif dobj_type == 'Annotation Amino Acid FASTA': # Excludes contaminant FASTAs
                new_entry['FASTA'].append(dobj)
            elif 'GFF' in dobj_type:
                new_entry['GFF'].append(dobj)
        # Only biosamples with at least one protein report are kept.
        if new_entry['protein_reports']:
            check_pr_fasta(new_entry)
            all_results[f"biosample_{biosample['biosample_id']}"] = new_entry
    return all_results

def check_pr_fasta(new_entry):
    """
    Internal function to check that:
    1. There's only one FASTA associated with a biosample and
    2. That FASTA has ID listed in the name of protein report for that biosample.

    Keyword arguments:
    new_entry -- dictionary with the keys 'protein_reports' and 'FASTA', each a list of data-object dictionaries.

    Raises:
    ValueError -- if either condition does not hold. Explicit exceptions are used rather than `assert`
                  so the checks still run under `python -O` and report what went wrong.

    """
    fasta_objs = new_entry['FASTA']
    fasta_obj_ids = sorted({_extract_fasta_id(x['name']) for x in new_entry['protein_reports']})
    if len(fasta_objs) != 1:
        raise ValueError(f'Expected exactly one FASTA for the biosample, found {len(fasta_objs)}.')
    if len(fasta_obj_ids) != 1:
        raise ValueError(f'Protein reports reference {len(fasta_obj_ids)} different FASTA ids: {fasta_obj_ids}.')
    if fasta_objs[0]['id'] != fasta_obj_ids[0]:
        raise ValueError(f"FASTA id {fasta_objs[0]['id']!r} does not match the id in the protein report names ({fasta_obj_ids[0]!r}).")

def _extract_fasta_id(protein_report_name):
    """ 
    Interal function to extract the fasta nmdc ID from the protein report name

    """
    try:
        out = re.search('nmdc_dobj.*_nmdc_dobj(.*)_Protein_Report.tsv', protein_report_name).group(1)
        out = f'nmdc:dobj{out}'
    except:
        out = ''
    return out

# Look up the protein report, FASTA and GFF data objects for an example biosample.
test = get_reports_biosample('nmdc:bsm-13-bgefg837')

# Notebook-style display of the returned dictionary.
test

{'protein_reports': [{'id': 'nmdc:dobj-11-xnhnfq72',
   'type': 'nmdc:DataObject',
   'name': 'nmdc_dobj-11-9gcej008_nmdc_dobj-11-j5mh8584_Protein_Report.tsv',
   'description': 'Aggregated protein lists from MSGF+ search results filtered to ~5% FDR',
   'file_size_bytes': 5470064,
   'md5_checksum': '92a123898487447501b27ff97847df23',
   'data_object_type': 'Protein Report',
   'url': 'https://nmdcdemo.emsl.pnnl.gov/proteomics/results/nmdc_dobj-11-9gcej008_nmdc_dobj-11-j5mh8584_Protein_Report.tsv',
   'in_manifest': ['nmdc:manif-11-91gc1f77'],
   'data_category': 'processed_data'},
  {'id': 'nmdc:dobj-11-hw6eyg62',
   'type': 'nmdc:DataObject',
   'name': 'nmdc_dobj-11-421jnb25_nmdc_dobj-11-j5mh8584_Protein_Report.tsv',
   'description': 'Aggregated protein lists from MSGF+ search results filtered to ~5% FDR',
   'file_size_bytes': 5486468,
   'md5_checksum': '80f3745bd6b853ad8f842756106d03b6',
   'data_object_type': 'Protein Report',
   'url': 'https://nmdcdemo.emsl.pnnl.gov/proteomics/results/nmdc_dobj-11-421jnb25_nmdc_dobj-11-j5mh8584_Protein_Report.tsv',
   'in_manifest': ['nmdc:manif-11-91gc1f77'],
   'data_category': 'processed_data'},
  {'id': 'nmdc:dobj-11-yw81tm10',
   'type': 'nmdc:DataObject',
   'name': 'nmdc_dobj-11-421jnb25_nmdc_dobj-11-j5mh8584_Protein_Report.tsv',
   'description': 'Aggregated protein lists from MSGF+ search results filtered to ~5% FDR',
   'data_object_type': 'Protein Report',
   'file_size_bytes': 62324237,
   'md5_checksum': '65049e1b2ce7a5c407c5121bc9e405b0',
   'url': 'https://nmdcdemo.emsl.pnnl.gov/proteomics/results/2/nmdc_dobj-11-421jnb25_nmdc_dobj-11-j5mh8584_Protein_Report.tsv',
   'was_generated_by': 'nmdc:wfmp-11-fz8k5p27.2',
   'data_category': 'processed_data'},
  {'id': 'nmdc:dobj-11-02f4sr80',
   'type': 'nmdc:DataObject',
   'name': 'nmdc_dobj-11-9gcej008_nmdc_dobj-11-j5mh8584_Protein_Report.tsv',
   'description': 'Aggregated protein lists from MSGF+ search results filtered to ~5% FDR',
   'data_object_type': 'Protein Report',
   'file_size_bytes': 57999620,
   'md5_checksum': '496450d6f24f5c98730d1d7bf7f904a2',
   'url': 'https://nmdcdemo.emsl.pnnl.gov/proteomics/results/2/nmdc_dobj-11-9gcej008_nmdc_dobj-11-j5mh8584_Protein_Report.tsv',
   'was_generated_by': 'nmdc:wfmp-11-x0zhd078.2',
   'data_category': 'processed_data'}],
 'FASTA': {'id': 'nmdc:dobj-11-j5mh8584',
  'type': 'nmdc:DataObject',
  'name': 'nmdc_wfmgan-11-pmh0a992.1_proteins.faa',
  'description': 'FASTA Amino Acid File for nmdc:wfmgan-11-pmh0a992.1',
  'file_size_bytes': 569196371,
  'md5_checksum': 'd23180f55fe3d7044169b8a3cc82a42d',
  'data_object_type': 'Annotation Amino Acid FASTA',
  'url': 'https://data.microbiomedata.org/data/nmdc:omprc-13-4kkhhh55/nmdc:wfmgan-11-pmh0a992.1/nmdc_wfmgan-11-pmh0a992.1_proteins.faa',
  'data_category': 'processed_data'},
 'GFF': {'id': 'nmdc:dobj-11-jq8ct440',
  'type': 'nmdc:DataObject',
  'name': 'nmdc_wfmgan-11-pmh0a992.1_functional_annotation.gff',
  'description': 'Functional Annotation for nmdc:wfmgan-11-pmh0a992.1',
  'file_size_bytes': 655257131,
  'md5_checksum': '36bdc0dcb731ae0126f436305e080edb',
  'data_object_type': 'Functional Annotation GFF',
  'url': 'https://data.microbiomedata.org/data/nmdc:omprc-13-4kkhhh55/nmdc:wfmgan-11-pmh0a992.1/nmdc_wfmgan-11-pmh0a992.1_functional_annotation.gff',
  'data_category': 'processed_data'}}

def read_protein_report(results_dict, workflow_type = 'matched_metagenome'):
    """
    Function to read the protein reports.

    Reports are grouped by their `name` (one name per raw file). If every raw file has exactly one
    report, those are read. Otherwise, if every raw file has a report of the requested
    `workflow_type`, the most recently completed report of that type is read for each raw file.
    If neither holds, the available workflow types are printed and nothing is read.

    Keyword arguments:
    results_dict -- The output of the function `get_reports_biosample`, which contains the urls for the protein reports.
    workflow_type -- The metaproteomics analysis category to prefer when a raw file has several reports.

    Returns:
    A pandas dataframe with all selected reports concatenated (a `razor_protein` column is copied
    from `best_protein` when only the latter exists), or None when no report could be selected.

    Side effects:
    Each dictionary in `results_dict['protein_reports']` gets the keys 'workflow_type' and 'completion_date'.

    Raises:
    requests.HTTPError -- if a workflow lookup or a report download returns an error status.

    """
    ## Organize all results by raw file:
    by_name = dict()
    for protein_report in results_dict['protein_reports']:
        if 'was_generated_by' in protein_report:
            wrkflw_id = protein_report['was_generated_by']
            url = f'https://api.microbiomedata.org/objects/{wrkflw_id.replace("nmdc:", "")}'
            resp = requests.get(url, timeout=120)
            resp.raise_for_status()
            data = resp.json()
            protein_report['workflow_type'] = data['metaproteomics_analysis_category']
            protein_report['completion_date'] = data['ended_at_time']
        else:
            protein_report['workflow_type'] = ''
            protein_report['completion_date'] = ''
        by_name.setdefault(protein_report['name'], []).append(protein_report)

    if not by_name:
        # Without this guard the function would fall through to `return out` with `out` never assigned.
        print("No protein reports found in the supplied results.")
        return None

    to_read = dict()
    ## If only ONE report is present per raw file, we use those
    if all(len(reports) == 1 for reports in by_name.values()):
        to_read = {name: reports[0] for name, reports in by_name.items()}
    ## If there are multiple protein reports per raw file, we check if the request workflow type is present for each raw file. If so, we use the latest report for that workflow type.
    elif all(any(report['workflow_type'] == workflow_type for report in reports) for reports in by_name.values()):
        for name, reports in by_name.items():
            for protein_report in reports:
                if protein_report['workflow_type'] != workflow_type:
                    continue
                current = to_read.get(name)
                ## If multiple protein reports from the same workflow type AND raw file are found, use the latest one!
                if current is None or current['completion_date'] <= protein_report['completion_date']:
                    to_read[name] = protein_report
    ## If multiple report per raw file exist, AND if not all raw files have a protein report with the requested workflow type, THEN we ask for more input from the user:
    else:
        print(f"Multiple protein reports per raw file, and not all have the requested workflow type ({workflow_type}). Please specify another metaproteomic_workflow_type to use.")
        for name, reports in by_name.items():
            workflow_types = [y['workflow_type'] for y in reports]
            print(f'{name} has {workflow_types} workflow_types')
        return None

    dfs = []
    for protein_report in to_read.values():
        resp = requests.get(protein_report['url'], timeout=300)
        resp.raise_for_status()
        dfs.append(pd.read_csv(StringIO(resp.text), sep="\t"))

    # Concatenate once after all downloads instead of rebuilding the frame on every iteration.
    out = pd.concat(dfs)
    if 'best_protein' in out.columns and 'razor_protein' not in out.columns:
        out['razor_protein'] = out['best_protein']
    return out

def read_gff(results_dict):
    """
    Function to read the GFF.

    The GFF is downloaded to a temporary file, parsed with gffpandas, and its `attributes`
    column is expanded into one column per attribute.

    Keyword arguments:
    results_dict -- The output of the function `get_reports_biosample`, which contains the urls for the GFF.

    Returns:
    A pandas dataframe with the GFF columns plus one column per parsed attribute (the raw `attributes` column is dropped).

    Raises:
    requests.HTTPError -- if the download returns an error status.

    """
    url = results_dict['GFF']['url']
    # A private temporary directory avoids clashing with an existing ./temp_gff.gff and is
    # removed even when the download or the parsing fails.
    with tempfile.TemporaryDirectory() as tmp_dir:
        local_temp = Path(tmp_dir) / 'temp_gff.gff'
        # Stream the body to disk instead of holding the whole file in memory.
        with requests.get(url, stream=True, timeout=300) as resp:
            resp.raise_for_status()
            # Write raw bytes: decoding chunk by chunk can fail when a multi-byte
            # UTF-8 character is split across two chunks.
            with local_temp.open('wb') as file:
                for chunk in resp.iter_content(chunk_size=1 << 20):
                    file.write(chunk)

        # Silence the mixed-dtype warning only while parsing, not for the rest of the session.
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', category=pd.errors.DtypeWarning)
            df = gffpd.read_gff3(local_temp)
            df = df.attributes_to_columns()
    df = df.drop('attributes', axis=1)
    return df

# Load the protein reports and the functional annotation GFF for the example biosample.
protein_report = read_protein_report(test)
functional_gff = read_gff(test)

# Notebook-style preview of the first seven rows of each table.
protein_report[0:7]

functional_gff[0:7]

def find_overrep(protein_report, gff, annotation_category):
    """ 
    Given an `annotation_category` (like pfam), this function
    We count the number of detected proteins associated with each of the annotations from the given `annotation_category`.
    Then for each annotation, we idealize the process of protein identification as random sampling without replacement, and
    use the hypergeometric distribution to estimate the probability that we detect at least the observed number of proteins from that annotation.
    The results contain 4 columns, the annotation name, a fraction showing the number of annotated proteins detected over the total number of annotated proteins, 
    the pvalue and finally the BH adjusted pvalue.

    Keyword arguments:
    protein_report -- The protein report (pandas dataframe). The output of the function `read_protein_report`.
    gff -- The gff (pandas dataframe). The output of the function `read_gff`.
    annotation_category -- Any column name from the gff.

    """

    if (annotation_category == 'attributes'):
        avail = [column for column in gff.columns if column not in ['attributes']]
        print("The attributes in the gff have been parsed by gffpandas. Each category can be found in a single column.")
        print(f'These are the possible columns {avail}')
        return None
    functional_gff = gff
    gff_annotations = functional_gff[f'{annotation_category}'].tolist()
    gff_annotations = [x for x in gff_annotations if x is not None]
    gff_annotations = [str(x) for x in gff_annotations]
    annotation_counts = dict()

    for annotation in gff_annotations:
        if annotation is not None:
            annotation_split = annotation.split(',')
            for single_annotation in annotation_split:
                if single_annotation not in annotation_counts.keys():
                    annotation_counts[single_annotation] = 1
                else:
                    annotation_counts[single_annotation] += 1

    # a = list(yy.ID)
    functional_gff.index = functional_gff.ID
    identified_proteins = list(set([x for x in protein_report.razor_protein if 'Contaminant' not in str(x)]))
    identified_annotations = functional_gff.loc[identified_proteins][f'{annotation_category}'].tolist()
    identified_annotations = [x for x in identified_annotations if x is not None]
    identified_annotations = [str(x) for x in identified_annotations]

    if [x for x in identified_annotations if x is not None] == []:
        print(f'No proteins with annotations from the supplied colum ({annotation_category}) were identified in the protein report.')

    id_annotation_counts = dict()
    for annotation in identified_annotations:
        if annotation is not None:
            annotation_split = annotation.split(',')
            for single_annotation in annotation_split:
                if single_annotation not in id_annotation_counts.keys():
                    id_annotation_counts[single_annotation] = 1
                else:
                    id_annotation_counts[single_annotation] += 1

    total_id_proteins = len(identified_proteins)
    total_proteins = len(functional_gff)
    overrep_results = dict()

    for annotation in id_annotation_counts.keys():
        M, m, N, x = total_proteins, annotation_counts[annotation], total_id_proteins, id_annotation_counts[annotation]
        pval = float(hypergeom.sf(x-1, M, m, N))
        ann_results = dict()
        ann_results['pvalue'] = pval
        ann_results['ratio'] = f'{x}/{m}'
        overrep_results[annotation] = ann_results

    adj_pvals = false_discovery_control([x['pvalue'] for x in overrep_results.values()], method = 'BH')

    for annotation, adj_pval in zip(id_annotation_counts.keys(), adj_pvals):
        overrep_results[annotation]['adj_pval'] = adj_pval

    overrep_results_df = pd.DataFrame({'annotation' : list(overrep_results.keys()),
                                       'ratio' : [x['ratio'] for x in overrep_results.values()],
                                       'pvalue' : [x['pvalue'] for x in overrep_results.values()],
                                       'adj_pvalue' : [x['adj_pval'] for x in overrep_results.values()]})
    
    return overrep_results_df

# Over-representation analysis on the 'pfam' column of the GFF.
overrep_results_df_pfam = find_overrep(protein_report, functional_gff, 'pfam')
overrep_results_df_pfam

# Same analysis on the 'ko' column; preview the first seven rows.
overrep_results_df_ko = find_overrep(protein_report, functional_gff, 'ko')
overrep_results_df_ko[0:7]

# List the GFF columns; any of them can be passed as `annotation_category`.
functional_gff.columns

Index(['seq_id', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase',
       'ID', 'Parent', 'accession', 'average_repeat_length',
       'average_spacer_length', 'bound_moiety', 'cath_funfam', 'codon', 'cog',
       'e-value', 'ec_number', 'intron_end', 'intron_start', 'ko',
       'median_repeat_length', 'median_spacer_length', 'model', 'model_end',
       'model_start', 'ncRNA_class', 'note', 'number_of_repeats', 'partial',
       'pfam', 'product', 'product_source', 'regulatory_class', 'smart',
       'start_type', 'superfamily', 'tigrfam', 'translation_table',
       'used_search_mode'],
      dtype='object')

# Over-representation analysis on the 'superfamily' column of the GFF.
overrep_results_df_superfam = find_overrep(protein_report, functional_gff, 'superfamily')
overrep_results_df_superfam

def plot_results(results, title = '', N = 20):
    """
    Given the results table from the function `find_overrep`, we plot the top N most significant results.

    The left panel shows the number of detected proteins per annotation (the numerator of `ratio`),
    the right panel shows -log10 of the adjusted pvalue for the same annotations.

    Keyword arguments:
    results -- The output of the function `find_overrep`.
    title -- Title placed above both panels.
    N -- Number of annotations to plot.

    """
    results = results.sort_values('pvalue')
    results = results[0:N]
    if results.empty:
        # Nothing to draw; without this guard max()/min() below fail on an empty list.
        print('No results to plot.')
        return None

    counts = [int(r.split('/')[0]) for r in results.ratio]
    fig, (ax1, ax2) = plt.subplots(1,2)
    ax1.barh(results.annotation, counts)
    ax1.set_xlabel("Number of proteins")
    ## Since some pvalues are zero, coming out of the scipy hypergeom cdf, we add a little bit to avoid the singularity
    log_pvals = [-math.log10(pval + 1e-200) for pval in results.adj_pvalue]
    ax2.barh(results.annotation, log_pvals)
    # The annotation labels are already shown on the left panel.
    ax2.set_yticks([])
    ax2.set_xlabel('adj_pvalue')
    max_log, min_log = max(log_pvals), min(log_pvals)
    tick_distance = (max_log - min_log)*0.1
    # Three integer tick positions spread over the plotted range, shown as powers of ten.
    x_ticks = [max(int(min_log - 0.7*tick_distance), 1), max(int(min_log + 4*tick_distance), 2), max(int(min_log + 8.5*tick_distance), 3)]
    ax2.set_xticks(x_ticks, labels = ['10$^{-' + f'{x_ticks[0]}' + '}$', '10$^{-' + f'{x_ticks[1]}' + '}$', '10$^{-' + f'{x_ticks[2]}' + '}$'], fontsize = 9)
    fig.suptitle(title)
    plt.show()

# Plot the 20 most significant pfam annotations (default N).
plot_results(overrep_results_df_pfam, title = "PFAM")

# Same plot with the top 30 annotations.
plot_results(overrep_results_df_pfam, title = "PFAM", N = 30)

# Plot the 20 most significant KO annotations.
plot_results(overrep_results_df_ko, title = "KO")

def overlap(terms, gff_df, protein_report):
    """
    Given a list of annotation terms, we compute the overlap between all the proteins belonging to the annotations in the list.
    By 'overlap' between two sets A and B of proteins, we mean the proportion of the union of A and B in common between A and B.
    Ie, |A intersect B|/|A union B|.

    The overlap is computed twice: over all proteins in the GFF (background) and restricted to the
    identified, non-contaminant proteins of the protein report. The identified overlap is drawn as a
    clustered heatmap; it is taken to be 0 when no identified protein belongs to either term.

    Keyword arguments:
    terms -- A list of terms found in the GFF.
    gff_df -- the functional annotation GFF
    protein_report -- the protein report.

    Returns:
    None. A message is printed instead of a plot when fewer than two of the terms are found in the GFF.

    """
    # str() keeps missing (NaN) razor proteins from raising a TypeError in the substring test.
    identified_proteins = {x for x in protein_report.razor_protein if 'Contaminant' not in str(x)}
    wanted = set(terms)
    term_members = dict()
    likely_cols = ['ko', 'pfam']

    ## Extract the protein members of the given terms
    # Search the likely columns first, then the remaining ones in GFF order, and stop
    # as soon as every requested term has been found.
    ordered_cols = [col for col in likely_cols if col in gff_df.columns]
    ordered_cols += [col for col in gff_df.columns if col not in likely_cols]
    for col in ordered_cols:
        if wanted.issubset(term_members):
            break
        has_value = gff_df[col].notna()
        gff_annotations = gff_df.loc[has_value, col].tolist()
        proteins = gff_df.loc[has_value, 'ID'].tolist()
        for annotation, protein in zip(gff_annotations, proteins):
            for single_annotation in str(annotation).split(','):
                if single_annotation in wanted:
                    term_members.setdefault(single_annotation, set()).add(protein)

    not_found = wanted - set(term_members.keys())
    if len(not_found) > 0:
        print(f'Warning, did not find the following terms in the GFF: {not_found}')

    found_terms = list(term_members.keys())
    if len(found_terms) < 2:
        print('At least two terms must be found in the GFF to compute an overlap.')
        return None

    pair_overlap = dict()
    for i, term1 in enumerate(found_terms):
        pair_overlap[(term1, term1)] = (1, 1, '', '')
        for term2 in found_terms[0:i]:
            common_members = term_members[term1] & term_members[term2]
            all_members = term_members[term1] | term_members[term2]
            identified_all = all_members & identified_proteins
            x = len(common_members)/len(all_members)
            # Neither term may have an identified member; report no overlap instead of dividing by zero.
            y = len(common_members & identified_proteins)/len(identified_all) if identified_all else 0.0

            if (x > 0):
                lab_x, lab_y = str(round(x, 2)), str(round(y, 2))
            else:
                lab_x, lab_y = '', ''
            pair_overlap[(term1, term2)] = (x, y, lab_x, lab_y)
            pair_overlap[(term2, term1)] = (x, y, lab_x, lab_y)

    output_df = pd.DataFrame({'term1': [x[0] for x in pair_overlap.keys()], 'term2': [x[1] for x in pair_overlap.keys()],
                              'overlap_background': [x[0] for x in pair_overlap.values()], 'overlap_identified': [x[1] for x in pair_overlap.values()],
                              'overlap_background_lab': [x[2] for x in pair_overlap.values()], 'overlap_identified_lab': [x[3] for x in pair_overlap.values()]})

    x = output_df.pivot(index = 'term1', columns = 'term2', values = 'overlap_identified')
    labs = output_df.pivot(index = 'term1', columns = 'term2', values = 'overlap_identified_lab')
    sns.clustermap(x, cmap = 'Blues', annot = labs, fmt = '')

# Draw the overlap heatmap for a hand-picked mix of KO and pfam terms.
overlap(['KO:K04077', 'PF00118', 'PF13620', 'PF13407', 'PF03144', 'KO:K02358', 'KO:K01915', 'KO:K02112'], functional_gff, protein_report)

	DatasetName	razor_protein	Product	EC_Number	pfam	KO	COG	GeneCount	all_proteins	AnnotationList	UniquePeptideCount	SummedSpectraCounts	SummedPeptideMASICAbundances
0	SpruceW_P4_15A_22Jun17_Pippin_17-04-06	nmdc:wfmgan-11-pmh0a992.1_0015462_505_2979	ATP-dependent Clp protease ATP-binding subunit...	NaN	PF00004,PF02861,PF07724,PF10431,PF17871	KO:K03696	COG0542	397	nmdc:wfmgan-11-pmh0a992.1_0000039_126009_12865...	gene_name=nmdc:wfmgan-11-pmh0a992.1_0018823_20...	7.0	8.0	1.434427e+10
1	SpruceW_P4_15A_22Jun17_Pippin_17-04-06	nmdc:wfmgan-11-pmh0a992.1_0001435_14413_16890	ATP-dependent Clp protease ATP-binding subunit...	NaN	PF00004,PF02861,PF07724,PF10431,PF17871	KO:K03696	COG0542	397	nmdc:wfmgan-11-pmh0a992.1_0000039_126009_12865...	gene_name=nmdc:wfmgan-11-pmh0a992.1_0669678_1_...	7.0	8.0	1.434427e+10
2	SpruceW_P4_15A_22Jun17_Pippin_17-04-06	nmdc:wfmgan-11-pmh0a992.1_0004591_5018_7468	ATP-dependent Clp protease ATP-binding subunit...	NaN	PF00004,PF02861,PF07724,PF10431,PF17871	KO:K03696	COG0542	387	nmdc:wfmgan-11-pmh0a992.1_0000039_126009_12865...	gene_name=nmdc:wfmgan-11-pmh0a992.1_0477425_2_...	7.0	8.0	1.498534e+10
3	SpruceW_P4_15A_22Jun17_Pippin_17-04-06	nmdc:wfmgan-11-pmh0a992.1_0000119_69369_71816	ATP-dependent Clp protease ATP-binding subunit...	NaN	PF00004,PF02861,PF07724,PF10431,PF17871	KO:K03696	COG0542	387	nmdc:wfmgan-11-pmh0a992.1_0000039_126009_12865...	gene_name=nmdc:wfmgan-11-pmh0a992.1_0557305_2_...	7.0	8.0	1.498534e+10
4	SpruceW_P4_15A_22Jun17_Pippin_17-04-06	nmdc:wfmgan-11-pmh0a992.1_0087757_1_1518	ATP-dependent Clp protease ATP-binding subunit...	NaN	PF00004,PF07724,PF17871	KO:K03696	COG0542	387	nmdc:wfmgan-11-pmh0a992.1_0000039_126009_12865...	gene_name=nmdc:wfmgan-11-pmh0a992.1_0000431_27...	7.0	8.0	1.498534e+10
5	SpruceW_P4_15A_22Jun17_Pippin_17-04-06	nmdc:wfmgan-11-pmh0a992.1_0002529_5667_8111	ATP-dependent Clp protease ATP-binding subunit...	NaN	PF00004,PF02861,PF07724,PF10431,PF17871	KO:K03696	COG0542	387	nmdc:wfmgan-11-pmh0a992.1_0000039_126009_12865...	gene_name=nmdc:wfmgan-11-pmh0a992.1_1552042_2_...	7.0	8.0	1.498534e+10
6	SpruceW_P4_15A_22Jun17_Pippin_17-04-06	nmdc:wfmgan-11-pmh0a992.1_0002580_865_3312	ATP-dependent Clp protease ATP-binding subunit...	NaN	PF00004,PF02861,PF07724,PF10431,PF17871	KO:K03696	COG0542	387	nmdc:wfmgan-11-pmh0a992.1_0000039_126009_12865...	gene_name=nmdc:wfmgan-11-pmh0a992.1_1163833_2_...	7.0	8.0	1.498534e+10

	seq_id	source	type	start	end	score	strand	ID	Parent	...	pfam	product	product_source	regulatory_class	smart	start_type	superfamily	tigrfam	translation_table	used_search_mode
0	nmdc:wfmgan-11-pmh0a992.1_0000001	GeneMark.hmm-2 v1.25_lic	CDS	2	127	3.81	+	nmdc:wfmgan-11-pmh0a992.1_0000001_2_127	None	...	None	hypothetical protein	Hypo-rule applied	None	None	TTG	None	None	11	None
1	nmdc:wfmgan-11-pmh0a992.1_0000001	Prodigal v2.6.3_patched	CDS	260	1072	41.7	-	nmdc:wfmgan-11-pmh0a992.1_0000001_260_1072	None	...	PF01636	hypothetical protein	Hypo-rule applied	None	None	ATG	56112	None	11	None
2	nmdc:wfmgan-11-pmh0a992.1_0000001	Prodigal v2.6.3_patched	CDS	1507	2487	16.8	+	nmdc:wfmgan-11-pmh0a992.1_0000001_1507_2487	None	...	PF00665,PF09039	putative transposase	KO:K07497	None	None	TTG	46689,53098	None	11	None
3	nmdc:wfmgan-11-pmh0a992.1_0000001	Prodigal v2.6.3_patched	CDS	2682	3215	11.2	+	nmdc:wfmgan-11-pmh0a992.1_0000001_2682_3215	None	...	PF20020	hypothetical protein	Hypo-rule applied	None	None	ATG	None	None	11	None
4	nmdc:wfmgan-11-pmh0a992.1_0000001	Prodigal v2.6.3_patched	CDS	3149	3775	36.0	-	nmdc:wfmgan-11-pmh0a992.1_0000001_3149_3775	None	...	PF00578	peroxiredoxin	COG1225	None	None	GTG	52833	None	11	None
5	nmdc:wfmgan-11-pmh0a992.1_0000001	GeneMark.hmm-2 v1.25_lic	CDS	3772	4971	53.53	-	nmdc:wfmgan-11-pmh0a992.1_0000001_3772_4971	None	...	None	hypothetical protein	Hypo-rule applied	None	None	ATG	None	None	11	None
6	nmdc:wfmgan-11-pmh0a992.1_0000001	Prodigal v2.6.3_patched	CDS	5079	5828	18.9	-	nmdc:wfmgan-11-pmh0a992.1_0000001_5079_5828	None	...	None	hypothetical protein	Hypo-rule applied	None	None	ATG	None	None	11	None

	annotation	ratio	pvalue	adj_pvalue
0	PF00216	66/758	3.854151e-50	6.048053e-49
1	PF13620	275/5737	2.659059e-137	2.260200e-135
2	PF00712	48/273	8.900774e-52	1.441078e-50
3	PF02767	30/258	2.058849e-27	1.409413e-26
4	PF00006	162/1302	1.765648e-145	2.001068e-143
...	...	...	...	...
1015	PF00421	1/2	1.337959e-02	2.745911e-02
1016	PF03950	1/172	6.860287e-01	8.948201e-01
1017	PF09176	2/12	2.843385e-03	6.444842e-03
1018	PF01197	1/218	7.696804e-01	9.622527e-01
1019	PF13699	1/77	4.046462e-01	5.947250e-01

	annotation	ratio	pvalue	adj_pvalue
0	KO:K03530	64/413	8.767441e-65	2.622926e-63
1	KO:K01915	143/1313	2.708959e-120	2.431291e-118
2	KO:K02338	55/517	9.110181e-47	1.486616e-45
3	KO:K02112	74/700	3.946822e-62	1.012078e-60
4	KO:K18118	32/91	7.047339e-46	9.730749e-45
5	KO:K02014	59/3099	4.490584e-12	1.761880e-11
6	KO:K02986	60/293	1.465175e-68	5.009504e-67

	annotation	ratio	pvalue	adj_pvalue
0	47729	66/807	2.089399e-48	3.004217e-47
1	49452	198/4255	5.236634e-97	1.857260e-95
2	56935	758/18787	0.000000e+00	0.000000e+00
3	55979	52/588	3.312849e-40	3.671741e-39
4	52540	1748/78933	0.000000e+00	0.000000e+00
...	...	...	...	...
527	54111	1/63	3.457773e-01	5.802949e-01
528	161077	1/25	1.549627e-01	2.832996e-01
529	103456	1/26	1.606350e-01	2.916649e-01
530	109775	1/18	1.141694e-01	2.203072e-01
531	57038	1/5	3.311408e-02	7.190487e-02