o
    Rŀg                     @   s"  d Z ddlZddlZddlmZ ddlmZ ddlmZ G dd deZG dd	 d	Z	G d
d dZ
G dd deZdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zd d! Zd"d# Zd$d% Zd&d' Zd(d) Zd*d+ Zd,d- Zd.d/ Zed0krdd1lmZ edd2 dS dS )3aN  Code to work with the sprotXX.dat file from SwissProt.

https://web.expasy.org/docs/userman.html

Classes:
 - Record             Holds SwissProt data.
 - Reference          Holds reference data from a SwissProt record.

Functions:
 - read               Read one SwissProt record
 - parse              Read multiple SwissProt records

    N)Position)
SeqFeature)SimpleLocationc                       s&   e Zd ZdZdd fdd
Z  ZS )SwissProtParserErrorz1An error occurred while parsing a SwissProt file.Nlinec                   s   t  j|  || _dS )z=Create a SwissProtParserError object with the offending line.N)super__init__r   )selfr   args	__class__ J/var/www/html/myenv/lib/python3.10/site-packages/Bio/SwissProt/__init__.pyr	      s   
zSwissProtParserError.__init__)__name__
__module____qualname____doc__r	   __classcell__r   r   r   r   r      s    r   c                   @      e Zd ZdZdd ZdS )Recorda	  Holds information from a SwissProt record.

    Attributes:
     - entry_name        Name of this entry, e.g. RL1_ECOLI.
     - data_class        Either 'STANDARD' or 'PRELIMINARY'.
     - molecule_type     Type of molecule, 'PRT',
     - sequence_length   Number of residues.
     - accessions        List of the accession numbers, e.g. ['P00321']
     - created           A tuple of (date, release).
     - sequence_update   A tuple of (date, release).
     - annotation_update A tuple of (date, release).
     - description       Free-format description.
     - gene_name         A list of dictionaries with keys 'Name', 'Synonyms',
                         'OrderedLocusNames' and 'ORFNames'.
     - organism          The source of the sequence.
     - organelle         The origin of the sequence.
     - organism_classification  The taxonomy classification.  List of strings.
       (http://www.ncbi.nlm.nih.gov/Taxonomy/)
     - taxonomy_id       A list of NCBI taxonomy id's.
     - host_organism     A list of names of the hosts of a virus, if any.
     - host_taxonomy_id  A list of NCBI taxonomy id's of the hosts, if any.
     - references        List of Reference objects.
     - comments          List of strings.
     - cross_references  List of tuples (db, id1[, id2][, id3]).  See the docs.
     - keywords          List of the keywords.
     - features          List of tuples (key name, from, to, description).
       from and to can be either integers for the residue
       numbers, '<', '>', or '?'
     - protein_existence Numerical value describing the evidence for the existence of the protein.
     - seqinfo           tuple of (length, molecular weight, CRC32 value)
     - sequence          The sequence.

    Examples
    --------
    >>> from Bio import SwissProt
    >>> example_filename = "SwissProt/P68308.txt"
    >>> with open(example_filename) as handle:
    ...     records = SwissProt.parse(handle)
    ...     for record in records:
    ...         print(record.entry_name)
    ...         print(record.accessions)
    ...         print(record.keywords)
    ...         print(record.organism)
    ...         print(record.sequence[:20] + "...")
    ...
    NU3M_BALPH
    ['P68308', 'P24973']
    ['Electron transport', 'Membrane', 'Mitochondrion', 'Mitochondrion inner membrane', 'NAD', 'Respiratory chain', 'Translocase', 'Transmembrane', 'Transmembrane helix', 'Transport', 'Ubiquinone']
    Balaenoptera physalus (Fin whale) (Balaena physalus).
    MNLLLTLLTNTTLALLLVFI...

    c                 C   s   d| _ d| _d| _d| _g | _d| _d| _d| _g | _g | _	g | _
d| _g | _g | _g | _g | _g | _g | _g | _g | _g | _d| _d| _d| _dS )Initialize the class.N )
entry_name
data_classmolecule_typesequence_length
accessionscreatedsequence_updateannotation_updatedescription	gene_nameorganism	organelleorganism_classificationtaxonomy_idhost_organismhost_taxonomy_id
referencescommentscross_referenceskeywordsfeaturesprotein_existenceseqinfosequencer
   r   r   r   r	   Z   s0   
zRecord.__init__Nr   r   r   r   r	   r   r   r   r   r   $   s    5r   c                   @   r   )	Referencea  Holds information from one reference in a SwissProt entry.

    Attributes:
     - number      Number of reference in an entry.
     - evidence    Evidence code.  List of strings.
     - positions   Describes extent of work.  List of strings.
     - comments    Comments.  List of (token, text).
     - references  References.  List of (dbname, identifier).
     - authors     The authors of the work.
     - title       Title of the work.
     - location    A citation for the work.

    c                 C   s.   d| _ g | _g | _g | _g | _g | _g | _dS )r   N)number	positionsr*   r)   authorstitlelocationr1   r   r   r   r	      s   
zReference.__init__Nr2   r   r   r   r   r3   y   s    r3   c                   @   s   e Zd ZdZdS )FeatureTablea!  Stores feature annotations for specific regions of the sequence.

    This is a subclass of SeqFeature, defined in Bio.SeqFeature, where the
    attributes are used as follows:

     - ``location``: location of the feature on the canonical or isoform
       sequence; the location is stored as an instance of SimpleLocation,
       defined in Bio.SeqFeature, with the ref attribute set to the isoform
       ID referring to the canonical or isoform sequence on which the feature
       is defined
     - ``id``: unique and stable identifier (FTId), only provided for features
       belonging to the types CARBOHYD, CHAIN, PEPTIDE, PROPEP, VARIANT, or
       VAR_SEQ
     - ``type``: indicates the type of feature, as defined by the UniProt
       Knowledgebase documentation:

        - ACT_SITE: amino acid(s) involved in the activity of an enzyme
        - BINDING:  binding site for any chemical group
        - CARBOHYD: glycosylation site; an FTId identifier to the GlyConnect
          database is provided if annotated there
        - CA_BIND:  calcium-binding region
        - CHAIN:    polypeptide chain in the mature protein
        - COILED:   coiled-coil region
        - COMPBIAS: compositionally biased region
        - CONFLICT: different sources report differing sequences
        - CROSSLNK: posttransationally formed amino acid bond
        - DISULFID: disulfide bond
        - DNA_BIND: DNA-binding region
        - DOMAIN:   domain, defined as a specific combination of secondary
          structures organized into a characteristic three-dimensional
          structure or fold
        - INIT_MET: initiator methionine
        - INTRAMEM: region located in a membrane without crossing it
        - HELIX:    alpha-, 3(10)-, or pi-helix secondary structure
        - LIPID:    covalent binding of a lipid moiety
        - METAL:    binding site for a metal ion
        - MOD_RES:  posttranslational modification (PTM) of a residue,
          annotated by the controlled vocabulary defined by the ptmlist.txt
          document on the UniProt website
        - MOTIF:    short sequence motif of biological interest
        - MUTAGEN:  site experimentally altered by mutagenesis
        - NON_CONS: non-consecutive residues
        - NON_STD:  non-standard amino acid
        - NON_TER:  the residue at an extremity of the sequence is not the
          terminal residue
        - NP_BIND:  nucleotide phosphate-binding region
        - PEPTIDE:  released active mature polypeptide
        - PROPEP:   any processed propeptide
        - REGION:   region of interest in the sequence
        - REPEAT:   internal sequence repetition
        - SIGNAL:   signal sequence (prepeptide)
        - SITE:     amino-acid site of interest not represented by another
          feature key
        - STRAND:   beta-strand secondary structure; either a hydrogen-bonded
          extended beta strand or a residue in an isolated beta-bridge
        - TOPO_DOM: topological domain
        - TRANSIT:  transit peptide (mitochondrion, chloroplast, thylakoid,
          cyanelle, peroxisome, etc.)
        - TRANSMEM: transmembrane region
        - TURN:     H-bonded turn (3-, 4-, or 5-turn)
        - UNSURE:   uncertainties in the sequence
        - VARIANT:  sequence variant; an FTId is provided for protein sequence
          variants of Hominidae (great apes and humans)
        - VAR_SEQ:  sequence variant produced by alternative splicing,
          alternative promoter usage, alternative initiation, or ribosomal
          frameshifting
        - ZN_FING:  zinc finger region

     - ``qualifiers``: a dictionary of additional information, which may include
       the feature evidence and free-text notes. While SwissProt includes the
       feature identifier code (FTId) as a qualifier, it is stored as the
       attribute ID of the FeatureTable object.

    N)r   r   r   r   r   r   r   r   r9      s    r9   c                 c   sP    t | }z	 t|}|sW || ur|  dS dS |V  q|| ur'|  w w )zRead multiple SwissProt records from file.

    Argument source is a file-like object or a path to a file.

    Returns a generator object which yields Bio.SwissProt.Record() objects.
    TN)_open_readclosesourcehandlerecordr   r   r   parse   s   
rA   c                 C   sp   t | }z)t|}|stdz	t| W td ty-   | Y W || ur,|  S S w || ur7|  w w )zRead one SwissProt record from file.

    Argument source is a file-like object or a path to a file.

    Returns a Record() object.
    zNo SwissProt record foundz$More than one SwissProt record found)r:   r;   
ValueErrornextStopIterationr<   r=   r   r   r   read   s"   


rE   c                 C   sJ   zt | }|W S  ty$   | }|ddkr| Y S tj|dd Y S w )Nr   r   ASCII)encoding)open	TypeErrorrE   ioTextIOWrapper)r>   r?   r   r   r   r:     s   r:   c                 C   s`  d }d}zt | }W n ty   | Y S w |d d |dd   }}|dkr/td|dt }t|| g }| D ]j}|d d |dd   }}|rW|d | }d}|dkrj|d	d
}|j| q;|dkrtt	|| q;|dkr|j
|  q;|dkr|dkr|jd q;t|jdkr|jd |jd  |d 7  < q;|dkr|j| q;|dkr| j|dd  7  _q;|dkr|dd
}|j| q;|dkrt|| q;|dkrt|| q;|dkrt }	t|	| |j|	 q;|dkr|jsJ d|jd j| q;|dkr-|js"J d|jd }	t|	|}q;|dkrE|js:J d|jd }	t|	| q;|dkr^|jsRJ d|jd }	|	j| q;|d krw|jskJ d!|jd }	|	j| q;|d"kr|jsJ d#|jd }	|	j| q;|d$kr|jsJ d%|jd }	|	j| q;|d&krt|| q;|d'krt|| q;|d(krt|| q;|d)krt || q;|d*krt!|| q;|d+kr| }t|d,ksJ d-| t"|d. t"|d/ |d f|_#q;|d0kr||$dd  q;|d1krt%| d&|j
|_
d&|j|_|j |_|jD ]S}	d&|	jd	|	_|	jr|	jd }
|	jd.d  D ]}|
'd2sa|
d7 }
|
|7 }
qU|
d	}
|
(d3r~|
'd3r~|
d.d }
nd}
|
|	_d&|	j|	_q9d&||_)|  S |d4krq;td5|d6|d|rt*d7d S )8Nr         IDzFailed to find ID in first liner    AC;; DTDEGNandr   OSOGOC;.OXOHRNRPzRP: missing RNRCzRC: missing RNRXzRX: missing RNRLzRL: missing RNRAzRA: missing RNRGzRG: missing RNRTzRT: missing RNCCDRPEKWFTSQ   zI don't understand SQ line       z  z//-"z**zUnknown keyword z foundzUnexpected end of stream.)+rC   rD   rstripr   r   _read_idsplitr   extend_read_dtr!   appendstripr"   lenr#   r$   r%   _read_ox_read_ohr3   _read_rnr)   r5   _read_rc_read_rxr8   r6   r7   _read_cc_read_dr_read_pe_read_kw_read_ftintr/   replace_read_gnjoinendswith
startswithr0   rB   )r?   r@   unreadr   keyvalue_sequence_linesr   cols	referencer7   fragmentr   r   r   r;     s   





















"





r;   c                 C   s   t | jD ]8\}}|dd}i }|D ]"}| dd\}}|dkr*||d< q|dv s0J |d||< q|| j|< qd S )NrR   =rm   Name)SynonymsOrderedLocusNamesORFNames, )	enumerater"   rq   rs   rw   )r@   itexttokensr"   tokenr   r   r   r   r   r     s   
r   c                 C   s   |dd    }t|dkr+|d | _|d d| _|d d| _t|d | _n$t|dkrI|d | _|d d| _d | _t|d | _ntd|d	d
}| j|vrbd| j}t||d	| jdvrsd| j}t||d	d S )NrM   r   rm   rQ   rL   rn      zID line has unrecognised formatr   )STANDARDPRELIMINARYIPIReviewed
UnreviewedzUnrecognized data class )NPRTzUnrecognized molecule type )	rs   rx   r   rq   r   r   r   r   r   )r@   r   r   allowedmessager   r   r   rr     s(   



rr   c                 C   s  |dd  }|  }|  }d|v sd|v sd|v r| }d}tt|D ]
}d|| v r2|}q(|dks>J d| |d	 }|| d
}	|	dkrPd}
nd|	v rW|	}
nt|	}
|d }d|v rj||
f| _d S d|v ru||
f| _d S d|v r||
f| _d S t	d|dd|v sd|v sd|v rzd}
|d dD ]
}|
 rt|}
qW n ty   d}
Y nw |d d
}d|v r||
f| _d S d|v r||
f| _d S d|v r||
f| _d S t	d|dt	d|d)NrM   CREATEDzLAST SEQUENCE UPDATEzLAST ANNOTATION UPDATErW   zREL.r   z Could not find Rel. in DT line: rm   ,r   .zUnrecognised DT (DaTe) liner   zINTEGRATED INTOzSEQUENCE VERSIONzENTRY VERSION
INTEGRATEDzFailed to parse DT (DaTe) line)upperrq   rs   rangerx   r   r   r   r    r   isdigitrB   )r@   r   r   uprliner   uprcols	rel_indexindexversion_indexstr_versionversiondatesr   r   r   ru     sd   ru   c                 C   sx   | dd }| jr|dd   d}n|dd   d d\}}|dks1J d| | j| d d S )	N{r   rM   rQ   r   
NCBI_TaxIDzUnexpected taxonomy type r   )rs   r&   rq   rt   )r@   r   idsdescrr   r   r   ry   3  s    ry   c                 C   s   |dd   dsJ d| |dd   }|d dkr%|ddks)J ||d d d\}}| j|  | j|  d S )	NrM   zNCBI_TaxID=zUnexpected    rW   r   rQ   rm   )r   rq   countrs   r(   rv   rw   r'   )r@   r   taxidnamer   r   r   rz   K  s    "rz   c                 C   s   | d d}|d }|dr|dsJ d| t|dd | _t|dkrK|d }|dr8|ds?J d	| |dd  d
| _d S d S )Nrm   r   []zMissing brackets rW   r   }zMissing braces |)rs   r   r   r   r4   rx   evidence)r   rnwordsr4   r   r   r   r   r{   U  s   "r{   c           	      C   s   | d}|d dkrd}n|d d |d }}|D ]=}|s" d S |d}|dkrG|d | ||d d  }}| |f}| j| q| jd }| d| }|| jd< q|S )NrQ   rW   r   r   r   rm   rO   )rs   findlstripr*   rv   )	r   r   r   r   colr   r   r   commentr   r   r   r|   f  s    


r|   c                 C   s  | dd}d}d|v rT|d}dd |D }dd |D }|D ]1}|d}t|d	ks2|d
kr6d} nt|d	ksCJ d| | j|d |d df q!n |d}t|d	krbd}n| j|d d|d df |rdd l}ddlm} |	d|| d S d S )Nz [NCBI, ExPASy, Israel, Japan]r   Fr   rR   c                 S   s   g | ]}|  qS r   )rw   .0xr   r   r   
<listcomp>      z_read_rx.<locals>.<listcomp>c                 S   s   g | ]}|r|qS r   r   r   r   r   r   r     r   rL   )DOIr   TzI don't understand RX line r   rm   rQ   r   )BiopythonParserWarningzPossibly corrupt RX line )
r   rs   rx   r)   rv   rq   warningsBior   warn)r   r   r   r   r   r   r   r   r   r   r   r}   |  s.   		

 
$r}   c                 C   sr   |dd |dd    }}|dkr| j| d S |dkr7| js*| j| d S | jd  d| 7  < d S d S )NrM      	   z-!-z   rW   rO   )rq   r*   rv   )r@   r   r   r   r   r   r   r~     s   r~   c                 C   s$   | dd}| jt| d S )Nr   rR   )rq   rs   r+   rv   tuple)r@   r   r   r   r   r   r     s   r   c                 C   s   | d}t|d | _d S )N:r   )rs   r   r.   )r@   r   per   r   r   r     s   
r   c                 C   sD   | ddD ]}|dr|ddd }| j|  qd S )Nr[   rR   r   r   rm   r   )rq   rs   r   rsplitr,   rv   rw   )r@   r   valr   r   r   r     s
   
r   c                 C   s  |dd   }|r|dd dkrI|dd   }z	|d\}}W n ty.   d }Y nw z	|d\}}W n tyE   |}d}Y nw i }n|d	d
  }|dd  }d }|dd   }d|i}t|d}|dkrv|d }nt|}t|||d}t||d |d}	| j	|	 d S | jd }	|dd dkr|dd   }|
dr|dd   d|	_d S |	jd }
|
dr|
 | }n|
 d| }|	jdv rz	|d\}}W n	 ty   Y n-w d}|d}|dkr||d  }|d | }|dd}|dd}|d | | }||	jd< d S |dd    }td|}|r|d}|t|dd  }|
dsHtd|d kr`|dsWtd!|dd |	_d S |drm|dd }n|dd  }||	jv rtd"|d#||	j|< d S t|	j }|d }| d}|	j| }
|d$ks|
dr|
 | }n|
 d| }|	jd%krz	|d\}}W n
 ty   Y n-w d}|d}|dkr||d  }|d | }|dd}|dd}|d | | }||	j|< d S )&NrM         z        P   r   z..r            "   K   r!   rW   rm   )ref)r8   typeid
qualifiersz                             z/FTId=   r   ro   rO   )VARSPLICVAR_SEQz -> z (z^/([a-z_]+)=r   rp   z!Missing starting quote in featurer   zMissing closing quote for idzFeature qualifier z already exists for featurer   r   )rq   rs   rB   r   r   
fromstringr   r9   r-   rv   r   r   r   r   r   r   r   rematchgrouprx   listkeys)r@   r   r   r8   
isoform_idfrom_resto_resr   r!   featureold_description	first_seq
second_seq
extra_infoextra_info_posr   r   qualifier_typer   r   r   r   r   r     s   















r   __main__)run_doctest)verbose) r   rJ   r   Bio.SeqFeaturer   r   r   rB   r   r   r3   r9   rA   rE   r:   r;   r   rr   ru   ry   rz   r{   r|   r}   r~   r   r   r   r   r   
Bio._utilsr   r   r   r   r   <module>   s@   	UM  d
.|