o
    Rŀgd                     @   sd   d Z ddlmZ ddlmZ ddlmZ ddlmZ ddl	m
Z
 dZdZdddZG dd dZd	S )zBio.SeqIO support for the "uniprot-xml" file format.

See Also:
http://www.uniprot.org

The UniProt XML format essentially replaces the old plain text file format
originally introduced by SwissProt ("swiss" format in Bio.SeqIO).

    )ElementTree)errors)
SeqFeature)Seq)	SeqRecordz{http://uniprot.org/uniprot}z4%(name)s %(volume)s:%(first)s-%(last)s(%(pub_date)s)NFc              
   c   s    |dur	t dzItj| ddD ]>\}}|dkr7|d ds7td|d  d	ks7t d
t d|d  d	|dkrO|jtd krOt||d V  |  qW dS  tj	yu } zt
j|j t
jkrp|jdkskJ t dd d}~ww )a  Iterate over UniProt XML as SeqRecord objects.

    parses an XML entry at a time from any UniProt XML file
    returns a SeqRecord for each iteration

    This generator can be used in Bio.SeqIO

    Argument source is a file-like object or a path to a file.

    Optional argument alphabet should not be used anymore.

    return_raw_comments = True --> comment fields are returned as complete XML to allow further processing
    skip_parsing_errors = True --> if parsing errors are found, skip to next entry
    N,The alphabet argument is no longer supported)startstart-nsend)eventsr	      zhttp://www.w3.org/{}z;SeqIO format 'uniprot-xml' only parses xml with namespace: z but xml has namespace: {r
   entry)return_raw_comments)r   r   zEmpty file.)
ValueErrorr   	iterparse
startswithNStagParserparseclear
ParseErrorr   messagescodeXML_ERROR_NO_ELEMENTSposition)sourcealphabetr   eventelem	exception r#   G/var/www/html/myenv/lib/python3.10/site-packages/Bio/SeqIO/UniprotIO.pyUniprotIterator   s4   
r%   c                   @   s"   e Zd ZdZdddZdd ZdS )	r   zParse a UniProt XML entry to a SeqRecord.

    Optional argument alphabet is no longer used.

    return_raw_comments=True to get back the complete comment field in XML format
    NFc                 C   s    |durt d|| _|| _dS )zInitialize the class.Nr   )r   r   r   )selfr!   r   r   r#   r#   r$   __init__L   s   
zParser.__init__c                    s  j jtd ks
J fddfdd}fdd}fdd	}fd
d}fdd}fdd}fdd}fdd}fdd}	fdd}
fdd}d;dd  fdd}fdd }fd!d"}fd#d$}td%d&d'_j jd(d)_j j D ]\}}|d*v rt	|jj
|< q|jj
|< qj D ]}|jtd+ kr|| q|jtd, kr|| q|jtd- kr|| q|jtd. kr|| q|jtd/ kr|| q|jtd0 kr|| q|jtd1 kr|| q|jtd2 kr
|| q|jtd3 kr|	| q|jtd4 kr$|
| q|jtd5 kr1|| q|jtd6 kr>|| q|jtd7 krK|| q|jtd8 krX|| q|jtd9 kre|| q	 qttjjj_jjsjj
d: d j_jS )<zParse the input.r   c                    sB   |  j jvrg  j j| < | j j|  vr j j|  | d S d S N)ParsedSeqRecordannotationsappend)keyvaluer&   r#   r$   append_to_annotationsW   s
   z+Parser.parse.<locals>.append_to_annotationsc                    s(   | j  j_ jj jd | j   d S )N:)textr)   namedbxrefsr+   dbnameelementr.   r#   r$   _parse_name]   s   
z!Parser.parse.<locals>._parse_namec                    s*    d| j  jjjd | j   d S )N
accessionsr0   )r1   r)   r3   r+   r4   r5   r/   r&   r#   r$   _parse_accessiona   s   z&Parser.parse.<locals>._parse_accessionc                    s   d}| D ]L}|j td td td fv r@|D ](}d|j td|j tdf } ||j |j td kr>|s>|jj_d}qq|j td	 krHq|j td
 krP	 qdS )zParse protein names (PRIVATE).FrecommendedNamesubmittedNamealternativeNamez%s_%s fullNameT	componentdomainN)r   r   replacer1   r)   description)r6   	descr_setprotein_elementrec_nameann_keyr9   r#   r$   _parse_proteing   s0   
	z$Parser.parse.<locals>._parse_proteinc                    s^   | D ]*}d|j v r,d|jtd|j d f }|j d dkr&|jjj|< q ||j qd S )Ntypez
gene_%s_%sr>   primary)attribr   rB   r   r1   r)   r*   )r6   genename_elementrG   r9   r#   r$   _parse_gene   s   
z!Parser.parse.<locals>._parse_genec                        d| j d  d S )NgeneLocationrI   rK   r5   r/   r#   r$   _parse_geneLocation      z)Parser.parse.<locals>._parse_geneLocationc                    s  d } }}| D ][}|j td kr1|jr0|jd dkr|j}q|jd dkr*|j}q d|j q|j td krJjj|jd d |jd	   q|j td
 krc|D ]}|j td krb d|j qSq|rq|rq| d| d}n	|rv|}n|rz|}|jjd< d S )Nr>   r2   rI   
scientificcommonorganism_namedbReferencer0   idlineagetaxontaxonomyz ()organism)r   r   r1   rK   r)   r3   r+   r*   )r6   rV   com_namesci_nameorganism_elementtaxon_elementr9   r#   r$   _parse_organism   s@   z%Parser.parse.<locals>._parse_organismc                    s(   | D ]}|j td kr d|j qd S )Nr2   organism_host)r   r   r1   )r6   r`   rQ   r#   r$   _parse_organismHost   s
   z)Parser.parse.<locals>._parse_organismHostc                    s    d| j  d S )Nkeywords)r1   r5   rQ   r#   r$   _parse_keyword   s   z$Parser.parse.<locals>._parse_keywordc              
      s\  g d}| j d |v r,d| j d dd }| td D ]}|jr) ||j qne| j d dkr_| td D ]"}|D ]}|jr[d	| j d dd|jtdf } ||j q>q:n2| j d d
kr| td D ]}d| j d  d} ||j d  qmn| j d dkr| td D ]}d| j d dd }|td D ]} ||j qqn| j d dkrNd| j d dd }d }	}
| td D ]I}t|td }z2|rt|d j d }
|
d }	n tt|td j d }	|	d8 }	tt|td j d }
W q t	t
fy   Y qw | j d }| j d }|	|
  kr/dkr=n n |d| d|  nT ||	 d|
 d| d|  nC| j d dkrWn:| j d d kr| td! D ]*}d| j d dd }|td! D ]} || j d"  d#|j d$   q{qfjrd| j d dd d%} |t|  d&S d&S )'ac  Parse comments (PRIVATE).

            Comment fields are very heterogeneus. each type has his own (frequently mutated) schema.
            To store all the contained data, more complex data structures are needed, such as
            annotated dictionaries. This is left to end user, by optionally setting:

            return_raw_comments=True

            The original XML is returned in the annotation fields.

            Available comment types at december 2009:
             - "allergen"
             - "alternative products"
             - "biotechnology"
             - "biophysicochemical properties"
             - "catalytic activity"
             - "caution"
             - "cofactor"
             - "developmental stage"
             - "disease"
             - "domain"
             - "disruption phenotype"
             - "enzyme regulation"
             - "function"
             - "induction"
             - "miscellaneous"
             - "pathway"
             - "pharmaceutical"
             - "polymorphism"
             - "PTM"
             - "RNA editing"
             - "similarity"
             - "subcellular location"
             - "sequence caution"
             - "subunit"
             - "tissue specificity"
             - "toxic dose"
             - "online information"
             - "mass spectrometry"
             - "interaction"

            )allergenbiotechnologyzbiophysicochemical propertieszcatalytic activitycautioncofactorzdevelopmental stagediseaserA   zdisruption phenotypezenzyme regulationfunction	inductionmiscellaneouspathwaypharmaceuticalpolymorphismPTMzRNA editing
similaritysubunitztissue specificityz
toxic doserI   comment_ r>   r1   zsubcellular locationsubcellularLocationzcomment_%s_%sinteractioninteractant	_intactIdintactIdzalternative productsisoformzcomment_%s_isoformrX   zmass spectrometryr   locationr   r   beginr
   massmethodz
undefined:|z..r0   zsequence cautionzonline informationlinkr2   @uri_xmlN)rK   rB   iterr   r1   r   listintnextr   KeyErrorr   r   tostring)r6   simple_commentsrG   text_elementsubloc_elementelinteract_elementalt_element
id_elementr   r
   pos_elsr   r   link_elementr9   r#   r$   _parse_comment   s   +


"z$Parser.parse.<locals>._parse_commentc                    sl   j j| jd d | jd   d| jv r| jd dkrd}d}| D ]}|jtd kr|jd }|dkr:|jd }|d	krC|jd }|d
kr|jd d}|D ]T}| d}|d dkrt }| jd |_	| jd |j
d< ||j
d< ||j
d	< |d d|j
d
< t|d dd d }	t|d dd }
t|	|
|_qQq#| D ]
}|jtd kr	 qd S )NrI   r0   rX   PDBr>   propertyr   r-   
resolutionchains,=r   -r2   r   /)r)   r3   r+   rK   r   r   splitstripr   rI   
qualifiersr   SimpleLocationr}   )r6   r   r   ref_elementdat_typepairsr!   pairfeaturer   r
   r.   r#   r$   _parse_dbReference=  sP   
	






z(Parser.parse.<locals>._parse_dbReferencec                    sJ  t  }g }g }g }d}d}d}| D ]}|jtd kr|jd }|dkr-|d|jd  7 }d|jv r7|jd }|jdd}|jd	d}	|jd
d}
|jdd}|D ]V}|jtd krc|j|_qU|jtd krx|D ]
}||jd  qlqU|jtd krj	j
|jd d |jd   |jd dkr|jd |_qU|jd dkr|jd |_qUq|jtd kr||j q|jtd kr|D ]}|jtd kr||j qq|rdd| }nd}|rdd| }nd}g |_d||_|r|r|	r|
r|rt||	|
||d |_n||_d||||f|_ d| d S )Nr>   citationrI   
submissionz to the dbr2   datevolumefirstlasttitle
authorListrW   r0   rX   PubMedMEDLINEscoper   tissuezScope: z, zTissue: )r2   r   r   r   pub_datez | 
references)r   	Referencer   r   rK   getr1   r   r+   r)   r3   	pubmed_id
medline_idjoinr}   authorsREFERENCE_JOURNALjournalcomment)r6   	referencer   scopestissuesjournal_namepub_typer   r   j_volumej_firstj_lastcit_elementperson_elementsource_element
scopes_strtissues_strr9   r#   r$   _parse_referencem  s   




z&Parser.parse.<locals>._parse_referencer   c                 S   s   zt | jd | }W n ty   d }Y nw | jdd}|dkr,|d u s(J t S |s3t|S |dkr<t|S |dkrEt|S |dkrNt	|S t
d|)	Nr   statusr>   unknownzgreater thanz	less than	uncertainzPosition status )r   rK   r   r   r   UnknownPositionExactPositionAfterPositionBeforePositionUncertainPositionNotImplementedError)r6   offsetr   r   r#   r#   r$   _parse_position  s$   



z%Parser.parse.<locals>._parse_positionc              	      s  t   }| j D ]	\}}||j|< q	| jdd|_d| jv r&| jd |_| D ]Y}|jtd krk|	td }|rH|d }  | d} | }n|	td d }  | d}|	td	 d }  | }t 
|||_q(z|j|j|jtd< W q( ty   Y q(w jj| d S )
NrI   r>   rX   r}   r   r   r~   r
   )r   rK   itemsr   r   rI   rX   r   r   findallr   r}   r1   rB   	Exceptionr)   featuresr+   )r6   r   kvfeature_elementposition_elementsstart_positionend_position)r   r&   r#   r$   _parse_feature  s6   



z$Parser.parse.<locals>._parse_featurec                    rN   )NproteinExistencerI   rP   r5   rQ   r#   r$   _parse_proteinExistence  rS   z-Parser.parse.<locals>._parse_proteinExistencec                    s&   | j  D ]\}}|} || qd S r(   )rK   r   )r6   r   r   rG   rQ   r#   r$   _parse_evidence  s   z%Parser.parse.<locals>._parse_evidencec                    sn   | j  D ]\}}|dv rt| jjd| < q| jjd| < qtd| j  j_	d jjd< d S )N)lengthr   version	sequence_r>   proteinmolecule_type)
rK   r   r   r)   r*   r   r   r1   r   seq)r6   r   r   r.   r#   r$   _parse_sequence  s   z%Parser.parse.<locals>._parse_sequenceNr>   )rX   datasetUnknownDatasetr   r2   	accessionr   generO   r]   organismHostkeywordr   rW   r   r   r   evidencesequencer8   )r   )r   r   r   r   r)   rK   r   r4   r   r   r*   sortedsetr3   rX   )r&   r7   r:   rH   rM   rR   rb   rd   rf   r   r   r   r   r   r   r   r   r   r6   r#   )r   r/   r&   r$   r   S   sz     0
G
















zParser.parseNF)__name__
__module____qualname____doc__r'   r   r#   r#   r#   r$   r   D   s    
r   r   )r   	xml.etreer   xml.parsers.expatr   Bior   Bio.Seqr   Bio.SeqRecordr   r   r   r%   r   r#   r#   r#   r$   <module>   s   

&