o
    Rŀgu                     @   s   d Z ddlmZ ddlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZ dd
lmZ G dd dejZG dd deZG dd deZdS )a  Bio.SeqIO support for the "seqxml" file format, SeqXML.

This module is for reading and writing SeqXML format files as
SeqRecord objects, and is expected to be used via the Bio.SeqIO API.

SeqXML is a lightweight XML format which is supposed be an alternative for
FASTA files. For more Information see http://www.seqXML.org and Schmitt et al
(2011), https://doi.org/10.1093/bib/bbr025
    )sax)handler)XMLGenerator)AttributesImpl)Seq)	SeqRecord   )SequenceIterator)SequenceWriterc                       s   e Zd ZdZ fddZdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zd d! Zd"d# Zd$d% Zd&d' Z  ZS )(ContentHandlerz5Handles XML events generated by the parser (PRIVATE).c                    s>   t    d| _d| _d| _d| _d| _d| _d| _g | _	dS )z&Create a handler to handle XML events.N)
super__init__sourcesourceVersionseqXMLversion	ncbiTaxIDspeciesNamestartElementNSdatarecordsself	__class__ F/var/www/html/myenv/lib/python3.10/site-packages/Bio/SeqIO/SeqXmlIO.pyr       s   

zContentHandler.__init__c                 C   s   | j | _dS )z2Set XML handlers when an XML declaration is found.N)startSeqXMLElementr   r   r   r   r   startDocument,   s   zContentHandler.startDocumentc                 C   sj  |dkrt d|durtdd}| D ]R\}}|\}}|du rR|dkr*|| _q|dkr2|| _q|dkr:|| _q|dkrFt|}	|| _q|d	krN|| _qt d
|dkra|dkr]|}qt dt d| d| jdu rrt d| jdvr{t dd| j d}
|dur||
krt d|| jf | jr| jdkrt d| jr| jdkrt d| j	| _
| j| _dS )z!Handle start of a seqXML element.)NseqXMLz*Failed to find the start of seqXML elementNz#Unexpected qname for seqXML elementr   r   r   r   r   z#Unexpected attribute for XML Schema)http://www.w3.org/2001/XMLSchema-instancenoNamespaceSchemaLocationz0Unexpected attribute for XML Schema in namespaceUnexpected namespace 'z' for seqXML attributezFailed to find seqXMLversion)0.10.20.30.4zUnsupported seqXMLversionzhttp://www.seqxml.org/z/seqxml.xsdzDXML Schema '%s' found not consistent with reported seqXML version %sr%   z@Attribute 'speciesName' on root is only supported in version 0.4z>Attribute 'ncbiTaxID' on root is only supported in version 0.4)
ValueErrorRuntimeErroritemsr   r   r   intr   r   endSeqXMLElementendElementNSstartEntryElementr   )r   nameqnameattrsschemakeyvalue	namespace	localnamenumberurlr   r   r   r   0   s`   


z!ContentHandler.startSeqXMLElementc                 C   sX   |\}}|durt d| d|durt d| d|dkr$t dd| _d| _dS )z!Handle end of the seqXML element.Nr!   z' for seqXML endUnexpected qname 'r   z$Failed to find end of seqXML element)r'   r   r+   r   r-   r.   r3   r4   r   r   r   r*   h   s   
zContentHandler.endSeqXMLElementc           	      C   s"  |dkrt d|durtdtddd}| jdur!| j|jd< | jdur,| j|jd< | j|jd< | D ]7\}}|\}}|du rf|d	krJ||_q6|dkr^| j	d
ksX| j	dkr^||jd< q6t d| dt d| d|jdu rwt d| j
| | j	dkr| j| _n| j| _| j| _dS )z>Set new entry with id and the optional entry source (PRIVATE).Nentryz.Expected to find the start of an entry elementN"Unexpected qname for entry element)idorganism
ncbi_taxidr   r<   r$   r%   zUnexpected attribute z in entry elementr!   z' for entry attributezFailed to find entry IDr"   )r&   r'   r   r   annotationsr   r   r(   r<   r   r   appendstartEntryFieldElementVersion01r   startEntryFieldElementendEntryElementr+   )	r   r-   r.   r/   recordr1   r2   r3   r4   r   r   r   r,   t   s>   






z ContentHandler.startEntryElementc                 C   sL   |dkrt d|durtd| jd jdu rt d| j| _| j| _dS )zHandle end of an entry element.r9   z,Expected to find the end of an entry elementNr;   z+Failed to find a sequence for entry element)r&   r'   r   seqr,   r   r*   r+   )r   r-   r.   r   r   r   rC      s   zContentHandler.endEntryElementc                 C      |\}}|durt d| d| d|dur"td| d| d|dkr+| |S |dkr4| |S |dv r=| |S |d	krF| |S |d
krO| |S t d| d)zCReceive a field of an entry element and forward it for version 0.1.Nr!   ' for  elementr7   speciesdescription)dnaSeqrnaSeqaaSeqalternativeIDpropertyUnexpected field 	 in entryr&   r'   startSpeciesElementstartDescriptionElementstartSequenceElementstartDBRefElementstartPropertyElementr   r-   r.   r/   r3   r4   r   r   r   rA      $   




z.ContentHandler.startEntryFieldElementVersion01c                 C   rG   )zFReceive a field of an entry element and forward it for versions >=0.2.Nr!   rH   rI   r7   rJ   rK   )DNAseqRNAseqAAseqDBRefrP   rQ   rR   rS   rY   r   r   r   rB      rZ   z%ContentHandler.startEntryFieldElementc           
      C   s   d}d}|  D ]-\}}|\}}|du r.|dkr|}q|dkr&t|}|}qtd| dtd| d|du r>td|du rFtd	| jd
 }	||	jd< ||	jd< | j| _dS )zParse the species information.Nr-   r   Unexpected attribute 'z' found in species tagr!   z' for species attributezFailed to find species namezFailed to find ncbiTaxIdrE   r=   r>   )r(   r)   r&   r   r?   endSpeciesElementr+   )
r   r/   r-   r   r1   r2   r3   r4   r5   rD   r   r   r   rT      s0   




z"ContentHandler.startSpeciesElementc                 C   sT   |\}}|durt d| d|durt d| d|dkr$t d| j| _dS )z Handle end of a species element.Nr!   z' for species endr7   rJ   z%Failed to find end of species elementr'   rC   r+   r8   r   r   r   r`      s   z ContentHandler.endSpeciesElementc                 C   :   |rt d| jdurtd| j dd| _| j| _dS )zParse the description.z2Unexpected attributes found in description elementNUnexpected data found: '' )r&   r   r'   endDescriptionElementr+   r   r/   r   r   r   rU         
z&ContentHandler.startDescriptionElementc                 C   st   |\}}|durt d| d|durt d| d|dkr$t d| jd }| j}|r1||_d| _| j| _dS )z(Handle the end of a description element.Nr!   z' for description endr7   rK   z)Failed to find end of description elementrE   )r'   r   r   rK   rC   r+   )r   r-   r.   r3   r4   rD   rK   r   r   r   rf     s   

z$ContentHandler.endDescriptionElementc                 C   rb   )z$Parse DNA, RNA, or protein sequence.z/Unexpected attributes found in sequence elementNrc   rd   re   )r&   r   r'   endSequenceElementr+   rg   r   r   r   rV     rh   z#ContentHandler.startSequenceElementc                 C   s   |\}}|durt d| d|durt d| d| jd }|dkr*| jdks3|dkr9| jdkr9d	|jd
< n8|dkrB| jdksK|dkrQ| jdkrQd|jd
< n |dkrZ| jdksc|dkri| jdkrid|jd
< nt d| dt| j|_d| _| j| _dS )z%Handle the end of a sequence element.Nr!   z' for sequence endr7   rE   r[   r"   rL   DNAmolecule_typer\   rM   RNAr]   rN   proteinz,Failed to find end of sequence (localname = ))	r'   r   r   r?   r   r   rF   rC   r+   )r   r-   r.   r3   r4   rD   r   r   r   ri     s*   

z!ContentHandler.endSequenceElementc                 C   s  d}d}d}|  D ]0\}}|\}}|du r3|dkr|}q
|dkr$|}q
|dkr+|}q
td| dtd| d|du rCtd	|du rKtd
|du r]| jdksY| jdkr]td| jdurktd| j dd| _| jd }	| d| }
|
|	jvr|	j|
 | j| _	dS )z!Parse a database cross reference.Ntyper   r<   r_   z' found for DBRef elementr!   z' for DBRef attributez'Failed to find source for DBRef elementz#Failed to find id for DBRef elementr#   r$   z%Failed to find type for DBRef elementrc   rd   re   rE   :)
r(   r&   r   r   r'   r   dbxrefsr@   endDBRefElementr+   )r   r/   TYPEr   IDr1   r2   r3   r4   rD   dbxrefr   r   r   rW   ;  sB   




z ContentHandler.startDBRefElementc                 C   s   |\}}|durt d| d|durt d| d|dkr%| jdks.|dkr6| jdkr6t d| d| jrBt d	| j d
d| _| j| _dS )z"Handle the end of a DBRef element.Nr!   z' for DBRef elementr7   r^   r"   rO   Unexpected localname 'z-Unexpected data received for DBRef element: 'rd   )r'   r   r   rC   r+   r8   r   r   r   rr   d  s   zContentHandler.endDBRefElementc           	      C   s   d}d}|  D ]&\}}|\}}|du r'|dkr|}q|dkr"|}qtd|td| d|du r7td| jd }|d	krO|j| |v sIJ ||j|< n||jvrYg |j|< |j| | | j| _dS )
z'Handle the start of a property element.Nr-   r2   z4Unexpected attribute '%s' found for property elementr!   z' for property attributez(Failed to find name for property elementrE   rk   )r(   r&   r   r?   r@   endPropertyElementr+   )	r   r/   property_nameproperty_valuer1   r2   r3   r4   rD   r   r   r   rX   v  s2   



z#ContentHandler.startPropertyElementc                 C   s\   |\}}|durt d| d|durt d| d|dkr(t d| d| j| _dS )z%Handle the end of a property element.Nr!   z' for property elementr7   rP   rv   ra   r8   r   r   r   rw     s   

z!ContentHandler.endPropertyElementc                 C   s    | j dur|  j |7  _ dS dS )zHandle character data.N)r   )r   r   r   r   r   
characters  s   
zContentHandler.characters)__name__
__module____qualname____doc__r   r   r   r*   r,   rC   rA   rB   rT   r`   rU   rf   rV   ri   rW   rr   rX   rw   rz   __classcell__r   r   r   r   r      s*    8&!		)#r   c                       s6   e Zd ZdZdZd
 fdd	Zdd Zdd	 Z  ZS )SeqXmlIteratoraN  Parser for seqXML files.

    Parses seqXML files and creates SeqRecords.
    Assumes valid seqXML please validate beforehand.
    It is assumed that all information for one record can be found within a
    record element or above. Two types of methods are called when the start
    tag of an element is reached. To receive only the attributes of an
    element before its end tag is reached implement _attr_TAGNAME.
    To get an element and its children as a DOM tree implement _elem_TAGNAME.
    Everything that is part of the DOM tree will not trigger any further
    method calls.
    i   Nc                    sB   t  | _t }| j| | jtjd t j	|ddd dS )z0Create the object and initialize the XML parser.TbSeqXML)modefmtN)
r   make_parserparserr   setContentHandler
setFeaturer   feature_namespacesr   r   )r   stream_or_pathr3   content_handlerr   r   r   r     s
   
zSeqXmlIterator.__init__c                 C   s   | j }| }| j}	 ||}|s|jdu rtdtd|| |j}|dur,nq|| _|j| _|j	| _	|j
| _
|j| _| |}|S )z9Start parsing the file, and return a SeqRecord generator.TNzEmpty file.zXML file contains no data.)r   getContentHandlerBLOCKreadr   r&   feedr   r   r   r   r   iterate)r   handler   r   r   textr   r   r   r   r   parse  s*   



zSeqXmlIterator.parsec                 c   st    | j }| }|j}| j}	 t|dkr|d}|V  ||}|s%n|| q|  |E dH  |	  dS )z)Iterate over the records in the XML file.Tr   r   N)
r   r   r   r   lenpopr   r   closeclear)r   r   r   r   r   r   rD   r   r   r   r   r     s"   



zSeqXmlIterator.iterate)N)	r{   r|   r}   r~   r   r   r   r   r   r   r   r   r   r     s    r   c                       sd   e Zd ZdZ	d fdd	Zdd Zdd Zd	d
 Zdd Zdd Z	dd Z
dd Zdd Z  ZS )SeqXmlWriterzWrites SeqRecords into seqXML file.

    SeqXML requires the SeqRecord annotations to specify the molecule_type;
    the molecule type is required to contain the term "DNA", "RNA", or
    "protein".
    Nc                    sF   t  |d | j}t|d| _| j  || _|| _|| _|| _	dS )a1  Create Object and start the xml generator.

        Arguments:
         - target - Output stream opened in binary mode, or a path to a file.
         - source - The source program/database of the file, for example
           UniProt.
         - source_version - The version or release number of the source
           program or database from which the data originated.
         - species - The scientific name of the species of origin of all
           entries in the file.
         - ncbiTaxId - The NCBI taxonomy identifier of the species of origin.

        wbzutf-8N)
r   r   r   r   xml_generatorr   r   source_versionrJ   	ncbiTaxId)r   targetr   r   rJ   r   r   r   r   r   r     s   

zSeqXmlWriter.__init__c                 C   s   dddd}| j dur| j |d< | jdur| j|d< | jdur.t| jts)td| j|d	< | jdurDt| jttfs?td
| j|d< | j	dt
| dS )z'Write root node with document metadata.r   z$http://www.seqxml.org/0.4/seqxml.xsdr%   )z	xmlns:xsizxsi:noNamespaceSchemaLocationr   Nr   r   z species should be of type stringr   )ncbiTaxID should be of type string or intr   r   )r   r   rJ   
isinstancestr	TypeErrorr   r)   r   startElementr   rg   r   r   r   write_header  s"   







zSeqXmlWriter.write_headerc                 C   s   |j r|j dkrtdt|j tstdd|j i}d|jv r;| j|jd kr;t|jd ts4td|jd |d< | jdt	| | 
| | | | | | | | | | jd dS )	zWrite one record.z<unknown id>zSeqXML requires identifierz#Identifier should be of type stringr<   r   zsource should be of type stringr:   N)r<   r&   r   r   r   r?   r   r   r   r   _write_species_write_description
_write_seq_write_dbxrefs_write_properties
endElement)r   rD   attrbr   r   r   write_record5  s"   






zSeqXmlWriter.write_recordc                 C   s   | j d | j   dS )z0Close the root node and finish the XML document.r   N)r   r   endDocumentr   r   r   r   write_footerO  s   zSeqXmlWriter.write_footerc                 C   s   d}d|j v r+|j d }t|tr+t|dkr|d }nt|dkr%d}ntd| d|j v rm|ro|j d }t|ts@tdt|ttfsKtd|| jksU|| j	krq|t|d	}| j
d
t| | j
d
 dS dS dS dS )z%Write the species if given (PRIVATE).Nr>   r   r   z9Multiple entries for record.annotations['ncbi_taxid'], %rr=   z!organism should be of type stringr   )r-   r   rJ   )r?   r   listr   r&   r   r   r)   rJ   r   r   r   r   r   )r   rD   local_ncbi_taxid	local_orgattrr   r   r   r   T  s2   






zSeqXmlWriter._write_speciesc                 C   sp   |j r4t|j tstd|j }|dkrd}t|j dkr6| jdti  | j| | j	d dS dS dS )z)Write the description if given (PRIVATE).z$Description should be of type stringz<unknown description>re   r   rK   N)
rK   r   r   r   r   r   r   r   rz   r   )r   rD   rK   r   r   r   r   s  s   zSeqXmlWriter._write_descriptionc                 C   s   t |j}t|dkstd|jd}|du rtdd|v r$d}nd|v r+d	}nd
|v r2d}ntd| d| j|ti  | j	| | j
| dS )zWrite the sequence (PRIVATE).

        Note that SeqXML requires the molecule type to contain the term
        "DNA", "RNA", or "protein".
        r   z,The sequence length should be greater than 0rk   Nzmolecule_type is not definedrj   r[   rl   r\   rm   r]   zunknown molecule_type 'rd   )bytesrF   r   r&   r?   getr   r   r   rz   r   )r   rD   rF   rk   seqElemr   r   r   r     s    
zSeqXmlWriter._write_seqc                 C   s~   |j dur;|j D ]4}t|tstd|ddk rtd|dd\}}||d}| jdt	| | j
d qdS dS )z.Write all database cross references (PRIVATE).Nz(dbxrefs should be of type list of stringrp   r   z9dbxrefs should be in the form ['source:id', 'source:id' ])r   r<   r^   )rq   r   r   r   findr&   splitr   r   r   r   )r   rD   ru   dbsourcedbidr   r   r   r   r     s   



zSeqXmlWriter._write_dbxrefsc                 C   s   |j  D ]h\}}|dvrm|du r%d|i}| jdt| | jd qt|trO|D ]!}|du r7d|i}n|t|d}| jdt| | jd q,qt|t	t
tfrm|t|d}| jdt| | jd qdS )ztWrite all annotations that are key value pairs with values of a primitive type or list of primitive types (PRIVATE).)r=   r>   r   Nr-   rP   )r-   r2   )r?   r(   r   r   r   r   r   r   r   r)   float)r   rD   r1   r2   r   vr   r   r   r     s,   


zSeqXmlWriter._write_properties)NNNN)r{   r|   r}   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r     s    r   N)r~   xmlr   xml.saxr   xml.sax.saxutilsr   xml.sax.xmlreaderr   Bio.Seqr   Bio.SeqRecordr   
Interfacesr	   r
   r   r   r   r   r   r   r   <module>   s   
   O