o
    Rŀgj                     @   sp  d Z ddlZddlmZ ddlmZ ddlmZ ddlmZ ddlm	Z	 G dd	 d	eZ
G d
d de
ZG dd deZG dd de
ZG dd deZG dd deZG dd deZG dd deZG dd de
ZG dd de
ZG dd de
Zi deded ed!ed"ed#ed$ed%ed&ed'ed(ed)ed*ed+ed,ed-ed.eeed/ZdS )0aO  Dictionary like indexing of sequence files (PRIVATE).

You are not expected to access this module, or any of its code, directly. This
is all handled internally by the Bio.SeqIO.index(...) and index_db(...)
functions which are the public interface for this functionality.

The basic idea is that we scan over a sequence file, looking for new record
markers. We then try to extract the string that Bio.SeqIO.parse/read would
use as the record id, ideally without actually parsing the full record. We
then use a subclassed Python dictionary to record the file offset for the
record start against the record id.

Note that this means full parsing is on demand, so any invalid or problem
record may not trigger an exception until it is accessed. This is by design.

This means our dictionary like objects have in memory ALL the keys (all the
record identifiers), which shouldn't be a problem even with second generation
sequencing. If memory is an issue, the index_db(...) interface stores the
keys and offsets in an SQLite database - which can be re-used to avoid
re-indexing the file for use another time.
    N)BytesIO)StringIO)SeqIO)_IndexedSeqFileProxy)_open_for_random_accessc                   @       e Zd ZdZdd Zdd ZdS )SeqFileRandomAccessz8Base class for defining random access to sequence files.c                 C   s    t || _|| _tj| | _dS Initialize the class.N)r   _handle_formatr   _FormatToIterator	_iteratorselffilenameformat r   D/var/www/html/myenv/lib/python3.10/site-packages/Bio/SeqIO/_index.py__init__)   s   
zSeqFileRandomAccess.__init__c                 C   s   t | t| | S )zReturn SeqRecord.)nextr   r   get_rawdecode)r   offsetr   r   r   get1   s   zSeqFileRandomAccess.getN)__name__
__module____qualname____doc__r   r   r   r   r   r   r   &   s    r   c                   @   s0   e Zd ZdZdd Zdd Zdd Zdd	 Zd
S )SffRandomAccessz7Random access to a Standard Flowgram Format (SFF) file.c                 C   s4   t | || tj| j\}}}}| _| _| _dS r	   )	r   r   r   SffIO_sff_file_headerr   _flows_per_read_flow_chars_key_sequence)r   r   r   header_lengthindex_offsetindex_lengthnumber_of_readsr   r   r   r   A   s   	zSffRandomAccess.__init__c              
   c   s   | j }|d tj|\}}}}| _| _| _|r|rd}d}z(tj|D ]\}}	t	||	}||	dfV  |d7 }q(||krHt
d||f W n2 t
y{ }
 z&ddl}ddlm} |d|
 | |dkslJ d|d W Y d}
~
n"d}
~
ww || |kr|| tj|| j tj||| dS d}tj|D ]\}}	||	dfV  |d7 }q||krt
d||f tj||| dS )zELoad any index block in the file, or build it the slow way (PRIVATE).r      zIndexed %i records, expected %iN)BiopythonParserWarningzCould not parse the SFF index: zPartially populated index)r   seekr   r    r!   r"   r#   r$   _sff_read_roche_indexmax
ValueErrorwarningsBior*   warn_sff_read_raw_record
_check_eof_sff_do_slow_index)r   handler%   r&   r'   r(   count
max_offsetnamer   errr/   r*   r   r   r   __iter__N   sb   









zSffRandomAccess.__iter__c                 C   s(   | j }|| tj|| j| j| jS )2Return the SeqRecord starting at the given offset.r   r+   r   r    _sff_read_seq_recordr"   r#   r$   r   r   r5   r   r   r   r      s
   
zSffRandomAccess.getc                 C   s    | j }|| tj|| jS 6Return the raw record from the file as a bytes string.)r   r+   r   r    r2   r"   r>   r   r   r   r      s   
zSffRandomAccess.get_rawN)r   r   r   r   r   r:   r   r   r   r   r   r   r   >   s    >r   c                   @      e Zd ZdZdd ZdS )SffTrimedRandomAccesszLRandom access to an SFF file with defined trimming applied to each sequence.c                 C   s,   | j }|| tjj|| j| j| jddS )r;   T)trimr<   r>   r   r   r   r      s   
zSffTrimedRandomAccess.getN)r   r   r   r   r   r   r   r   r   rB          rB   c                   @   (   e Zd ZdZdd Zdd Zdd ZdS )	SequentialSeqFileRandomAccessz3Random access to a simple sequential sequence file.c                 C   sH   t | || dddddddddddd| }|| _td	| | _d
S )r
   s   CO s   ID    >s   LOCUS s   BEGIN_SEQUENCEs   >..;s   <entry )aceemblfastagenbankgbimgtphdpirqualswissuniprot-xml   ^N)r   r   _markerrecompile
_marker_re)r   r   r   markerr   r   r   r      s"   z&SequentialSeqFileRandomAccess.__init__c           	      c   s    t | j}| j}| j}|d 	 | }| }||s!|s"nq||ra||d  	ddd }t |}	 | }| }||sJ|sU|
 ||fV  |}n|t |7 }q;||s(|riJ t|dS )z#Return (id, offset, length) tuples.r   TNr)   )lenrT   rW   r   r+   tellreadlinematchstripsplitr   repr)	r   marker_offset	marker_rer5   start_offsetlineidlength
end_offsetr   r   r   r:      s2   



z&SequentialSeqFileRandomAccess.__iter__c                 C   sP   | j }| j}|| | g}	 | }||s|sn|| qd|S )r@   T    )r   rW   r+   r[   r\   appendjoinr   r   r5   ra   linesrc   r   r   r   r      s   



z%SequentialSeqFileRandomAccess.get_rawNr   r   r   r   r   r:   r   r   r   r   r   rF      s
    rF   c                   @   rA   )GenBankRandomAccessz1Indexed dictionary like access to a GenBank file.c                 c   s   | j }|d | j}d}d}	 | }| }||s |s!nq||rz|dd ddd }W n ty@   d}Y nw t|}	 | }	| }||sU|sf|s[td|	 ||fV  |	}nK|
|rz
|  d }W n4 ty~   Y n,w |
|rz|  d }
|
d	dkr|
d	d  r|
}W n	 ty   Y nw |t|7 }qF||s'|rJ t|dS )
.Iterate over the sequence records in the file.r   s
   ACCESSION s   VERSION T   Nr)   z1Did not find usable ACCESSION/VERSION/LOCUS lines   .)r   r+   rW   rZ   r[   r\   r^   r.   rY   r   
startswithrstrip
IndexErrorr6   isdigitr_   )r   r5   ra   accession_markerversion_markerrb   rc   keyre   rf   
version_idr   r   r   r:      sf   




*zGenBankRandomAccess.__iter__Nr   r   r   r   r:   r   r   r   r   rm      rD   rm   c                   @   rA   )EmblRandomAccessz/Indexed dictionary like access to an EMBL file.c                 c   s   | j }|d | j}d}d}	 | }| }||s |s!nq||rd}t|}|dd dd	v rh|d
d  	d}	|	d 
 |ra|	d 
 d |	d 
 	 d  }
d}n3|	d 
 }
n,|dd ddv r|d
d 
 	ddd }
|
dr|
dd }
ntd|	 | }||s|s| t| }|
 ||fV  |}n1||r|s| 	 d }
|
dr|
dd }
n||r| 	 d }
d}|t|7 }q||s'|rJ t|dS )rn   r   s   SV    AC TF   N   ;)ro         r)   rp   )r|   r   z&Did not recognise the ID line layout:
)r   r+   rW   rZ   r[   r\   rY   r6   rr   r^   r]   rq   endswithr.   r   r_   )r   r5   ra   	sv_marker	ac_markerrb   rc   setbysvre   partsrw   rf   r   r   r   r:   :  s\   

$



&zEmblRandomAccess.__iter__Nry   r   r   r   r   rz   7  rD   rz   c                   @   rA   )SwissRandomAccessz"Random access to a SwissProt file.c                 c   s    | j }|d | j}	 | }| }||s|snq||rnt|}| }|t|7 }|ds8J |dd  	dd  }	 | }| }||sW|sb|
 ||fV  |}n|t|7 }qH||s#|rvJ t|dS )rn   r   Tr{   r   Nr}   )r   r+   rW   rZ   r[   r\   rY   rq   r]   r^   r   r_   )r   r5   ra   rb   rc   re   rw   rf   r   r   r   r:   t  s6   


zSwissRandomAccess.__iter__Nry   r   r   r   r   r   q  rD   r   c                   @   rE   )	UniprotRandomAccessz$Random access to a UniProt XML file.c                 c   s   | j }|d | j}d}d}d}	 | }| }||s"|s#nq||rt|}d}		 | }|	du r\||v r\||v sDJ ||||d d dd	d }	|t|7 }n6||v r|||d
 7 }| t| || d
 }
|| |
ksJ n||s|st	d|t|7 }q0|	st	d||| f |	
 ||fV  ||s|r| }| }||s|s||s)|rJ t|dS )rn   r   s   <accession>s   </accession>   </entry>TN      <r)      Didn't find end of recordz/Did not find <accession> line in bytes %i to %i)r   r+   rW   rZ   r[   r\   rY   findr^   r.   r   r_   )r   r5   ra   start_acc_markerend_acc_markerend_entry_markerrb   rc   re   rw   rf   r   r   r   r:     sZ   

"

"zUniprotRandomAccess.__iter__c                 C   s   | j }| j}d}|| | g}	 | }||}|dkr,||d|d   n||s3|s7td|| qd|S )r@   r   Tr   Nr   r   rg   )	r   rW   r+   r[   r   rh   r\   r.   ri   )r   r   r5   ra   r   datarc   ir   r   r   r     s    





zUniprotRandomAccess.get_rawc                 C   s&   d|  | d }ttjt|S )r;   s  <?xml version='1.0' encoding='UTF-8'?>
        <uniprot xmlns="http://uniprot.org/uniprot"
        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
        xsi:schemaLocation="http://uniprot.org/uniprot
        http://www.uniprot.org/support/docs/uniprot.xsd">
        s
   </uniprot>)r   r   r   	UniprotIOUniprotIteratorr   )r   r   r   r   r   r   r     s   
zUniprotRandomAccess.getN)r   r   r   r   r:   r   r   r   r   r   r   r     s
    3r   c                   @   rE   )	IntelliGeneticsRandomAccessz(Random access to a IntelliGenetics file.c                 C   s   t | || td| _dS )r
   s   ^;N)r   r   rU   rV   rW   r   r   r   r   r     s   z$IntelliGeneticsRandomAccess.__init__c                 c   s   | j }|d d}d}	 |t|7 }| }|sn|ds!nq|rd}|t| | ks2J |ds>td||drR|t|7 }| }|dsC| }|rn|dsn|t|7 }| }|rn|dr]| ||fV  ||7 }|t| | ksJ |s$dS dS )rn   r    Ts   ;;r}   z'Records should start with ';' and not:
N)	r   r+   rY   r[   rq   rZ   r.   rr   r   )r   r5   r   rc   re   rw   r   r   r   r:     s>   




z$IntelliGeneticsRandomAccess.__iter__c                 C   s   | j }|| | j}g }| }|dr$|| | }|ds|r;|ds;|| | }|r;|dr+d|S )r@   r}   rg   )r   r+   rW   r[   rq   rh   ri   rj   r   r   r   r     s   





z#IntelliGeneticsRandomAccess.get_rawNrl   r   r   r   r   r     s
    r   c                   @   r   )TabRandomAccessz&Random access to a simple tabbed file.c                 c   sv    | j }|d d}	 | }| }|sdS z	||d }W n ty/   | s.Y q w | |t|fV  q)rn   r      	TN)	r   r+   rZ   r[   r^   r.   r]   r   rY   )r   r5   tab_charrb   rc   rw   r   r   r   r:   (  s$   
zTabRandomAccess.__iter__c                 C   s   | j }|| | S r?   )r   r+   r[   r>   r   r   r   r   =  s   
zTabRandomAccess.get_rawNr   r   r   r   r:   r   r   r   r   r   r   %  s    r   c                   @   r   )FastqRandomAccesszRandom access to a FASTQ file (any supported variant).

    With FASTQ the records all start with a "@" line, but so can quality lines.
    Note this will cope with line-wrapped FASTQ files.
    c           	      c   s   | j }|d d}| }| }|sdS |dd dkr&td||r|dd  ddd }d}t|}|rX| }|t|7 }|drNn
|t|	 7 }|s>|s^tdd}|r||kr|dkr| }|	 rytd||t|7 }| }| }|r|dd dkrtd	|n| }|t|	 7 }|t|7 }|sb||krtd
|
 ||fV  |}|s(dS dS )rn   r   Nr)      @Problem with FASTQ @ line:
   +$Premature end of file in seq section!Expected blank quality line, not Problem with line Problem with quality section)r   r+   rZ   r[   r.   rr   r^   rY   rq   r]   r   )	r   r5   rd   rb   rc   seq_lenre   qual_lenrf   r   r   r   r:   P  s^   

zFastqRandomAccess.__iter__c                 C   s4  | j }|| | }|}|dd dkrtd|d}|r9| }||7 }|dr/n
|t| 7 }|s!|s?td|dd dksIJ d}|r||kr~|dkrh| }| rdtd|||7 }| }|r}|dd dkr}td|n| }||7 }|t| 7 }|sM||krtd	|S )
r@   r   r)   r   r   r   r   r   r   r   )r   r+   r[   r.   rq   rY   r]   )r   r   r5   rc   r   r   r   r   r   r   r     sH   

zFastqRandomAccess.get_rawNr   r   r   r   r   r   I  s    7r   rH   rI   rJ   fastqzfastq-sangerzfastq-solexazfastq-illuminarK   rL   igrM   rN   rO   sffzsff-trimrQ   tab)rP   rR   )r   rU   ior   r   r0   r   Bio.Filer   r   r   r   rB   rF   rm   rz   r   r   r   r   r   _FormatToRandomAccessr   r   r   r   <module>   sp   ]I>:"[7$l	
