o
    RŀgR                     @   s>  d Z ddlZddlmZ ddlmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddddZedZedZdd Zd,ddZdd Zdd Zdd Zdd Zdd  Zd!d" Zd-d$d%ZG d&d' d'eZ G d(d) d)eZ!e"d*krdd+l#m$Z$ e$  dS dS ).zGBio.SearchIO abstract base parser for Exonerate standard output format.    N)ABC)abstractmethodreduce)Optional)Type)SearchIndexer)Hit)HSP)HSPFragment)QueryResult)seq1   )+-.z(#+)z[53ISCF]c                 C   s0   | j d d | j | _| jd d | j | _dS )z%Set the HSPFragment frames (PRIVATE).   r   N)	hit_start
hit_strand	hit_framequery_startquery_strandquery_frame)frag r   R/var/www/html/myenv/lib/python3.10/site-packages/Bio/SearchIO/ExonerateIO/_base.py
_set_frame   s   r   c                    sd   | d| }| |d  t  d }|sdn d| d } fddtt  d D }|||fS )a	  Select a valid amino acid sequence given a 3-letter code input (PRIVATE).

    This function takes a single three-letter amino acid sequence and the phase
    of the sequence to return the longest intact amino acid sequence possible.
    Parts of the input sequence before and after the selected sequence are also
    returned.

    This is an internal private function and is meant for parsing Exonerate's
    three-letter amino acid output.

    >>> from Bio.SearchIO.ExonerateIO._base import _make_triplets
    >>> _make_triplets('GlyThrSerAlaPro')
    ('', ['Gly', 'Thr', 'Ser', 'Ala', 'Pro'], '')
    >>> _make_triplets('yThrSerAla', phase=1)
    ('y', ['Thr', 'Ser', 'Ala'], '')
    >>> _make_triplets('yThrSerAlaPr', phase=1)
    ('y', ['Thr', 'Ser', 'Ala'], 'Pr')

    Nr    r   c                    s$   g | ]} d | d |d   qS )r   r   r   ).0inp_seqr   r   
<listcomp><   s   $ z"_make_triplets.<locals>.<listcomp>)lenrange)seqphaseprenon_tripletspostintactsr   r!   r   _make_triplets$   s   
r,   c                 C   s&   | sg S dg}t dd | dd |S )a  Return the letter coordinate of the given list of fragments (PRIVATE).

    This function takes a list of three-letter amino acid sequences and
    returns a list of coordinates for each fragment had all the input
    sequences been flattened.

    This is an internal private function and is meant for parsing Exonerate's
    three-letter amino acid output.

    >>> from Bio.SearchIO.ExonerateIO._base import _get_fragments_coord
    >>> _get_fragments_coord(['Thr', 'Ser', 'Ala'])
    [0, 3, 6]
    >>> _get_fragments_coord(['Thr', 'SerAlaPro', 'GlyLeu'])
    [0, 3, 12]
    >>> _get_fragments_coord(['Thr', 'SerAlaPro', 'GlyLeu', 'Cys'])
    [0, 3, 12, 18]

    r   c                 S   s   | | d t | g S )Nr   )r$   )accr   r   r   r   <lambda>W   s    z&_get_fragments_coord.<locals>.<lambda>Nr   r   )fragsinitr   r   r   _get_fragments_coord@   s   r1   c                 C   s   dd t | D S )ao  Return the phases of the given list of 3-letter amino acid fragments (PRIVATE).

    This is an internal private function and is meant for parsing Exonerate's
    three-letter amino acid output.

    >>> from Bio.SearchIO.ExonerateIO._base import _get_fragments_phase
    >>> _get_fragments_phase(['Thr', 'Ser', 'Ala'])
    [0, 0, 0]
    >>> _get_fragments_phase(['ThrSe', 'rAla'])
    [0, 1]
    >>> _get_fragments_phase(['ThrSe', 'rAlaLeu', 'ProCys'])
    [0, 1, 0]
    >>> _get_fragments_phase(['ThrSe', 'rAlaLeuP', 'roCys'])
    [0, 1, 2]
    >>> _get_fragments_phase(['ThrSe', 'rAlaLeuPr', 'oCys'])
    [0, 1, 1]

    c                 S   s   g | ]
}d |d   d  qS )r   r   r   xr   r   r   r#   m   s    z(_get_fragments_phase.<locals>.<listcomp>)r1   )r/   r   r   r   _get_fragments_phaseZ   s   r4   c                 C   s  ddd}| d j }| d j}t| }t| |D ]\}}|jdks(|jdks(J |jdkr/dnd}||_t|jj	}t
||\}	}
}t|jj	}t
||\}}}|rTdnd}|rZdnd}td||d	}|t||  }|t|dd|  }|	r}dnd}|rdnd}td|
|d	}|t| }|t|dd }d
|_d
|_|| | |_|| | |_|jdkr|||_|_n|jdkr|||_ |_|j D ]!\}}t
||\}}}ttd
|g| ttd
|g |j|< q||}}q| S )zTransform 3-letter AA codes of input fragments to one-letter codes (PRIVATE).

    Argument fraglist should be a list of HSPFragments objects.
    *r   )z***z<->r   r   r   Xr   )
custom_mapN)r   r   r4   zipr   r   r'   strqueryr&   r,   hitr   joinr$   replace	query_endhit_endaln_annotationitemslistfilter)fraglistr7   
hsp_hstart
hsp_qstartfrag_phasesr   r'   hstepqseqq_triplets_pre
q_tripletsq_triplets_posthseqh_triplets_pre
h_tripletsh_triplets_post	hseq1_pre
hseq1_posthseq1hstarthend	qseq1_pre
qseq1_postqseq1qstartqendannotannotseqr(   intactr*   r   r   r   _adjust_aa_seqp   sH   




"
r^   c                    sF  | j d }|ddksJ g }| jdkrdnd}| jdkr dnd}|dkr+t| jnt| j}|dkr9t| jnt| j}d}|r!zt	t
|d}||}	|	t| }
| |||	   W n tyx   d}d}	t|}
| |d  Y nw ||}}|t t fdd	d
D  | 7 }|t t fdd	d
D  | 7 }t|| _t|| _t|| _t|| _t||	 ||
 }t| j dkr| | jj| | jjf}nt| j dkr| | j d | | j d f}d|d v r|t|| 7 }nd|d v r|t|| 7 }t  |  ||
d }||
7 }|sC|S )zLSplit one HSPFragment containing frame-shifted alignment into two (PRIVATE).
similarity#r   r   r   r   Nc                 3       | ]
} j j|V  qd S N)r:   r&   countr2   splitr   r   	<genexpr>       z"_split_fragment.<locals>.<genexpr>)r   <>c                 3   ra   rb   )r;   r&   rc   r2   rd   r   r   rf      rg      r   query_annotationhit_annotation)r@   rc   r   r   minquery_rangemax	hit_rangeresearch
_RE_SHIFTSgroupfindr$   AttributeErrorsumr   r   r?   r>   slicer:   r&   r;   r   append)r   similsplit_fragsqsteprH   qposhposabs_posshiftss_starts_stoprY   rT   	abs_sliceseqsr   rd   r   _split_fragment   sd   



/r   c              	   C   s  g }t |d D ]\}}|d}|du rdn|| }|d}|du r&dn|| }	t| |||	d}
|d |
_|d |
_|d	 | d |
_|d	 | d |
_z|d
i }| D ]\}}|| |
j|< qYW n	 t	yo   Y nw |d |
_
|d |
_|
jddurd|
jd v r|t|
 qt|
jdks|
j
dksd|v rtt|d rt|
 ||
 qt|d jdkrt|}t|}dD ]}||v rt||||  q|S )zHReturn a list of HSP objects from the given parsed HSP values (PRIVATE).query_rangesr;   Nr   r:   )r;   r:   r   r   
hit_rangesr@   r   r   r_   r`   vulgar_comprj   )scorehit_split_codonsquery_split_codonsmodelr   
cigar_compmolecule_type)	enumerategetr   r   r>   r   r?   rA   r@   
IndexErrorr   r   extendr   r$   rq   rr   	_RE_TRANSr   ry   r^   r
   setattr)hidqidhspdr/   idxqcoordshseqlistrM   qseqlistrI   r   	aln_annotkeyvaluehspattrr   r   r   _create_hsp   sL   






	r   c                 C   sJ   z|  dd\}}}W ||fS  ty$   |  dd\}}d}Y ||fS w )zAParse the 'Query:' line of exonerate alignment outputs (PRIVATE). rj   r   r   )re   
ValueError)linemarkiddescr   r   r   _parse_hit_or_query_line3  s   r   Tc                 C   sZ   |rd| fS d}|  dr|  drdnd}|sd| fS |r)d| dt|  fS d| fS )a8  Determine the strand from the description (PRIVATE).

    Exonerate appends ``:[revcomp]`` (versions <= 2.2) or ``[revcomp]``
    (versions > 2.2) to the query and/or hit description string. This function
    outputs '-' if the description has such modifications or '+' if not. If the
    query and/or hit is a protein sequence, a '.' is output instead.

    Aside from the strand, the input description value is also returned. It is
    returned unmodified if ``modify_desc`` is ``False``. Otherwise, the appended
    ``:[revcomp]`` or ``[revcomp]`` is removed.

    r   r   z	[revcomp]z
:[revcomp]r   r   N)endswithr$   )r   
is_proteinmodify_descsuffixr   r   r   _get_strand_from_desc>  s   
r   c                   @   sV   e Zd ZU dZdZee ed< dd Zdd Z	dd	 Z
ed
d Zdd Zdd ZdS )_BaseExonerateParserz2Abstract base class iterator for exonerate format.N	_ALN_MARKc                 C   s   || _ d| _d S )NF)handlehas_c4_alignment)selfr   r   r   r   __init__`  s   
z_BaseExonerateParser.__init__c                 c   s    	 | j  | _| jdr| jsd| _| jds&| jds&| jdr'n| jr0| jdr2d S q|  D ]}d|_|j|_|D ]}|j|_qB|V  q7d S )NTC4 Alignment:zvulgar:zcigar:-- completed 	exonerate)r   readliner   
startswithr   _parse_qresultprogramdescription)r   qresultr;   r   r   r   __iter__d  s,   



z_BaseExonerateParser.__iter__c                 C   s$   	 | j r	|| j rdS | j | _ q)z@Read the file handle until the given bool function returns True.TN)r   r   r   )r   	bool_funcr   r   r   
read_until|  s
   z_BaseExonerateParser.read_untilc                 C   s   t rb   NotImplementedError)r   headerr   r   r   parse_alignment_block  s   z*_BaseExonerateParser.parse_alignment_blockc           
      C   s  g }| j  r|| j   | j | _ | j  si i i }}}|D ]o}|dr6t|\|d< |d< q$|drFt|\|d< |d< q$|drV|ddd |d< q$|d	rf|dd
d
 |d< q$|dr}|ddd
dd
 \|d< |d< q$|dr|ddd
dd
 \|d< |d< q$t|d d|d v dd\}}||d< ||d< t|d d|d v dd\}}	||d< |	|d< |||dS )NQuery:r   r   zTarget:zModel:r   r   r   z
Raw score:rj   r   zQuery range:      r   r>   zTarget range:r   r?   protein2T)r   r   r   r   2proteinr   r   r;   r   )	r   stripry   r   r   r   r   re   r   )
r   
aln_headerr   r;   r   r   qresult_strandqresult_descr   hit_descr   r   r   _parse_alignment_header  sF   






$
"



z,_BaseExonerateParser._parse_alignment_headerc                 #   s
   d}d}d}d}d}d\}}d }d\}	}
d\}}d\}}g g }} j r(d _	   fd	d
 |d ur<|}|	}|
} jrq j jsKJ  ji i i d} j r_ dd
    } |}|d d }	|d d }
n jrz jdr|}d\}	}
||	kr|}n|}||
ks||kr|}n|}|d urt|||d }|| ||krt	|}|d 
 D ]
\}}t||| q|| g }||ks||krt|d}|D ]}|| q|d 
 D ]
\}}t||| q|V  ||krd S g } j s j  _q))Nr   r   r   rj   r   )NNr   Tc                    s   |   jS rb   )r   r   r   r   r   r   r.     s    z5_BaseExonerateParser._parse_qresult.<locals>.<lambda>r   c                 S   s   |   dS )Nr   )r   r   r   r   r   r   r.     s    r   r   r;   r   r   )r   )r   r   r   r   r   r   r   r   ry   r	   rA   r   r   absorbr   r   )r   	state_EOFstate_QRES_NEWstate_QRES_SAMEstate_HIT_NEWstate_HIT_SAME
qres_state	hit_state
file_statecur_qidcur_hidprev_qidprev_hidcurprevhit_listhsp_listr   r   r;   r   r   r   r   r   r   r     sx   




z#_BaseExonerateParser._parse_qresult)__name__
__module____qualname____doc__r   r   r9   __annotations__r   r   r   r   r   r   r   r   r   r   r   r   [  s   
 
7r   c                   @   sF   e Zd ZU dZdZeee  ed< dZ	ee
 ed< dd Zdd ZdS )	_BaseExonerateIndexerz'Indexer class for Exonerate plain text.N_parser_query_markc                 C   s   t d)NzShould be defined by subclassr   )r   posr   r   r   get_qresult_id  s   z$_BaseExonerateIndexer.get_qresult_idc                 c   s    | j }|d d}	 | }| }|| jrA|du r&| |}|}n'| |}||kr@|||| fV  |}|}|| n|sM|||| fV  dS q)zCIterate over the file handle; yields key, start offset, and length.r   N)_handleseektellr   r   r   r   )r   r   qresult_keystart_offsetr   qresult_offsetcurr_keyr   r   r   r     s,   



z_BaseExonerateIndexer.__iter__)r   r   r   r   r   r   r   r   r   r   bytesr   r   r   r   r   r   r     s   
 r   __main__)run_doctest)r   )T)%r   rq   abcr   r   	functoolsr   typingr   r   Bio.SearchIO._indexr   Bio.SearchIO._modelr	   r
   r   r   Bio.SeqUtilsr   _STRAND_MAPcompilers   r   r   r,   r1   r4   r^   r   r   r   r   r   r   r   
Bio._utilsr   r   r   r   r   <module>   s@   


AB@
 8%
