o
    Rŀgu                     @   s   d Z ddlZddlmZ ddlmZ ddlmZ ddlmZ ddlm	Z	 ddlm
Z
 dd	lmZ dd
lmZ ddlmZ ede 							d$ddZdd Zd%ddZ	d&ddZdd Zdd Z	d&ddZd d! Zed"kr~dd#lmZ e  dS dS )'z'Code for dealing with Codon Alignments.    N)Iterable)Mapping)BiopythonExperimentalWarning)BiopythonWarning)CodonAlignment)mktest)CodonSeq
CodonTable)	SeqRecordztBio.codonalign is an experimental module which may undergo significant changes prior to its future official release.-XF
   c	              
   C   s  ddl m}	 t| |	stdt| }
|du rRzt|}W n ty.   t|}t|}Y nw |
|kr>td|
 d| dt|trFd}nMt|trNd}nEtd	t|t	s[td
t||
krt|trgni }|D ]}|j
}||v r|td| d|||< qk|}d}ntdt| d|
 d|dkrt| |}ni|dkrt| }dd | D }|| r|| }tdd| dg }| D ]}||||j
 f qn3|dkrg }| D ])}z||j
 }W n ty   td|j
 d td Y nw |||| f q|du rtjd }g }d}|D ]B}t|d |d ||||d}|s:td|d j
 d|d j
 dt|d |d |||||d}|| |d dkrWd}q|rbtt|S t|S )a\  Build a codon alignment from protein alignment and corresponding nucleotides.

    Arguments:
     - pro_align  - a protein MultipleSeqAlignment object
     - nucl_seqs - an object returned by SeqIO.parse or SeqIO.index
       or a collection of SeqRecord.
     - corr_dict  - a dict that maps protein id to nucleotide id
     - complete_protein - whether the sequence begins with a start
       codon

    Return a CodonAlignment object.

    The example below answers this Biostars question: https://www.biostars.org/p/89741/

    >>> from Bio.Seq import Seq
    >>> from Bio.SeqRecord import SeqRecord
    >>> from Bio.Align import MultipleSeqAlignment
    >>> from Bio.codonalign import build
    >>> seq1 = SeqRecord(Seq('ATGTCTCGT'), id='pro1')
    >>> seq2 = SeqRecord(Seq('ATGCGT'), id='pro2')
    >>> pro1 = SeqRecord(Seq('MSR'), id='pro1')
    >>> pro2 = SeqRecord(Seq('M-R'), id='pro2')
    >>> aln = MultipleSeqAlignment([pro1, pro2])
    >>> codon_aln = build(aln, [seq1, seq2])
    >>> print(codon_aln)
    CodonAlignment with 2 rows and 9 columns (3 codons)
    ATGTCTCGT pro1
    ATG---CGT pro2

    r   )MultipleSeqAlignmentz:the first argument should be a MultipleSeqAlignment objectNz2Higher Number of SeqRecords in Protein Alignment (z,) than the Number of Nucleotide SeqRecords (z) are found!   zBNucl Sequences Error, Unknown type to assign correspondence methodzHcorr_dict should be a dict that corresponds protein id to nucleotide id!zDuplicate key ''   zNumber of items in corr_dict (z*) is less than number of protein records ()c                 S   s   h | ]}|j qS  id.0ir   r   K/var/www/html/myenv/lib/python3.10/site-packages/Bio/codonalign/__init__.py	<setcomp>   s    zbuild.<locals>.<setcomp>Protein Record z, z= cannot find a nucleotide sequence match, please check the idzProtein record (z) is not in corr_dict!F)gap_charcodon_tablecomplete_protein
anchor_len and Nucleotide Record  do not match!)r   r   r   	max_scoreT)	Bio.Alignr   
isinstance	TypeErrorlentuple
ValueErrorr   r   dictr   RuntimeErrorzipsetkeysjoinappendKeyErrorprintexitr
   generic_by_id_check_corr_get_codon_recr   _align_shift_recs)	pro_align	nucl_seqs	corr_dictr   unknownr   r   r    r#   r   pro_numnucl_numcorr_methoddrecordkeypro_nucl_pairnucl_idpro_iddiffpro_rec	codon_alnshiftpair	corr_span	codon_recr   r   r   build   s   ,









	rL   c                 C   sT   d}t |  D ]!}tt|dkr|dt|7 }q|ddt| d 7 }q|S )zFGenerate regular expression based on a given list of codons (PRIVATE). r   [])r,   r'   r-   r/   )codonsregr   r   r   r   
_codons2re   s   rR   *c                 C   s   ddl m} t| |stdi }| j D ]\}}||g | q| D ]
\}}t|||< q(t| j	||< d||< |S )a  Set up the regular expression of a given CodonTable (PRIVATE).

    >>> from Bio.Data.CodonTable import generic_by_id
    >>> p = generic_by_id[1]
    >>> t = _get_aa_regex(p)
    >>> print(t['A'][0])
    G
    >>> print(t['A'][1])
    C
    >>> print(sorted(list(t['A'][2:])))
    ['A', 'C', 'G', 'T', 'U', '[', ']']
    >>> print(sorted(list(t['L'][:5])))
    ['C', 'T', 'U', '[', ']']
    >>> print(sorted(list(t['L'][5:9])))
    ['T', 'U', '[', ']']
    >>> print(sorted(list(t['L'][9:])))
    ['A', 'C', 'G', 'T', 'U', '[', ']']

    r   r	   z;Input table is not a instance of Bio.Data.CodonTable objectz...)
Bio.Data.CodonTabler
   r%   r&   forward_tableitems
setdefaultr0   rR   stop_codons)r   stopr;   r
   aa2codoncodonaarP   r   r   r   _get_aa_regex   s   
r]   c                    s&  ddl }t| trt|tstdt|}d}| jD ]}	|	|kr'|||	 7 }qt|j |d}
|	||
}|rA|
 dfS t| j|d fddtdt D }t|d  k rl|d |d  |d< g }d}g }t|D ]\}}t|}d}d}| kr|D ]"}	|r|dkr|t|j7 }||d	 7 }q|||	 7 }||d	 7 }q|	||
}n&| krd}d}t t|D ]}||||  7 }||d	 7 }q|	||
}|r|| | |f | kr|| qv|| qv| kr|| qv|| qvd|}|	||
}|r|
 d
fS d}d}|r|d d dkrd
dd  d d  d
 dg}|d }|D ]p}|dkrKd} nf|dv rW d | }n|d  d d  d
 fv rp d d  |  }|d d |kr|
|d d | |d d  }n
|
d|d d  }t|||| |\}}|dur|dkr||d<  nq@|dkrtd|j t tt|d
 D ]x}||d
  d || d  d   }d||| d ||d
  d  }|
|| d ||d
  d  }d}|dkrt|||| |\}}|dur0|dkr0|g||| d ||d
  d < d}q|dkr?td|j t q|d d d
 t|d
 kr|d }t|}d
dd| d d| d
 dg}|D ]x}|dkrtd} nn|dv r|d | }n|d| d d| d
 fv r|d d| |  }t|
|d d  |kr|
|d d |d d |  }n
|
|d d d }t||||||\}}|dur|dkr|  ||d<  nqi|dkrtd|j t d|}|	||
}|r|
 d|fS td| j d|j d)znCheck if the nucleotide can be translated into the protein (PRIVATE).

    Expects two SeqRecord objects.
    r   NzB_check_corr accepts two SeqRecord object. Please check your input.rM   c                    s   g | ]
}||   qS r   r   r   r    pro_seqr   r   
<listcomp>  s    z_check_corr.<locals>.<listcomp>r   r   Tr      )r   r   z&first frameshift detection failed for z'middle frameshift detection failed for z%last frameshift detection failed for zProtein SeqRecord (z) and Nucleotide SeqRecord (z) do not match!)rer%   r   r&   r]   seqstrupperreplacesearchspanranger'   	enumeraterR   start_codonsr0   startendr/   _get_shift_anchor_rewarningswarnr   r   popr+   )pronuclr   r   r   r    rd   aa2repro_rer\   nucl_seqmatchanchorsanchor_distance
anchor_posr   anchorthis_anchor_lenqcodonfncodonlast_qcodonlast_fcodonjfull_pro_refirst_anchorshift_id_posshift_val_lstsh_anc	shift_val
sh_nuc_lensh_nucr   r^   r   r5      s2  






$& 
"












r5   c              
   C   s  ddl }dd tddD }d|  k rd| d k rsn nUtt| D ]F}d	}	t| D ]\}
}|
|krC|	|| d
 ||  d 7 }	q,|	|| 7 }	q,|	d7 }	||	|}|rj|	d	ddd}	|d7 }|	|f  S q$|sqd|fS dS |d| d d| d fv rd| | }tdt| D ]S}d	}	t| D ]*\}
}|
|d krq|
|kr|	t| |d  | | ||||  7 }	q|	|| 7 }	q|	d7 }	||	|}|r|	d	ddd}	|d7 }|	|f  S q|sd|fS dS dS )a  Find a regular expression matching a potentially shifted anchor (PRIVATE).

    Arguments:
     - sh_anc    - shifted anchor sequence
     - sh_nuc    - potentially corresponding nucleotide sequence
       of sh_anc
     - shift_val - 1 or 2 indicates forward frame shift, whereas
       3*anchor_len-1 or 3*anchor_len-2 indicates
       backward shift
     - aa2re     - aa to codon re dict
     - anchor_len - length of the anchor
     - shift_id_pos - specify current shift name we are at

    r   Nc                 S   s   g | ]}t |qS r   )chrr   r   r   r   r`         z(_get_shift_anchor_re.<locals>.<listcomp>a   k   rc   r   ^(?P<z>..*)$rM   r   ra   )rd   rk   r'   rl   ri   rh   _merge_aa2rerg   )r   r   r   rv   r    r   rd   shift_idr   r   kr\   ry   r   r   r   rp     s\    

rp   c                 C   sn  dd }t t|||  || f}|dkr_dt|d d t|d d @ }d| d }|d	|d d  d
 d	 |d d  d
 d	 | d
 d	 |d d  d
 d	 |d d  d
 7 }nR|dkrdt|d d t|d d @ }	dt|d d t|d d @ }
d| d }|d	|d d  d
 d	 |	 d
 d	 |
 d
 d	 |d d  d
 7 }|d7 }|S )zDMerge two amino acids based on detected frame shift value (PRIVATE).c                 S   sh   g }d}| D ]+}|dkrd}| d q|dkrd}q|dkr(|d | |d< q|dkr1| | q|S )Nr   rN   ra   rM   rO   )r0   )re_aaaasmr   r   r   r   get_aa_from_codonre   s   
z)_merge_aa2re.<locals>.get_aa_from_codonrer   rM   r   r   r   >rN   rO   r   )listmapr/   r-   )aa1aa2r   rv   reidr   scodon	intersectscodonre
intersect1
intersect2r   r   r   r     s   &

	


&&
	

r   c                 C   s  ddl }ddlm} |j|d}	|d }
|d }t|}|dv rt| j|dd |
d |
d  krAtd| j d	|j d
d}t	 }| jD ]}|dkrT|d7 }qI|r|dkr|	|
d |
d d  }|
t|jt| s|d8 }td| j d| d| d|j d| dt |dkrtd|j d||7 }|d7 }qI|	|
d d|  |
d d|d    }| j|d|kr|d8 }td| j|||j|f t |dkrtd|j d||7 }|d7 }qIt||jdS |dkrddlm} |g }g }|d }t|  }|D ]}||| ||| qg }| }	 || |d7 }||v rm|||  rm||}d|| d || d   }|| ||d |  || d }n||v r|||  r||| d }||  krnq-t	 }d}| jD ]}|dkr|d7 }q|r|dkr|	|d |d d  }|
t|jt| s|d8 }td| j d| d| d|j d| dt ||7 }|d7 }q|t| jddd k r%||d  ||  d dk r%|d8 }|| }|d|  }|}|	|| d|  }ny|| ||d   d dkri|d8 }||d  d }|| }d|| ||d   d  }|	|| d|  |	|| || d   }n5|| }|d }|	|| }| j|d|kr|d8 }td| j d| d| d|j d| dt |dkrtd|j d||7 }|d7 }q||_!t||jdS dS )a3  Generate codon alignment based on regular re match (PRIVATE).

    span_mode is a tuple returned by _check_corr. The first element
    is the span of a re search, and the second element is the mode
    for the match.

    mode
     - 0: direct match
     - 1: mismatch (no indels)
     - 2: frameshift

    r   N)SeqrM   r   )r   r   rc   r   r!   r"   r   ---zstart codon of z ( z) does not correspond to r   zmax_score reached for z=! Please raise up the tolerance to get an alignment in anyway)tablez'%s(%s %d) does not correspond to %s(%s)r   r   )dequeT   (z	Codon of )"rd   Bio.Seqr   re   rh   r]   r'   r)   r   r   ri   rR   rm   rf   rg   rq   rr   r   r+   	translater   collectionsr   r   	groupdictr.   r0   rj   rn   indexisupperislowerro   rf_table)rt   ru   	span_moder   r   r   r#   rd   r   rx   rj   moderv   aa_num	codon_seqr\   
this_codonr   	shift_posshift_startry   m_groupdictr   r   shift_indexr   rn   ro   ngapr   r   r   r6   ;  s  &



(







r6   c                    s2  dd }dd | D }dgt |  }t| D ]1\}}|j D ]'ttr/||  d7  < q|jttd  dkrF||  d7  < qqt t|dkrTtd	dt | }	 g }zfdd|D }W n
 tyr   Y | S w t|D ]\}	}|	|	t|f t|t
r| |	 jt|t|d  dkr||||	 \}
}|
| d dkrd|
| d   nd  dkrdt  }t| |	 jjd}|| |	 jdt| | | |	 jt|d  7 }||	 }||}|d|  fdd||d d D  }|||	< || |	 _|   |
| 7  |d 7 qwt ||kr|D ]l\}	}t| |	 jjd}dt  }|| |	 jdt| | | |	 jt|d  7 }||	 }||}g }tdt |dD ]}|	|| d  qg|d| |  fdd||d D  }|||	< || |	 _q'd7 q[)zBuild alignment according to the frameshift detected by _check_corr (PRIVATE).

    Argument:
     - recs - a list of SeqRecords containing a CodonSeq dictated
       by a rf_table (with frameshift in some of them).

    c                 S   s<   | | }d}	 t|||  tr|||  |fS |d7 }q)Nr   Tr   )r   r%   int)r   lstidxpr   r   r   find_next_int  s   
z(_align_shift_recs.<locals>.find_next_intc                 S   s   g | ]}|j  qS r   )re   get_full_rf_table)r   recr   r   r   r`     s    z%_align_shift_recs.<locals>.<listcomp>r   r   rc   r   z3Number of alignable codons unequal in given recordsTc                    s   g | ]}|  qS r   r   )r   r   )r   r   r   r`     r   r   )r   Nc                       g | ]}|t   qS r   r   r   vgap_numr   r   r`   	  s    g      @c                    r   r   r   r   r   r   r   r`     s    )r'   rl   re   r   r%   r   r-   r+   
IndexErrorr0   floatr   r   r   rs   rk   )recsr   full_rf_table_lstrf_numr   r   rec_numadd_lst
col_rf_lstr   r   r   gapsre   full_rf_tablebpinter_rftr   )r   r   r   r7     s   	
)0
4

4

r7   __main__)run_doctest)Nr   r   NFr   r   )rS   r   )Fr   )__doc__rq   collections.abcr   r   Bior   r   Bio.codonalign.codonalignmentr   r   Bio.codonalign.codonseqr   Bio.Datar
   Bio.SeqRecordr   rr   rL   rR   r]   r5   rp   r   r6   r7   __name__
Bio._utilsr   r   r   r   r   <module>   sN   

 .
#
 DD?
 P
