o
    Rŀg                     @   s  d Z ddlZddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddl	m
Z
 dd	lmZ dd
lmZ ddlmZ ddlmZ ddlZddlmZ ddlmZ dDddZdd ZdEddZdd Zdd Zdd Zdd Zdd  Zd!d" ZdFd$d%Zd&d' Z d(d) Z!d*d+ Z"d,d- Z#d.d/ Z$d0d1 Z%d2d3 Z&dGd4d5Z'dHd6d7Z(d8d9 Z)d:d; Z*d<d= Z+d>d? Z,d@dA Z-e.dBkrddCl/m0Z0 e0  dS dS )Iz5Code for performing calculations on codon alignments.    N)Counter)defaultdict)heapify)heappop)heappush)permutations)erfc)floorlog)sqrt)	Alignment)
CodonTableNG86   c                    s  |du rd}n|dur|dkrt d|dvrt d|du r$tjd }g }g }| j\zjW n	 ty;   Y nw tzjW n	 tyN   Y nw t| j\}}t||D ],\}	}
|	\}}|
\}}|	fdd	t
||d
D  |	fdd	t
||d
D  q]h d |D ]}t fdd	|D st d| dq|D ]}t fdd	|D st d| dq|dkrt||||S |dkrt||||S |dkrt|||S |dkrt|||S t d| d)ac  Calculate dN and dS of the given two sequences.

    Available methods:
        - NG86  - `Nei and Gojobori (1986)`_ (PMID 3444411).
        - LWL85 - `Li et al. (1985)`_ (PMID 3916709).
        - ML    - `Goldman and Yang (1994)`_ (PMID 7968486).
        - YN00  - `Yang and Nielsen (2000)`_ (PMID 10666704).

    .. _`Nei and Gojobori (1986)`: http://www.ncbi.nlm.nih.gov/pubmed/3444411
    .. _`Li et al. (1985)`: http://www.ncbi.nlm.nih.gov/pubmed/3916709
    .. _`Goldman and Yang (1994)`: http://mbe.oxfordjournals.org/content/11/5/725
    .. _`Yang and Nielsen (2000)`: https://doi.org/10.1093/oxfordjournals.molbev.a026236

    Arguments:
     - k  - transition/transversion rate ratio
     - cfreq - Current codon frequency vector can only be specified
       when you are using ML method. Possible ways of
       getting cfreq are: F1x4, F3x4 and F61.

    NF3x4MLz8cfreq can only be specified when you are using ML method)F1x4r   F61z&cfreq must be 'F1x4', 'F3x4', or 'F61'r   c                 3        | ]} ||d   V  qdS    N .0i)	sequence1r   F/var/www/html/myenv/lib/python3.10/site-packages/Bio/Align/analysis.py	<genexpr>I       z"calculate_dn_ds.<locals>.<genexpr>r   c                 3   r   r   r   r   )	sequence2r   r   r   J   r      ACGTc                 3       | ]}| v V  qd S Nr   r   
nucleotidebasesr   r   r   M       zUnrecognized character in z8 in the target sequence (Codons consist of A, T, C or G)c                 3   r&   r'   r   r(   r*   r   r   r   S   r,   z7 in the query sequence (Codons consist of A, T, C or G)r   LWL85YN00zUnknown method '')
ValueErrorr   generic_by_id	sequencesseqAttributeErrorstralignedzipextendrangeall_ml_ng86_lwl85_yn00)	alignmentmethodcodon_tablekcfreqcodons1codons2aligned1aligned2block1block2start1end1start2end2codon1codon2r   )r+   r   r    r   calculate_dn_ds   sf   




 "

rP   c              	   C   s   t | ||d\}}t |||d\}}|| d }|| d }	ddg}
t| |D ]\}}dd t|
t|||dD }
q'|
d | }|
d |	 }|dk rXtd	tdd
|   }nd}|dk rntd	tdd
|   }||fS d}||fS )z$NG86 method main function (PRIVATE).)rA   rB          @r   c                 S      g | ]\}}|| qS r   r   r   mnr   r   r   
<listcomp>q       z_ng86.<locals>.<listcomp>rA   r   g      ?      UUUUUU?)_count_site_NG86r7   _count_diff_NG86absr   )rD   rE   rB   rA   S_sites1N_sites1S_sites2N_sites2S_sitesN_sitesSNrN   rO   pspndSdNr   r   r   r<   i   s*   r<   c                 C   s  d}d}d}d}d}| D ]}g g d}	| dd}t|D ]\\}
}|D ]U}||kr*q#||v rG||v rGt|}|||
< d|}|	d	 | q#||v rd||v rdt|}|||
< d|}|	d	 | q#t|}|||
< d|}|	d
 | q#q|j| }d }}|	d	 D ]}||jv r||7 }q|j| |kr||7 }q||7 }q|	d
 D ]}||jv r|d7 }q|j| |kr|d7 }q|d7 }q|| d }||| 7 }||| 7 }q||fS )zCount synonymous and non-synonymous sites of a list of codons (PRIVATE).

    Arguments:
     - codons - A list of three letter codons.
     - k - transition/transversion rate ratio.

    r   r"   r$   r%   r#   r"   r%   r#   r$   )
transitiontransversionUr%    rm   rn   r   r   )replace	enumeratelistjoinappendforward_tablestop_codons)codonsrA   rB   S_siteN_sitepurine
pyrimidiner+   codonneighbor_codonr   r)   basecodon_chars
this_codonaathis_codon_N_sitethis_codon_S_siteneighbor
norm_constr   r   r   r\      sZ   












r\   c              
   C   s  ddg}| |kr
|S dd t t| |D }ddd}t|dkr2dd t||| ||dD }|S t|d	krq|D ]4}| d
| ||  | |d d
  }dd t||| ||ddD }dd t|||||ddD }q:|S t|dkrttg dd}g }	|D ]`\}
}}| d
|
 ||
  | |
d d
  }|d
| ||  ||d d
  }|	||f dd t||| ||ddD }dd t|||||ddD }dd t|||||ddD }q|S )zCount differences between two codons, three-letter string (PRIVATE).

    The function will take multiple pathways from codon1 to codon2
    into account.
    r   c                 S       g | ]\}\}}||kr|qS r   r   r   r   nucleotide1nucleotide2r   r   r   rV      
    
z$_count_diff_NG86.<locals>.<listcomp>r   c                 S   sF   d }}t tt|jj| |gdkr||7 }||fS ||7 }||fS )z4Compare two codon accounting for different pathways.r   r   )lensetmaprv   get)rN   rO   rA   weightsdndr   r   r   compare_codon   s   z'_count_diff_NG86.<locals>.compare_codonc                 S   rR   r   r   r   r   jr   r   r   rV      rW   rX      Nc                 S   rR   r   r   r   r   r   r   rV      rW         ?)rA   r   c                 S   rR   r   r   r   r   r   r   rV      rW   r   r   r   r   c                 S   rR   r   r   r   r   r   r   rV      rW   gUUUUUU?r   c                 S   rR   r   r   r   r   r   r   rV     rW   c                 S   rR   r   r   r   r   r   r   rV     rW   r   )rr   r7   r   rs   r   ru   )rN   rO   rA   re   diff_posr   r   
temp_codonpaths	tmp_codonindex1index2index3tmp1tmp2r   r   r   r]      sr   
	4$	"$$r]   c              	   C   s  t |}ddg}ddg}ddg}| | D ]1}|| }|D ](}	|	dkr+|d  d7  < q|	dkr8|d  d7  < q|	dkrD|d  d7  < qqt|d t|d t|d g}
dgd }t| |D ]\}}||krjqadd	 t|t|||d
D }qadd	 t||
d D }|dd }|dd }dd	 t||D }dd	 |D }d|
d |d  |
d |d |d     |
d d|
d    }d|
d |d  |
d |d |d     d|
d  d|
d    }||fS )zlLWL85 method main function (PRIVATE).

    Nomenclature is according to Li et al. (1985), PMID 3916709.
    r   0r   24rQ      c                 S   rR   r   r   r   r   r   r   rV   /  rW   z_lwl85.<locals>.<listcomp>)	fold_dictc                 S   s   g | ]\}}|| qS r   r   r   r   r   r   rV   3  s    r   Nr   c              	   S   sD   g | ]\}}d t ddd|  |   dt ddd|     qS )r         ?r   r   g      ?r
   r   r   r   r   rV   6  s    2c                 S   s$   g | ]}d t ddd|    qS )r   r   r   r   r
   r   r   r   r   rV   :  s   $ )_get_codon_foldsumr7   _diff_codon)rD   rE   rA   codon_fold_dictfold0fold2fold4r}   fold_numfLPQrN   rO   PQr"   Brh   ri   r   r   r   r=     sB   "
@Dr=   c                 C   s
  i }| j }h d}|D ]w}d|v rqd}t|}t|D ]a\}}|t| }	g }
|	D ] }|||< z|
|d|  W q* tyJ   |
d Y q*w |
|| dkrY|d7 }n |
|| dv rg|d7 }n|
|| d	kru|d
7 }ntd|||< q|||< q|S )zFClassify different position in a codon into different folds (PRIVATE).r!   ro   rp   stopr   r   )r   r   r   r   r   z3Unknown Error, cannot assign the position to a fold)	rv   rs   rr   r   ru   rt   KeyErrorcountRuntimeError)rA   
fold_tablerv   r+   r}   foldcodon_base_lstr   r   other_basesr   
other_baser   r   r   r   @  s<   




r   c                 C   sr  d } } } } }}||  }	d}
d}t t| |D ]\}\}}||kr&q||
v rW||
v rW|	| dkr9|d7 }q|	| dkrD|d7 }q|	| dkrO|d7 }qtd|	|  ||v r||v r|	| dkrj|d7 }q|	| dkru|d7 }q|	| dkr|d7 }qtd|	|  |	| dkr|d7 }q|	| dkr|d7 }q|	| dkr|d7 }qtd|	|  ||||||fS )	zCount number of different substitution types between two codons (PRIVATE).

    returns tuple (P0, P2, P4, Q0, Q2, Q4)

    Nomenclature is according to Li et al. (1958), PMID 3916709.
    r   rj   rk   r   r   r   r   zUnexpected fold_num %d)rr   r7   r   )rN   rO   r   P0P2P4Q0Q2Q4r   r{   r|   rU   r   r   r   r   r   r   b  s>   








r   c           /   	      s  ddl m} dddddddddddddddg}t|}tt}tt}| | D ]P}|d |d   d7  < |d |d   d7  < |d |d   d7  < || }	t|	D ]!\}
}|dkrk|||
   d7  < qX|dkry|||
   d7  < qXq*t| }t| }t||D ]\}
}||
 | ||
< ||
 | ||
< qt	| ||d}t
||t
||f}||d  ||d   ||  }td	D ]}
t||
  fd
d||
  D ||
< qtt}t|j |j D ]
}d|vrd||< q| | D ]
}||  d7  < qt| ||||d\}}}t|| |||d\}}}|| d }|| d }ddddddddddg}tdD ]}
dD ]}||
 | ||
 |  d ||
 |< qBq>ddg}t| |D ]\}}dd t|t|||dD }qc|d | } |d | }!t|||  }"tdd|!  tdd|    }#dtdd|"   }$dddg}%tdD ]}&dd t|j |j D }'t|||#|'|}(||(|$ })g d}tt| |}*|* D ]\\}+}, t|+|,|)|'|}- fddt||-D }q|d | |d | f|d | |d	 | ff}g }.t||D ]\}}-|.t
||-dd q|.d d	 | ||  |.d d	 | ||   }$|.d |.d  }#tfddt|.|%D rb|.d |.d f  S |.}%qdS )zsYN00 method main function (PRIVATE).

    Nomenclature is according to Yang and Nielsen (2000), PMID 10666704.
    r   expmr"   r$   r#   r%   r   r   r   r   rX   r   c                       i | ]	\}}||  qS r   r   r   r   rB   totr   r   
<dictcomp>      z_yn00.<locals>.<dictcomp>ro   )rB   rA   rl   c                 S   rR   r   r   rS   r   r   r   rV     rW   z_yn00.<locals>.<listcomp>rZ   rY   h㈵>   c                 S      g | ]}d |vr|qS ro   r   r   r}   r   r   r   rV     s
    r   r   r   r   c                    s   g | ]
\}}||   qS r   r   rS   )r   r   r   rV         T)tc                 3   s$    | ]\}}t ||  k V  qd S r'   )r^   r   )	tolerancer   r   r     s   " z_yn00.<locals>.<genexpr>N)scipy.linalgr   r   r   intrr   r   valuesr7   _get_TV_get_kappa_tr9   itemsrs   rv   keysrw   _count_site_YN00r]   r   _get_Qr   _count_diff_YN00ru   r:   )/rD   rE   rA   r   fcodonr   	fold0_cnt	fold4_cntr}   r   r   r   f0_totalf4_totalr   TVk04kappapir_   r`   bfreqSN1ra   rb   bfreqSN2rd   rc   bfreqSNr   re   rN   rO   rf   rg   pwr   dSdN_pretemprx   r   r   codon_npathr   r   tvdSdNr   )r   r   r   r   r>     s     

(
 0$r>   c                 C   s   d}d}ddg}d}t | |D ]A\}}t ||D ]7\}	}
|	|
kr!n*|	|v r2|
|v r2|d  d7  < n|	|v rC|
|v rC|d  d7  < n|d  d7  < |d7 }qq|d | |d | fS )zGet TV (PRIVATE).

    Arguments:
     - T - proportions of transitional differences
     - V - proportions of transversional differences

    rj   )r#   r%   r   r   )r7   )rD   rE   rA   r{   r|   r   sitesrN   rO   r   r   r   r   r   r      s    

r   Fc           	      C   s  | d | d  | d< | d | d  | d< d| d | d  | d | d    d| d | d  | d  | d  | d | d  | d  | d    d|d d| d  | d      |d	  d| d | d  | d  | d | d  | d     }d|d d| d  | d    }d
t | }d
t | }|| d }|du rd| d | d  | d  | d | d  | d   | | d | d  | d | d     }|S d| d  | d  d|| d    d| d  | d  d|| d     d| d  | d   | }|S )zmCalculate kappa (PRIVATE).

    The following formula and variable names are according to PMID: 10666704
    r%   r#   Yr"   r$   Rr   r   r   g      F   r
   )	r   r   r   r"   r   abkappaF84
kappaHKY85r   r   r   r     sF   "2 
.""r   c                 C   s  t | }|t |ksJ d}d}d}|j}	|j}
tt| |}d }}ddddddddddg}| D ]\}}|d }d }}tdD ]m}|D ]h}|| |krRqI|d| | ||d d  }||
v rgqI|| }|| |v rz||v rz||9 }n|| |v r||v r||9 }|	| |	| kr||7 }|d |  || 7  < qI||7 }|d |  || 7  < qIqE||| 7 }||| 7 }q5d| ||  }||9 }||9 }|D ]}t| }|D ]
}||  |  < qq|||fS )a  Site counting method from Ina / Yang and Nielsen (PRIVATE).

    Method from `Ina (1995)`_ as modified by `Yang and Nielsen (2000)`_.
    This will return the total number of synonymous and nonsynonymous sites
    and base frequencies in each category. The function is equivalent to
    the ``CountSites()`` function in ``yn00.c`` of PAML.

    .. _`Ina (1995)`: https://doi.org/10.1007/BF00167113
    .. _`Yang and Nielsen (2000)`: https://doi.org/10.1093/oxfordjournals.molbev.a026236

    rj   rk   rl   r   r   Nr   )	r   rv   rw   r   r7   r   r9   r   r   )rD   rE   r   rB   rA   lengthr{   r|   r+   
codon_dictr   r   rc   rd   freqSN
codon_pairnpathr}   SNposr   r~   r   r   r   r   r   r   r   r   >  sX    

r   c                    sF  g d} kr
|S dd t t D }ddd}t|dkr4dd t|| |d |D }|S t|d	krŇ fd
d|D }g |D ],}	tt|j |	g}
||
d |
d f ||
d |
d	 f f}|d |d   qHfddD t |D ]@\}} d| |   |d d  }	dd t|| |	||| d	 dD }dd t|| |	||| d	 dD }q|S t|dkrttg dd}g g }|D ]h\}}} d| |   |d d  }|d| |  ||d d  }|||f tt|j ||g}
||
d |
d f ||
d |
d	 f ||
d	 |
d f f}|d |d  |d	   qۇfddD t||D ]M\}	}}dd t|| |	d |d ||d dD }dd t|||	d |	d |d ||d dD }dd t|||	d |d ||d dD }qS|S )a&  Count differences between two codons (three-letter string; PRIVATE).

    The function will weighted multiple pathways from codon1 to codon2
    according to P matrix of codon substitution. The proportion
    of transition and transversion (TV) will also be calculated in
    the function.
    r   c                 S   r   r   r   r   r   r   r   rV     r   z$_count_diff_YN00.<locals>.<listcomp>r   c           	      S   s0  d}d}|j }|j}| |v s||v r<| | |v r$|| |v r$dd|dgS | | |v r6|| |v r6dd|dgS ddd|gS ||  || krn| | |v rV|| |v rV|dddgS | | |v rh|| |v rh|dddgS d|ddgS | | |v r|| |v rdd|dgS | | |v r|| |v rdd|dgS ddd|gS )Nrj   rk   r   )rv   rw   )	rN   rO   diffrA   r   r{   r|   dicr   r   r   r   count_TV  s*   z"_count_diff_YN00.<locals>.count_TVc                 S   rR   r   r   r   r   qr   r   r   rV     rW   r   r   c                    s0   g | ]} d | |   |d d   qS )Nr   r   r   )rN   rO   r   r   rV     s   0 c                       g | ]
}d | t   qS )r   r   r   	path_probr   r   rV     r   Nc                 S   rR   r   r   r	  r   r   r   rV     rW   r   c                 S   rR   r   r   r	  r   r   r   rV     rW   r   r   c                    r  )r   r  r   r  r   r   rV     r   c                 S   rR   r   r   r	  r   r   r   rV     rW   c                 S   rR   r   r   r	  r   r   r   rV     rW   c                 S   rR   r   r   r	  r   r   r   rV     rW   r   )rr   r7   r   rs   r   indexru   r   )rN   rO   r   rx   rA   r   r   r  
tmp_codonsr}   	codon_idxprobrU   r   r   r   r   r   r   r   r   rB   r   )rN   rO   r  r   r   x  s   
D($	-$$  
r   c              
   C   s2  ddl m} t| |||d}tt| |}dd t|j |j D }||||fdd}||g dd	d
dd}	|	j	\}
}}t
|||||}d }}t|D ]@\}}t|D ]7\}}||krz%|j| |j| krv||| |||f  7 }n||| |||f  7 }W qV ty   Y qVw qVqN||
9 }||
9 }||||fdd}||ddgd	ddd}	|	j	\}
}d}t
|||||}d }}t|D ]B\}}t|D ]9\}}||krz%|j| |j| kr||| |||f  7 }n||| |||f  7 }W q ty   Y qw qq|d9 }|d9 }|| }|| }||fS )z"ML method main function (PRIVATE).r   )minimizerX   c                 S   r   r   r   r   r   r   r   rV     
    z_ml.<locals>.<listcomp>c              	   S   s$   t | d | d | d ||||d S )z'Temporary function, params = [t, k, w].r   r   r   rx   rA   _likelihood_funcparamsr   	codon_cntrx   rA   r   r   r   func  s   z_ml.<locals>.func)r   皙?r   zL-BFGS-B)绽|=r   r  )r  
   r   )r@   boundstolc              	   S   s    t | d | d d||||d S )z5Temporary function, params = [t, k]. w is fixed to 1.r   r   r   r  r  r  r   r   r   func_w12  s   z_ml.<locals>.func_w1r   r  )r  r  r   r   )scipy.optimizer  _get_pir   r7   rs   rv   r   rw   xr   rr   r   )rD   rE   cmethodrA   r  r   r  rx   r  opt_resr   rB   r   r   SdNdr   rN   r   rO   r"  rhoSrhoNri   rh   r   r   r   r;     s   



r;   c                    s  i }|dkrGt dd | | D }t|   fdd| D }|j |j D ]}d|vrD||d  ||d   ||d	   ||< q*|S |d
krdddddddddddddddg}| | D ]&}|d |d   d7  < |d |d   d7  < |d	 |d	   d7  < qctdD ]}t||    fdd||  D ||< qt|j |j D ] }d|vr|d |d  |d |d   |d	 |d	   ||< q|S |dkr|j |j D ]
}d|vrd||< q| | D ]
}||  d7  < qt|   fdd| D }|S )zObtain codon frequency dict (pi) from two codon list (PRIVATE).

    This function is designed for ML method. Available counting methods
    (cfreq) are F1x4, F3x4 and F64.
    r   c                 s   s    | ]
}|D ]}|V  qqd S r'   r   )r   r}   r)   r   r   r   r   m  s    z_get_pi.<locals>.<genexpr>c                    r   r   r   r   r   r   r   r   q  r   z_get_pi.<locals>.<dictcomp>ro   r   r   r   r   r   r   c                    r   r   r   r   r   r   r   r     r   r   r  c                    r   r   r   r   r   r   r   r     r   )	r   r   r   r   rv   r   rw   r9   rs   )rD   rE   r&  rA   r   r   r}   r   r   r   r   r$  b  sP   	( .
r$  c           	      C   sH  | |krdS | |j v s||j v rdS | |vs||vrdS d}d}dd tt| |D }t|dkr4dS |j|  |j| krn|d d |v rT|d d |v rT|||  S |d d |v rj|d d |v rj|||  S || S |d d |v r|d d |v r|| ||  S |d d |v r|d d |v r|| ||  S |||  S )aM  Q matrix for codon substitution (PRIVATE).

    Arguments:
     - codon1, codon2  : three letter codon string
     - pi              : expected codon frequency
     - k               : transition/transversion ratio
     - w               : nonsynonymous/synonymous rate ratio
     - codon_table     : Bio.Data.CodonTable object

    r   rj   rk   c                 S   s&   g | ]\}\}}||kr|||fqS r   r   r   r   r   r   rV     s
    
z_q.<locals>.<listcomp>r   r   )rw   rr   r7   r   rv   )	rN   rO   r   rB   r   rA   r{   r|   r  r   r   r   _q  s0       r,  c              
   C   s   t |}t||f}t|D ]\}}t|D ]\}	}
||	kr-t||
| |||d|||	f< qqd}t|D ]+\}}t||ddf  |||f< z|| | |||f   7 }W q5 ty`   Y q5w || }|S )z*Q matrix for codon substitution (PRIVATE).rX   r   N)r   npzerosrr   r,  r   r   )r   rB   r   rx   rA   	codon_numr   i1rN   i2rO   nucl_substitutionsr   r}   r   r   r   r     s$   r   c              	   C   s   ddl m} t|||||}|||  }	d}
t|D ]>\}}t|D ]5\}}||f|v rW|	||f ||  dkrC|
|||f d 7 }
q"|
|||f t|| |	||f   7 }
q"q|
S )z,Likelihood function for ML method (PRIVATE).r   r   )r   r   r   rr   r   )r   rB   r   r   r  rx   rA   r   r   r   
likelihoodr   rN   r   rO   r   r   r   r    s   r  c                 C   s  ddl m} |du rtjd }| j}| j}dd |D }t|}g }g }	t|D ]O}
|g  |	g  t|
D ]0}||
 || g}||
|fddf }t	||}t
|||d\}}||
 | |	|
 | q8||
 d |	|
 d q(|||d	}|||	d	}||fS )
zCalculate dN and dS pairwise for the multiple alignment, and return as matrices.

    Argument:
     - method       - Available methods include NG86, LWL85, YN00 and ML.
     - codon_table  - Codon table to use for forward translation.

    r   )DistanceMatrixNr   c                 S   s   g | ]}|j qS r   )id)r   recordr   r   r   rV         z*calculate_dn_ds_matrix.<locals>.<listcomp>)r@   rA   g        )matrix)Bio.Phylo.TreeConstructionr4  r   r1   r2   coordinatesr   r9   ru   r   rP   )r?   r@   rA   r4  r2   r:  namessize	dn_matrix	ds_matrixr   r   pairwise_sequencespairwise_coordinatespairwise_alignmentdndsdn_dmds_dmr   r   r   calculate_dn_ds_matrix  s4   




rF  c              	   C   s  |du r	t jd }t|d\}}t|}g }| jD ]}z|j}W n	 ty)   Y nw t|}|| qd\}}	}
}t	j
}| j D ]~}t|| }td|dD ]m}dd |D }t|||D ]\}}}||| || d  }|| | q_d	}t }| D ]}t|}t|dkrd
}|| qt|dkrqPt||}t||| }|d	u r|	|7 }	||7 }qP||7 }|
|7 }
qP|}qBt||	|
|gS )a4  McDonald-Kreitman test for neutrality.

    Implement the McDonald-Kreitman test for neutrality (PMID: 1904993)
    This method counts changes rather than sites
    (http://mkt.uab.es/mkt/help_mkt.asp).

    Arguments:
     - alignment    - Alignment of gene nucleotide sequences to compare.
     - species      - List of the species ID for each sequence in the alignment.
       Typically, the species ID is the species name as a string, or an integer.
     - codon_table  - Codon table to use for forward translation.

    Return the p-value of test result.
    Nr   rX   r   r   r   c                 S   s   i | ]}|g qS r   r   )r   keyr   r   r   r   1  r7  zmktest.<locals>.<dictcomp>TF)r   r1   _get_codon2codon_matrixr   r2   r3   r4   r5   ru   sysmaxsizer:  	transposeminr9   r7   r   r   update_count_replacement_G_test)r?   speciesrA   r$   nonsyn_Gunique_speciesr2   sequencesyn_fix
nonsyn_fixsyn_polynonsyn_polystartsendsstepr   rx   rG  startr}   fixed
all_codonsvaluenonsynsynr   r   r   mktest  sP   





ra  c                 C   sr  d}dd t | j | j D }| j }| jD ]}d||< qt|}i }i }i }i }	t|D ]M\}
}i ||< i |	|< tdD ]<}|D ]7}|d| | ||d d  }|| || krkd|	| |< d|| |< qD||kr{d	|	| |< d|| |< qDq@q0|D ]4}i ||< i ||< |D ]'}||krd|| |< d|| |< qt|	|||| |< t||||| |< qq||fS )
zGet codon codon substitution matrix (PRIVATE).

    Elements in the matrix are number of synonymous and nonsynonymous
    substitutions required for the substitution.
    rl   c                 S   r   r   r   r   r   r   r   rV   S  r  z+_get_codon2codon_matrix.<locals>.<listcomp>r   r   r   r   Nr  )	rs   rv   r   rw   copyr   rr   r9   	_dijkstra)rA   r+   rx   r   r   numr$   rQ  graphgraph_nonsynr   r}   r   r   r   rN   rO   r   r   r   rH  L  sL   


 
rH  c                 C   sl  i }i }|   D ]
}d||< d||< qd||< t|   }t|dkrtd}d}|D ]}|du r6|| }|}q)|| |k rB|| }|}q)|| | |  D ]\}	}
||	 || |
 krh|| |
 ||	< |||	< qN||krnnt|dks#g }|}d}||ks||dkr|d| || }nn||kr~|d| tt|d D ]}|| ||  ||d   7 }q|S )a  Dijkstra's algorithm Python implementation (PRIVATE).

    Algorithm adapted from
    http://thomas.pelletier.im/2010/02/dijkstras-algorithm-python-implementation/.
    However, an obvious bug in::

        if D[child_node] >(<) D[node] + child_value:

    is fixed.
    This function will return the distance between start and end.

    Arguments:
     - graph: Dictionary of dictionary (keys are vertices).
     - start: Start vertex.
     - end: End vertex.

    Output:
       List of vertices from the beginning to the end.

    d   rp   r   Nr   )r   rs   r   remover   r   insertr9   )re  r[  endDr   nodeunseen_nodesshortest	temp_node
child_nodechild_valuepathdistancer   r   r   r   rc  |  sP   


rc  c                    sX   t dkrdS t dkrtt d  d  S  fddD }t|S )z9Count replacement needed for a given codon_set (PRIVATE).r   )r   r   r   r   c                    s"   i | ]   fd dD qS )c                    s"   i | ]}|kr|  | qS r   r   )r   rO   )r$   rN   r   r   r     s   " z1_count_replacement.<locals>.<dictcomp>.<dictcomp>r   )r   r$   rx   )rN   r   r     s    z&_count_replacement.<locals>.<dictcomp>)r   rs   r	   _prim)rx   r$   subgraphr   rt  r   rN    s   rN  c                 C   s\  g }g }|   D ]0}|| | | D ]$}||| | | f|vr7||| | | f|vr7|||| | | f qqtt}|D ]\}}}|| |||f || |||f q?g }	t|d }
||d  dd }t| |rt|\}}}||
vr|
| |	|||f || D ]}|d |
vrt|| q|sqd}|	D ]
}|t	|d 7 }q|S )zPrim's algorithm to find minimum spanning tree (PRIVATE).

    Code is adapted from
    http://programmingpraxis.com/2010/04/09/minimum-spanning-tree-prims-algorithm/
    r   Nr   )
r   ru   r   rs   r   r   r   addr   r	   )r$   nodesedgesr   r   connn1n2cmstusedusable_edgescoster   r   r   r   r   ru    s>   
,

ru  c           
      C   s   d}t | }| d | d  }| d | d  }t | dd }t | dd }|| | || | || | || | g}t| |D ]\}}	||t||	  7 }qAtt|S )zG test for 2x2 contingency table (PRIVATE).

    Arguments:
     - site_counts - [syn_fix, nonsyn_fix, syn_poly, nonsyn_poly]

    >>> print("%0.6f" % _G_test([17, 7, 42, 2]))
    0.004924
    r   r   r   r   N)r   r7   r   r   r   )
site_countsr$   r   tot_syntot_nontot_fixtot_polyexpobsexr   r   r   rO    s   



rO  __main__)run_doctest)r   Nr   Nr   )F)r   N)NN)1__doc__rI  collectionsr   r   heapqr   r   r   	itertoolsr   mathr   r	   r   r   numpyr-  	Bio.Alignr   Bio.Datar   rP   r<   r\   r]   r=   r   r   r>   r   r   r   r   r;   r$  r,  r   r  rF  ra  rH  rc  rN  ru  rO  __name__
Bio._utilsr  r   r   r   r   <module>   sX   
N
;V*"3k
#: j13

%;0D#
