o
    Rŀgd                     @   s  d Z ddlZddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlm	Z	 dd	l
mZ dd
lmZ ddlmZ ddlmZ ddlmZ i dddddddddddddddddddddddddddd d!d d"dd#dZdBd%d&Zd'd( ZdCd*d+ZdDd.d/Zd0d1 ZdEd3d4ZdFd5d6Z	8dGd9d:ZdHd<d=ZG d>d? d?eZed@krddAlmZ e  dS dS )Iz3Miscellaneous functions for dealing with sequences.    N)cos)exp)log)pi)sin)	IUPACData)standard_dna_table)
complement)complement_rna)Seq)	translateG      ?CA        TUSWM      ?RYKVgUUUUUU?BHgUUUUUU?DXNremovec                    s   |dvrt d| dt fdddD }|dkr)|t fddd	D  }nt }|d
kr>|t fdddD 7 }|dkrDdS || S )a  Calculate G+C percentage in seq (float between 0 and 1).

    Copes with mixed case sequences. Ambiguous Nucleotides in this context are
    those different from ATCGSWU (S is G or C, and W is A or T).

    If ambiguous equals "remove" (default), will only count GCS and will only
    include ACTGSWU when calculating the sequence length. Equivalent to removing
    all characters in the set BDHKMNRVXY before calculating the GC content, as
    each of these ambiguous nucleotides can either be in (A,T) or in (C,G).

    If ambiguous equals "ignore", it will treat only unambiguous nucleotides (GCS)
    as counting towards the GC percentage, but will include all ambiguous and
    unambiguous nucleotides when calculating the sequence length.

    If ambiguous equals "weighted", will use a "mean" value when counting the
    ambiguous characters, for example, G and C will be counted as 1, N and X will
    be counted as 0.5, D will be counted as 0.33 etc. See Bio.SeqUtils._gc_values
    for a full list.

    Will raise a ValueError for any other value of the ambiguous parameter.


    >>> from Bio.SeqUtils import gc_fraction
    >>> seq = "ACTG"
    >>> print(f"GC content of {seq} : {gc_fraction(seq):.2f}")
    GC content of ACTG : 0.50

    Example with an RNA sequence:

    >>> seq = "GGAUCUUCGGAUCU"
    >>> print(f"GC content of {seq} : {gc_fraction(seq):.2f}")
    GC content of GGAUCUUCGGAUCU : 0.50

    S and W are ambiguous for the purposes of calculating the GC content.

    >>> seq = "ACTGSSSS"
    >>> gc = gc_fraction(seq, "remove")
    >>> print(f"GC content of {seq} : {gc:.2f}")
    GC content of ACTGSSSS : 0.75
    >>> gc = gc_fraction(seq, "ignore")
    >>> print(f"GC content of {seq} : {gc:.2f}")
    GC content of ACTGSSSS : 0.75
    >>> gc = gc_fraction(seq, "weighted")
    >>> print(f"GC content with ambiguous counting: {gc:.2f}")
    GC content with ambiguous counting: 0.75

    Some examples with ambiguous nucleotides.

    >>> seq = "ACTGN"
    >>> gc = gc_fraction(seq, "ignore")
    >>> print(f"GC content of {seq} : {gc:.2f}")
    GC content of ACTGN : 0.40
    >>> gc = gc_fraction(seq, "weighted")
    >>> print(f"GC content with ambiguous counting: {gc:.2f}")
    GC content with ambiguous counting: 0.50
    >>> gc = gc_fraction(seq, "remove")
    >>> print(f"GC content with ambiguous removing: {gc:.2f}")
    GC content with ambiguous removing: 0.50

    Ambiguous nucleotides are also removed from the length of the sequence.

    >>> seq = "GDVV"
    >>> gc = gc_fraction(seq, "ignore")
    >>> print(f"GC content of {seq} : {gc:.2f}")
    GC content of GDVV : 0.25
    >>> gc = gc_fraction(seq, "weighted")
    >>> print(f"GC content with ambiguous counting: {gc:.4f}")
    GC content with ambiguous counting: 0.6667
    >>> gc = gc_fraction(seq, "remove")
    >>> print(f"GC content with ambiguous removing: {gc:.2f}")
    GC content with ambiguous removing: 1.00


    Note that this will return zero for an empty sequence.
    )weightedr!   ignorezambiguous value 'z' not recognizedc                 3       | ]}  |V  qd S Ncount.0xseq I/var/www/html/myenv/lib/python3.10/site-packages/Bio/SeqUtils/__init__.py	<genexpr>       zgc_fraction.<locals>.<genexpr>CGScgsr!   c                 3   r$   r%   r&   r(   r+   r-   r.   r/      r0   ATWUatwur"   c                 3   s0    | ]}  |  |  t|  V  qd S r%   )r'   lower
_gc_valuesr(   r+   r-   r.   r/      s     

BDHKMNRVXYr   )
ValueErrorsumlen)r,   	ambiguousgclengthr-   r+   r.   gc_fraction5   s   L
r<   c           
   	   C   sr  i }dD ]}g d||< qt dt| dD ]8}| ||d  }t|dk r)|d7 }t dD ]}dD ]}|| |ksA|| | krK|| |  d7  < q1q-qi }d}d}t dD ]N}z,|d | |d |  |d	 |  |d
 |  }	|d | |d |  d |	 ||< W n ty   d||< Y nw ||d |  |d |  }||	 }qXd| | }||d |d |d fS )a  Calculate G+C content: total, for first, second and third positions.

    Returns a tuple of four floats (percentages between 0 and 100) for the
    entire sequence, and the three codon positions.  e.g.

    >>> from Bio.SeqUtils import GC123
    >>> GC123("ACTGTN")
    (40.0, 50.0, 50.0, 0.0)

    Copes with mixed case sequences, but does NOT deal with ambiguous
    nucleotides.
    )r   r   r   r   )r   r   r   r           r   r   r   r   g      Y@   )ranger8   r3   	Exception)
r,   dnticodonposr:   gcallnallnr-   r-   r.   GC123   s8   0(
rK   d   c              	   C   s   g }t dt| |D ]9}| |||  }|d|d }|d|d }z
|| ||  }W n ty=   d}Y nw || q
|S )a:  Calculate GC skew (G-C)/(G+C) for multiple windows along the sequence.

    Returns a list of ratios (floats), controlled by the length of the sequence
    and the size of the window.

    Returns 0 for windows without any G/C by handling zero division errors.

    Does NOT look at any ambiguous nucleotides.
    r   r   gr   cr   )rA   r8   r'   ZeroDivisionErrorappend)r,   windowvaluesrE   srM   rN   skewr-   r-   r.   GC_skew   s   rU     ,  c                 C   s  ddl }|j|jd}|j|jd}|j|j|jdd}	|	 }
|
d |j|	j	d |j|	j
d |j|j|jd |j|j|jd |	j|j|jd	d
 |	  || || }}|| || || || f\}}}}|}|	j||d| dd | dd t| f d |d7 }|	j||dt| ddd |d7 }|	j||ddd |d7 }|	j||ddd |d7 }|	|||| d}d}t| |D ]~}|}||7 }tdt | t|   }|||  }||t|  }||t|  }||t|  }||t|  }|	j||||dd |d }|| }||t|  }||t|  }||t|  }||t|  }|	j||||dd |	  ||7 }q|	j|	|jd dS )zACalculate and plot normal and accumulated GC skew (GRAPHICS !!!).r   N)orientwhite)yscrollcommandxscrollcommand
background700x700)command)sidefillr?   )r`   r_   expandz%s...%s (%d nt)   i)text   zGC z3.2f%zGC Skewblue)rc   r`   zAccumulated GC Skewmagentar@   )r`   2   )scrollregion)tkinter	ScrollbarVERTICAL
HORIZONTALCanvassetwinfo_toplevelgeometryconfigyviewxviewpackRIGHTr   BOTTOMr   BOTHLEFTupdatecreate_textr8   r<   create_ovalrU   r   r   r   create_line	configurebboxALL)r,   rQ   zoomrpxpyrj   yscrollxscrollcanvaswinX0Y0x1x2y1y2tyaccstartr:   r1alphar2r-   r-   r.   xGC_skew   s`   

$.
r   c           	      C   s   d}|D ]}t j| }t|dkr||7 }q|d| d7 }qd}|g}	 |d7 }| |d }t||}|s:	 |S |t|d7 }|| q%)	zSearch for a DNA subseq in seq, return list of [subseq, positions].

    Use ambiguous values (like N = A or T or C or G, R = A or G etc.),
    searches only on forward strand.
     r?   []TNr   )r   ambiguous_dna_valuesr8   researchintr   rP   )	r,   subseqpatternrD   valuerG   resultrS   mr-   r-   r.   	nt_search  s$   


r   Xaac                    sH   |du rddi}t ttj t|   d fdd| D S )a  Convert protein sequence from one-letter to three-letter code.

    The single required input argument 'seq' should be a protein sequence using
    single letter codes, either as a Python string or as a Seq or MutableSeq
    object.

    This function returns the amino acid sequence as a string using the three
    letter amino acid codes. Output follows the IUPAC standard (including
    ambiguous characters B for "Asx", J for "Xle" and X for "Xaa", and also U
    for "Sel" and O for "Pyl") plus "Ter" for a terminator given as an
    asterisk. Any unknown character (including possible gap characters),
    is changed into 'Xaa' by default.

    e.g.

    >>> from Bio.SeqUtils import seq3
    >>> seq3("MAIVMGRWKGAR*")
    'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer'

    You can set a custom translation of the codon termination code using the
    dictionary "custom_map" argument (which defaults to {'*': 'Ter'}), e.g.

    >>> seq3("MAIVMGRWKGAR*", custom_map={"*": "***"})
    'MetAlaIleValMetGlyArgTrpLysGlyAlaArg***'

    You can also set a custom translation for non-amino acid characters, such
    as '-', using the "undef_code" argument, e.g.

    >>> seq3("MAIVMGRWKGA--R*", undef_code='---')
    'MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer'

    If not given, "undef_code" defaults to "Xaa", e.g.

    >>> seq3("MAIVMGRWKGA--R*")
    'MetAlaIleValMetGlyArgTrpLysGlyAlaXaaXaaArgTer'

    This function was inspired by BioPerl's seq3.
    N*Terr   c                 3   s    | ]	}  |V  qd S r%   )getr)   aa	threecode
undef_coder-   r.   r/   e  s    zseq3.<locals>.<genexpr>)dictlistr   protein_letters_1to3_extendeditemsjoin)r,   
custom_mapr   r-   r   r.   seq35  s   'r   c                    st   |du rddi}dd t j D   dd | D  fdd	ttd
 D }d fdd|D S )a  Convert protein sequence from three-letter to one-letter code.

    The single required input argument 'seq' should be a protein sequence
    using three-letter codes, either as a Python string or as a Seq or
    MutableSeq object.

    This function returns the amino acid sequence as a string using the one
    letter amino acid codes. Output follows the IUPAC standard (including
    ambiguous characters "B" for "Asx", "J" for "Xle", "X" for "Xaa", "U" for
    "Sel", and "O" for "Pyl") plus "*" for a terminator given the "Ter" code.
    Any unknown character (including possible gap characters), is changed
    into '-' by default.

    e.g.

    >>> from Bio.SeqUtils import seq1
    >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer")
    'MAIVMGRWKGAR*'

    The input is case insensitive, e.g.

    >>> from Bio.SeqUtils import seq1
    >>> seq1("METalaIlEValMetGLYArgtRplysGlyAlaARGTer")
    'MAIVMGRWKGAR*'

    You can set a custom translation of the codon termination code using the
    dictionary "custom_map" argument (defaulting to {'Ter': '*'}), e.g.

    >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAla***", custom_map={"***": "*"})
    'MAIVMGRWKGA*'

    You can also set a custom translation for non-amino acid characters, such
    as '-', using the "undef_code" argument, e.g.

    >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer", undef_code='?')
    'MAIVMGRWKGA??R*'

    If not given, "undef_code" defaults to "X", e.g.

    >>> seq1("MetAlaIleValMetGlyArgTrpLysGlyAla------ArgTer")
    'MAIVMGRWKGAXXR*'

    Nr   r   c                 S   s   i | ]	\}}|  |qS r-   upperr)   kvr-   r-   r.   
<dictcomp>  s    zseq1.<locals>.<dictcomp>c                 s   s     | ]\}}|  |fV  qd S r%   r   r   r-   r-   r.   r/         zseq1.<locals>.<genexpr>c                    s$   g | ]} d | d |d   qS )r=   r?   r-   )r)   rE   r+   r-   r.   
<listcomp>  s   $ zseq1.<locals>.<listcomp>r=   r   c                 3   s     | ]}  | V  qd S r%   )r   r   r   )onecoder   r-   r.   r/     r   )r   protein_letters_3to1_extendedr   rz   rA   r8   r   )r,   r   r   seqlistr-   )r   r,   r   r.   seq1h  s   ,r   DNAFc              
      sz  z| j } W n	 ty   Y nw dt|   } |dkr(|r$tj n'tj n#|dkr6|r2tj	 ntj
 n|dkrD|r@tj ntj ntd||rPd}nd}zt fdd	| D t| d
 |  }|rl||8 }W n ty } ztd| d| dd}~ww |r|dkrtd|dkrt| } n|dkrt| } |t fdd	| D t| d
 |  7 }|r||8 }|S )a  Calculate the molecular mass of DNA, RNA or protein sequences as float.

    Only unambiguous letters are allowed. Nucleotide sequences are assumed to
    have a 5' phosphate.

    Arguments:
     - seq: string, Seq, or SeqRecord object.
     - seq_type: The default is to assume DNA; override this with a string
       "DNA", "RNA", or "protein".
     - double_stranded: Calculate the mass for the double stranded molecule?
     - circular: Is the molecule circular (has no ends)?
     - monoisotopic: Use the monoisotopic mass tables?

    >>> print("%0.2f" % molecular_weight("AGC"))
    949.61
    >>> print("%0.2f" % molecular_weight(Seq("AGC")))
    949.61

    However, it is better to be explicit - for example with strings:

    >>> print("%0.2f" % molecular_weight("AGC", "DNA"))
    949.61
    >>> print("%0.2f" % molecular_weight("AGC", "RNA"))
    997.61
    >>> print("%0.2f" % molecular_weight("AGC", "protein"))
    249.29

    r   r   RNAprotein/Allowed seq_types are DNA, RNA or protein, not gt{Ic2@gg2@c                 3       | ]} | V  qd S r%   r-   r(   weight_tabler-   r.   r/         z#molecular_weight.<locals>.<genexpr>r?   'z(' is not a valid unambiguous letter for Nz+protein sequences cannot be double-strandedc                 3   r   r%   r-   r(   r   r-   r.   r/     r   )r,   AttributeErrorr   strsplitr   r   $monoisotopic_unambiguous_dna_weightsunambiguous_dna_weights$monoisotopic_unambiguous_rna_weightsunambiguous_rna_weightsmonoisotopic_protein_weightsprotein_weightsr6   r7   r8   KeyErrorr	   r
   )r,   seq_typedouble_strandedcircularmonoisotopicwaterweighter-   r   r.   molecular_weight  s\   
&
*r   r?   c              	   C   s  ddl m} ddl m} d|  v r|| }n|| }|ddd }t| }i }tdD ].}d|| d  }	t| |||	  |||d < t||||	  |ddd ||d  < q,|d	kro| dd
  d| dd  }
n| }
d}dD ]}|d|| | f 7 }qudt	| dd }|d|
 ||f 7 }|}td|dD ]}| ||d  }|||d  }|d }|d|d |d d f 7 }|dd
|d ||d	   d 7 }|dd
|d ||d	   d 7 }|d
|d ||d	  d 7 }|| dt|  7 }|| d 7 }|d
|d ||d	  d 7 }|dd
|d ||d	   d 7 }|dd
|d ||d	   d 7 }q|S )a  Return pretty string showing the 6 frame translations and GC content.

    Nice looking 6 frame translation with GC content - code from xbbtools
    similar to DNA Striders six-frame translation

    >>> from Bio.SeqUtils import six_frame_translations
    >>> print(six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA"))
    GC_Frame: a:5 t:0 g:8 c:5
    Sequence: auggccauug ... gggccgcuga, 24 nt, 54.17 %GC
    <BLANKLINE>
    <BLANKLINE>
    1/1
      G  H  C  N  G  P  L
     W  P  L  *  W  A  A
    M  A  I  V  M  G  R  *
    auggccauuguaaugggccgcuga   54 %
    uaccgguaacauuacccggcgacu
    A  M  T  I  P  R  Q
     H  G  N  Y  H  A  A  S
      P  W  Q  L  P  G  S
    <BLANKLINE>
    <BLANKLINE>

    r   )reverse_complement)reverse_complement_rnauNr   r=   r?   rd   
   z ... iz	GC_Frame:)atrM   rN   z %s:%drL   r#   )r9   z#
Sequence: %s, %d nt, %0.2f %%GC


<   z%d/%d
r>   
 r@   z%5d %%
z

)Bio.Seqr   r   r3   r8   rA   r   r'   r   r<   r   r   )r,   genetic_coder   r   anticompr;   framesrE   fragment_lengthshortheaderrD   r:   resr   csubseqpr-   r-   r.   six_frame_translations  sN   
, &&""&(r   c                   @   s6   e Zd ZdZefddZdd Zddd	Zd
d ZdS )CodonAdaptationIndexzA codon adaptation index (CAI) implementation.

    Implements the codon adaptation index (CAI) described by Sharp and
    Li (Nucleic Acids Res. 1987 Feb 11;15(3):1281-95).
    c              
      sz  || _ dd |jD }|j D ]\}}|| | qtt| |jg }dd dD  | 	  |D ]V}z|j
}|j}W n tyK   d}Y nw | }tdt|dD ]3}	||	|	d  }z
 |  d7  < W qX ty   |du r~d	| d
}
nd	| d| }
t|
dw q6  D ]\}}|dkrd |< q|D ]}t fdd|D }|D ]
} | | | |< qqdS )a  Generate a codon adaptiveness table from the coding DNA sequences.

        This calculates the relative adaptiveness of each codon (w_ij) as
        defined by Sharp & Li (Nucleic Acids Research 15(3): 1281-1295 (1987))
        from the provided codon DNA sequences.

        Arguments:
         - sequences: An iterable over DNA sequences, which may be plain
                      strings, Seq objects, MutableSeq objects, or SeqRecord
                      objects.
         - table:     A Bio.Data.CodonTable.CodonTable object defining the
                      genetic code. By default, the standard genetic code is
                      used.
        c                 S   s   i | ]}|g qS r-   r-   )r)   	aminoacidr-   r-   r.   r   Y  s    z1CodonAdaptationIndex.__init__.<locals>.<dictcomp>c                 S   s.   i | ]}d D ]}d D ]	}|| | dq
qqS )ACGTr   r-   )r)   c1c2c3r-   r-   r.   r   _  s   . r   Nr   r=   r?   zillegal codon 'r   z
' in gene r   c                 3   r   r%   r-   )r)   rF   countsr-   r.   r/   |  r   z0CodonAdaptationIndex.__init__.<locals>.<genexpr>)_tableprotein_alphabetforward_tabler   rP   tupler   rR   stop_codonsrz   idr,   r   r   rA   r8   r   r6   max)self	sequencestablecodonsrF   r   synonymous_codonssequencenamerE   messager'   denominatorr-   r   r.   __init__I  sH   


zCodonAdaptationIndex.__init__c              	   C   s   d\}}z|j }W n	 ty   Y nw | }tdt|dD ]3}|||d  }|dv r.qz
|t| | 7 }W n tyM   |dv rEY qtd| dw |d7 }qt|| S )	zCCalculate and return the CAI (float) for the provided DNA sequence.)r   r   r   r=   )ATGTGG)TGATAATAGzillegal codon in sequence: Nr?   )	r,   r   r   rA   r8   r   r   	TypeErrorr   )r  r  	cai_value
cai_lengthrE   rF   r-   r-   r.   	calculate  s(   

zCodonAdaptationIndex.calculater   Tc              
      s   z|j }W n	 ty   Y nw | }i  | jj D ]"\}}| | dkr=| v r9 |  d| d}|r9t|| |< q| jjD ]}| | dkrN| d< qB|dksW|dkr\t|}n|dkrc|}ntd|zd		 fd
d|D }	W t|	S  t
y }
 zt
d|
 dd}
~
ww )a  Return a new DNA sequence with preferred codons only.

        Uses the codon adaptiveness table defined by the CodonAdaptationIndex
        object to generate DNA sequences with only preferred codons.
        May be useful when designing DNA sequences for transgenic protein
        expression or codon-optimized proteins like fluorophores.

        Arguments:
            - sequence: DNA, RNA, or protein sequence to codon-optimize.
                        Supplied as a str, Seq, or SeqRecord object.
            - seq_type: String specifying type of sequence provided.
                        Options are "DNA", "RNA", and "protein". Default is "DNA".
            - strict:   Determines whether an exception should be raised when
                        two codons are equally preferred for a given amino acid.
        Returns:
            Seq object with DNA encoding the same protein as the sequence argument,
            but using only preferred codons as defined by the codon adaptation index.
            If multiple codons are equally preferred, a warning is issued
            and one codon is chosen for use in the optimized sequence.
        r   z and z are equally preferred.r   r   r   r   r   r   c                 3   r   r%   r-   r   pref_codonsr-   r.   r/     r   z0CodonAdaptationIndex.optimize.<locals>.<genexpr>zUnrecognized amino acid: N)r,   r   r   r   r   r   r6   r   r   r   r   r   )r  r  r   strictr,   rF   r   msgaa_seq	optimizedexr-   r  r.   optimize  sB   

zCodonAdaptationIndex.optimizec                 C   s>   g }|   D ]\}}| d|d}|| qd|d S )N	z.3fr   )r   rP   r   )r  linesrF   r   liner-   r-   r.   __str__  s
   zCodonAdaptationIndex.__str__N)r   T)	__name__
__module____qualname____doc__r   r
  r  r  r  r-   r-   r-   r.   r   B  s    7
6r   __main__)run_doctest)r!   )rL   )rV   rL   rW   rL   rL   )Nr   )Nr   )r   FFF)r?   ) r#  r   mathr   r   r   r   r   Bio.Datar   Bio.Data.CodonTabler   r   r	   r
   r   r   r4   r<   rK   rU   r   r   r   r   r   r   r   r   r   
Bio._utilsr%  r-   r-   r-   r.   <module>   s   
	

a
*
=
 
3=

SK 
