o
    RŀgI4                     @   sx   d Z ddlZddlmZ ddlmZ ddlmZ ddlmZ ddlm	Z	 G dd	 d	Z
ed
kr:ddlmZ e  dS dS )a  Simple protein analysis.

Examples
--------
>>> from Bio.SeqUtils.ProtParam import ProteinAnalysis
>>> X = ProteinAnalysis("MAEGEITTFTALTEKFNLPPGNYKKPKLLYCSNGGHFLRILPDGTVDGT"
...                     "RDRSDQHIQLQLSAESVGEVYIKSTETGQYLAMDTSGLLYGSQTPSEEC"
...                     "LFLERLEENHYNTYTSKKHAEKNWFVGLKKNGSCKRGPRTHYGQKAILF"
...                     "LPLPV")
>>> print(X.count_amino_acids()['A'])
6
>>> print(X.count_amino_acids()['E'])
12
>>> print("%0.2f" % X.get_amino_acids_percent()['A'])
0.04
>>> print("%0.2f" % X.get_amino_acids_percent()['L'])
0.12
>>> print("%0.2f" % X.molecular_weight())
17103.16
>>> print("%0.2f" % X.aromaticity())
0.10
>>> print("%0.2f" % X.instability_index())
41.98
>>> print("%0.2f" % X.isoelectric_point())
7.72
>>> sec_struc = X.secondary_structure_fraction()  # [helix, turn, sheet]
>>> print("%0.2f" % sec_struc[0])  # helix
0.33
>>> print("%0.2f" % sec_struc[1])  # turn
0.29
>>> print("%0.2f" % sec_struc[2])  # sheet
0.37
>>> epsilon_prot = X.molar_extinction_coefficient()  # [reduced, oxidized]
>>> print(epsilon_prot[0])  # with reduced cysteines
17420
>>> print(epsilon_prot[1])  # with disulfid bridges
17545

Other public methods are:
 - gravy
 - protein_scale
 - flexibility
 - charge_at_pH

    N)	IUPACData)Seq)IsoelectricPoint)molecular_weight)ProtParamDatac                   @   s   e Zd ZdZd"ddZdd Zdd Zd	d
 Zdd Zdd Z	dd Z
d#ddZdd Zd$ddZdd Zdd Zdd Zdd  Zd!S )%ProteinAnalysisaP  Class containing methods for protein analysis.

    The constructor takes two arguments.
    The first is the protein sequence as a string or a Seq object.

    The second argument is optional. If set to True, the weight of the amino
    acids will be calculated using their monoisotopic mass (the weight of the
    most abundant isotopes for each element), instead of the average molecular
    mass (the averaged weight of all stable isotopes for each element).
    If set to false (the default value) or left out, the IUPAC average
    molecular mass will be used for the calculation.

    Fc                 C   s,   |  | _d| _d| _t| j| _|| _dS )zInitialize the class.N)uppersequenceamino_acids_contentamino_acids_percentlenlengthmonoisotopic)selfprot_sequencer    r   J/var/www/html/myenv/lib/python3.10/site-packages/Bio/SeqUtils/ProtParam.py__init__L   s
   

zProteinAnalysis.__init__c                 C   s@   | j du rdd tjD }|D ]
}| j|||< q|| _ | j S )a(  Count standard amino acids, return a dict.

        Counts the number times each amino acid is in the protein
        sequence. Returns a dictionary {AminoAcid:Number}.

        The return value is cached in self.amino_acids_content.
        It is not recalculated upon subsequent calls.
        Nc                 S   s   i | ]}|d qS )r   r   ).0kr   r   r   
<dictcomp>^   s    z5ProteinAnalysis.count_amino_acids.<locals>.<dictcomp>)r
   r   protein_lettersr	   count)r   prot_dicaar   r   r   count_amino_acidsT   s   
	z!ProteinAnalysis.count_amino_acidsc                    s4    j du r  } fdd| D }| _  j S )a  Calculate the amino acid content in percentages.

        The same as count_amino_acids only returns the Number in percentage of
        entire sequence. Returns a dictionary of {AminoAcid:percentage}.

        The return value is cached in self.amino_acids_percent.

        input is the dictionary self.amino_acids_content.
        output is a dictionary with amino acids as keys.
        Nc                    s   i | ]
\}}|| j  qS r   )r   )r   r   r   r   r   r   r   t   s    z;ProteinAnalysis.get_amino_acids_percent.<locals>.<dictcomp>)r   r   items)r   	aa_countspercentagesr   r   r   get_amino_acids_percentf   s
   
z'ProteinAnalysis.get_amino_acids_percentc                 C   s   t | jd| jdS )z#Calculate MW from Protein sequence.protein)seq_typer   )r   r	   r   r   r   r   r   r   z   s   
z ProteinAnalysis.molecular_weightc                    s&   d}|    t fdd|D }|S )zCalculate the aromaticity according to Lobry, 1994.

        Calculates the aromaticity value of a protein according to Lobry, 1994.
        It is simply the relative frequency of Phe+Trp+Tyr.
        YWFc                 3       | ]} | V  qd S Nr   r   r   aa_percentagesr   r   	<genexpr>       z.ProteinAnalysis.aromaticity.<locals>.<genexpr>r    sum)r   aromatic_aasaromaticityr   r'   r   r.      s   zProteinAnalysis.aromaticityc                 C   sV   t j}d}t| jd D ]}| j||d  \}}|| | }||7 }qd| j | S )ak  Calculate the instability index according to Guruprasad et al 1990.

        Implementation of the method of Guruprasad et al. 1990 to test a
        protein for stability. Any value above 40 means the protein is unstable
        (has a short half life).

        See: Guruprasad K., Reddy B.V.B., Pandit M.W.
        Protein Engineering 4:155-161(1990).
                      g      $@)r   DIWVranger   r	   )r   indexscoreithisnextdipeptide_valuer   r   r   instability_index   s   

z!ProteinAnalysis.instability_indexc                 C   s   t j}d}g d}g }t| j| D ]E}| j|||  }d}t|d D ]}|| }	||| d  }
|||	 ||
  ||  7 }q%||d d  }||| 7 }||d  q|S )zCalculate the flexibility according to Vihinen, 1994.

        No argument to change window size because parameters are specific for
        a window=9. The parameters used are optimized for determining the
        flexibility.
        	   )g      ?g      ?g      ?g      ?r0   r/   r1   r0   g      @)r   Flexr3   r   r	   append)r   flexibilitieswindow_sizeweightsscoresr6   subsequencer5   jfrontbackmiddler   r   r   flexibility   s   zProteinAnalysis.flexibilityKyteDoolitlec                    sH   t j|d  dkrtd| dt fdd| jD }|| j S )aB  Calculate the GRAVY (Grand Average of Hydropathy) according to Kyte and Doolitle, 1982.

        Utilizes the given Hydrophobicity scale, by default uses the original
        proposed by Kyte and Doolittle (KyteDoolitle). Other options are:
        Aboderin, AbrahamLeo, Argos, BlackMould, BullBreese, Casari, Cid,
        Cowan3.4, Cowan7.5, Eisenberg, Engelman, Fasman, Fauchere, GoldSack,
        Guy, Jones, Juretic, Kidera, Miyazawa, Parker,Ponnuswamy, Rose,
        Roseman, Sweet, Tanford, Wilson and Zimmerman.

        New scales can be added in ProtParamData.
        zscale: z
 not knownc                 3   r$   r%   r   r&   selected_scaler   r   r)      r*   z(ProteinAnalysis.gravy.<locals>.<genexpr>)r   gravy_scalesget
ValueErrorr,   r	   r   )r   scaletotal_gravyr   rJ   r   gravy   s
   
zProteinAnalysis.gravyc                 C   sH   dd|  |d  }dg|d  }t |d D ]
}|||  ||< q|S )a1  Make list of relative weight of window edges (PRIVATE).

        The relative weight of window edges are compared to the window
        center. The weights are linear. It actually generates half a list.
        For a window of size 9 and edge 0.4 you get a list of
        [0.4, 0.55, 0.7, 0.85].
        r1         ?r0   r/   )r3   )r   windowedgeunitr@   r6   r   r   r   _weight_list   s
   zProteinAnalysis._weight_listrR   c                 C   s  |  ||}g }t|d d }t| j| d D ]s}| j|||  }d}	t|d D ]=}
z |||
  }||||
 d   }|	||
 | ||
 |  7 }	W q, tyi   tjd||
 |||
 d  f  Y q,w ||d  }||v r{|	|| 7 }	n
tjd| d |	|	|  q|S )a  Compute a profile by any amino acid scale.

        An amino acid scale is defined by a numerical value assigned to each
        type of amino acid. The most frequently used scales are the
        hydrophobicity or hydrophilicity scales and the secondary structure
        conformational parameters scales, but many other scales exist which
        are based on different chemical and physical properties of the
        amino acids.  You can set several parameters that control the
        computation of a scale profile, such as the window size and the window
        edge relative weight value.

        WindowSize: The window size is the length of the interval to use for
        the profile computation. For a window size n, we use the i-(n-1)/2
        neighboring residues on each side to compute the score for residue i.
        The score for residue i is the sum of the scaled values for these
        amino acids, optionally weighted according to their position in the
        window.

        Edge: The central amino acid of the window always has a weight of 1.
        By default, the amino acids at the remaining window positions have the
        same weight, but you can make the residue at the center of the window
        have a larger weight than the others by setting the edge value for the
        residues at the beginning and end of the interval to a value between
        0 and 1. For instance, for Edge=0.4 and a window size of 5 the weights
        will be: 0.4, 0.7, 1.0, 0.7, 0.4.

        The method returns a list of values which can be plotted to view the
        change along a protein sequence.  Many scales exist. Just add your
        favorites to the ProtParamData modules.

        Similar to expasy's ProtScale:
        http://www.expasy.org/cgi-bin/protscale.pl
        r1   r0   r/   z0warning: %s or %s is not a standard amino acid.
z	warning: z is not a standard amino acid.
)
rV   r,   r3   r   r	   KeyErrorsysstderrwriter=   )r   
param_dictrS   rT   r@   rA   sum_of_weightsr6   rB   r5   rC   rD   rE   rF   r   r   r   protein_scale   s0   & zProteinAnalysis.protein_scalec                 C   s   |   }t| j|}| S )zuCalculate the isoelectric point.

        Uses the module IsoelectricPoint to calculate the pI of a protein.
        )r   r   r	   pi)r   
aa_contentie_pointr   r   r   isoelectric_point,  s   z!ProteinAnalysis.isoelectric_pointc                 C   s    |   }t| j|}||S )z.Calculate the charge of a protein at given pH.)r   r   r	   charge_at_pH)r   pHr_   charger   r   r   rb   6  s   
zProteinAnalysis.charge_at_pHc                    sT   |    t fdddD }t fdddD }t fdddD }|||fS )ak  Calculate fraction of helix, turn and sheet.

        Returns a list of the fraction of amino acids which tend
        to be in Helix, Turn or Sheet, according to Haimov and Srebnik, 2016;
        Hutchinson and Thornton, 1994; and Kim and Berg, 1993, respectively.

        Amino acids in helix: E, M, A, L, K.
        Amino acids in turn: N, P, G, S, D.
        Amino acids in sheet: V, I, Y, F, W, L, T.

        Note that, prior to v1.82, this method wrongly returned
        (Sheet, Turn, Helix) while claiming to return (Helix, Turn, Sheet).

        Returns a tuple of three floats (Helix, Turn, Sheet).
        c                 3   r$   r%   r   r   rr'   r   r   r)   N  r*   z?ProteinAnalysis.secondary_structure_fraction.<locals>.<genexpr>EMALKc                 3   r$   r%   r   re   r'   r   r   r)   O  r*   NPGSDc                 3   r$   r%   r   re   r'   r   r   r)   P  r*   VIYFWLTr+   )r   helixturnsheetr   r'   r   secondary_structure_fraction<  s
   
z,ProteinAnalysis.secondary_structure_fractionc                 C   s<   |   }|d d |d d  }||d d d  }||fS )zCalculate the molar extinction coefficient.

        Calculates the molar extinction coefficient assuming cysteines
        (reduced) and cystines residues (Cys-Cys-bond)
        Wi|  Yi  Cr1   }   )r   )r   num_aamec_reducedmec_cystinesr   r   r   molar_extinction_coefficientT  s   z,ProteinAnalysis.molar_extinction_coefficientN)F)rH   )rR   )__name__
__module____qualname____doc__r   r   r    r   r.   r:   rG   rQ   rV   r]   ra   rb   rm   ru   r   r   r   r   r   =   s     


J
r   __main__)run_doctest)ry   rX   Bio.Datar   Bio.Seqr   Bio.SeqUtilsr   r   r   r   rv   
Bio._utilsr{   r   r   r   r   <module>   s   .  %
