o
    Rŀg                      @   s4  d Z ddlZddlmZ ddlmZ ddlmZ ddl	m
Z
 d+d	d
ZG dd dZG dd dZG dd deZG dd dZG dd dZG dd dZG dd dZG dd dZG dd dZG dd dZG dd  d eeeZG d!d" d"eeeZG d#d$ d$eZG d%d& d&eZd,d'd(Zd,d)d*ZdS )-a  Code to work with the BLAST XML output.

The BLAST XML DTD file is available on the NCBI site at:
https://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd

Record classes to hold BLAST output are:

Classes:
Blast              Holds all the information from a blast search.
PSIBlast           Holds all the information from a psi-blast search.

Header             Holds information from the header.
Description        Holds information about one hit description.
Alignment          Holds information about one alignment hit.
HSP                Holds information about one HSP.
MultipleAlignment  Holds information about a multiple alignment.
DatabaseReport     Holds information from the database report.
Parameters         Holds information from the parameters.

    N)ContentHandler)MultipleSeqAlignment)Seq)	SeqRecord%s	<unknown>c                 C   s   | du r|S ||  S )z5Ensure the given value formats to a string correctly.N )valueformat_specdefault_strr   r   E/var/www/html/myenv/lib/python3.10/site-packages/Bio/Blast/NCBIXML.pyfmt_(   s   r   c                   @      e Zd ZdZdd ZdS )HeaderaI  Saves information from a blast header.

    Members:
    application         The name of the BLAST flavor that generated this data.
    version             Version of blast used.
    date                Date this data was generated.
    reference           Reference for blast.

    query               Name of query sequence.
    query_letters       Number of letters in the query sequence.  (int)

    database            Name of the database.
    database_sequences  Number of sequences in the database.  (int)
    database_letters    Number of letters in the database.  (int)

    c                 C   s:   d| _ d| _d| _d| _d| _d| _d| _d| _d| _dS Initialize the class. N)	applicationversiondate	referencequeryquery_lettersdatabasedatabase_sequencesdatabase_lettersselfr   r   r   __init__A   s   
zHeader.__init__N__name__
__module____qualname____doc__r   r   r   r   r   r   /   s    r   c                   @       e Zd ZdZdd Zdd ZdS )Descriptiona4  Stores information about one hit in the descriptions section.

    Members:
    title           Title of the hit.
    score           Number of bits.  (int)
    bits            Bit score. (float)
    e               E value.  (float)
    num_alignments  Number of alignments for the same subject.  (int)
    c                 C   s"   d| _ d| _d| _d| _d| _dS r   )titlescorebitsenum_alignmentsr   r   r   r   r   [   
   
zDescription.__init__c                 C   s   | j dd| jdd| j S )z#Return the description as a string.z<66 z>5z  )r&   r'   r)   r   r   r   r   __str__c   s   zDescription.__str__Nr    r!   r"   r#   r   r-   r   r   r   r   r%   P   s    
r%   c                       s(   e Zd ZdZ fddZdd Z  ZS )DescriptionExtzuExtended description record for BLASTXML version 2.

    Members:
    items           List of DescriptionExtItem
    c                    s   t    g | _dS r   N)superr   itemsr   	__class__r   r   r   o   s   

zDescriptionExt.__init__c                 C   s(   t | jdkrt|| _| j| dS )z"Add a description extended record.r   N)lenr2   strr&   append)r   itemr   r   r   append_itemu   s   
zDescriptionExt.append_item)r    r!   r"   r#   r   r9   __classcell__r   r   r3   r   r/   h   s    r/   c                   @   r$   )DescriptionExtItemzStores information about one record in hit description for BLASTXML version 2.

    Members:
    id              Database identifier
    title           Title of the hit.
    c                 C   s"   d| _ d| _d| _d| _d| _dS r0   )idr&   	accessiontaxidscinamer   r   r   r   r      r+   zDescriptionExtItem.__init__c                 C   s   | j  d| j S )z8Return the description identifier and title as a string.r,   )r<   r&   r   r   r   r   r-         zDescriptionExtItem.__str__Nr.   r   r   r   r   r;   |   s    r;   c                   @   r$   )	AlignmentzStores information about one hit in the alignments section.

    Members:
    title      Name.
    hit_id     Hit identifier. (str)
    hit_def    Hit definition. (str)
    length     Length.  (int)
    hsps       A list of HSP objects.

    c                 C   s"   d| _ d| _d| _d| _g | _dS r   )r&   hit_idhit_deflengthhspsr   r   r   r   r      r+   zAlignment.__init__c                 C   s*   | j d}|d| j d d|S )z1Return the BLAST alignment as a formatted string.
z	Length = z
           )r&   splitr7   rD   joinr   linesr   r   r   r-      s   
zAlignment.__str__Nr.   r   r   r   r   rA      s    rA   c                   @   r$   )HSPa	  Stores information about one hsp in an alignment hit.

    Members:
        - score           BLAST score of hit.  (float)
        - bits            Number of bits for that score.  (float)
        - expect          Expect value.  (float)
        - num_alignments  Number of alignments for same subject.  (int)
        - identities      Number of identities (int) if using the XML parser.
          Tuple of number of identities/total aligned (int, int)
          if using the (obsolete) plain text parser.
        - positives       Number of positives (int) if using the XML parser.
          Tuple of number of positives/total aligned (int, int)
          if using the (obsolete) plain text parser.
        - gaps            Number of gaps (int) if using the XML parser.
          Tuple of number of gaps/total aligned (int, int) if
          using the (obsolete) plain text parser.
        - align_length    Length of the alignment. (int)
        - strand          Tuple of (query, target) strand.
        - frame           Tuple of 1 or 2 frame shifts, depending on the flavor.

        - query           The query sequence.
        - query_start     The start residue for the query sequence.  (1-based)
        - query_end       The end residue for the query sequence.  (1-based)
        - match           The match sequence.
        - sbjct           The sbjct sequence.
        - sbjct_start     The start residue for the sbjct sequence.  (1-based)
        - sbjct_end       The end residue for the sbjct sequence.  (1-based)

    Not all flavors of BLAST return values for every attribute::

                  score     expect     identities   positives    strand  frame
        BLASTP     X          X            X            X
        BLASTN     X          X            X            X          X
        BLASTX     X          X            X            X                  X
        TBLASTN    X          X            X            X                  X
        TBLASTX    X          X            X            X                 X/X

    Note: for BLASTX, the query sequence is shown as a protein sequence,
    but the numbering is based on the nucleotides.  Thus, the numbering
    is 3x larger than the number of amino acid residues.  A similar effect
    can be seen for the sbjct sequence in TBLASTN, and for both sequences
    in TBLASTX.

    Also, for negative frames, the sequence numbering starts from
    query_start and counts down.

    c                 C   sj   d| _ d| _d| _d| _d| _d| _d| _d| _d| _d| _	d| _
d| _d| _d| _d| _d| _d| _dS )r   NNNr   r   )r'   r(   expectr*   
identities	positivesgapsalign_lengthstrandframer   query_start	query_endmatchsbjctsbjct_start	sbjct_endr   r   r   r   r      s"   
zHSP.__init__c                 C   s(  dt | jdt | jdt | jdt | jdf g}| jdu r#d|S | jdk rL|d| j| j| j	f  |d| j
  |d	| j| j| jf  nC|d
| j| jdd | jdd | j	f  |d| j
dd  d| j
dd   |d| j| jdd | jdd | jf  d|S )z+Return the BLAST HSP as a formatted string.z7Score %s (%s bits), expectation %s, alignment length %sz%iz%0.1eNrF   2   zQuery:%8s %s %sz               zSbjct:%8s %s %szQuery:%8s %s...%s %s-   z...zSbjct:%8s %s...%s %s)r   r'   r(   rM   rQ   rH   r7   rT   r   rU   rV   rX   rW   rY   rI   r   r   r   r-      s<   




	

"*"
zHSP.__str__Nr.   r   r   r   r   rK      s    0rK   c                   @   r$   )MultipleAlignmenta  Holds information about a multiple alignment.

    Members:
    alignment  A list of tuples (name, start residue, sequence, end residue).

    The start residue is 1-based.  It may be blank, if that sequence is
    not aligned in the multiple alignment.

    c                 C   s
   g | _ dS r0   )	alignmentr   r   r   r   r     s   
zMultipleAlignment.__init__c           
      C   s   g }g }d}d}| j D ]+\}}}}|dkr|d7 }d}|dkr*|| || q||  |7  < |d7 }qdd t||D }	t|	S )a  Retrieve generic alignment object for the given alignment.

        Instead of the tuples, this returns a MultipleSeqAlignment object
        from Bio.Align, through which you can manipulate and query
        the object.

        Thanks to James Casbon for the code.
        r   QUERY   c                 s   s"    | ]\}}t t||V  qd S N)r   r   ).0nameseqr   r   r   	<genexpr><  s    
z/MultipleAlignment.to_generic.<locals>.<genexpr>)r^   r7   zipr   )
r   	seq_parts	seq_namesparse_numbernrc   startrd   endrecordsr   r   r   
to_generic#  s"   	

zMultipleAlignment.to_genericN)r    r!   r"   r#   r   rn   r   r   r   r   r]     s    
r]   c                   @   r   )Roundak  Holds information from a PSI-BLAST round.

    Members:
    number       Round number.  (int)
    reused_seqs  Sequences in model, found again.  List of Description objects.
    new_seqs     Sequences not found, or below threshold.  List of Description.
    alignments          A list of Alignment objects.
    multiple_alignment  A MultipleAlignment object.
    c                 C   s"   d| _ g | _g | _g | _d| _dS r0   )numberreused_seqsnew_seqs
alignmentsmultiple_alignmentr   r   r   r   r   M  r+   zRound.__init__Nr   r   r   r   r   ro   B      
ro   c                   @   r   )DatabaseReporta@  Holds information about a database report.

    Members:
    database_name              List of database names.  (can have multiple dbs)
    num_letters_in_database    Number of letters in the database.  (int)
    num_sequences_in_database  List of number of sequences in the database.
    posted_date                List of the dates the databases were posted.
    ka_params                  A tuple of (lambda, k, h) values.  (floats)
    gapped                     # XXX this isn't set right!
    ka_params_gap              A tuple of (lambda, k, h) values.  (floats)

    c                 C   s.   g | _ g | _g | _g | _d| _d| _d| _dS )r   )NNNr   N)database_nameposted_datenum_letters_in_databasenum_sequences_in_database	ka_paramsgappedka_params_gapr   r   r   r   r   d  s   
zDatabaseReport.__init__Nr   r   r   r   r   rv   V  s    rv   c                   @   r   )
Parametersa  Holds information about the parameters.

    Members:
    matrix              Name of the matrix.
    gap_penalties       Tuple of (open, extend) penalties.  (floats)
    sc_match            Match score for nucleotide-nucleotide comparison
    sc_mismatch         Mismatch penalty for nucleotide-nucleotide comparison
    num_hits            Number of hits to the database.  (int)
    num_sequences       Number of sequences.  (int)
    num_good_extends    Number of extensions.  (int)
    num_seqs_better_e   Number of sequences better than e-value.  (int)
    hsps_no_gap         Number of HSP's better, without gapping.  (int)
    hsps_prelim_gapped  Number of HSP's gapped in prelim test.  (int)
    hsps_prelim_gapped_attemped  Number of HSP's attempted in prelim.  (int)
    hsps_gapped         Total number of HSP's gapped.  (int)
    query_length        Length of the query.  (int)
    query_id            Identifier of the query sequence. (str)
    database_length     Number of letters in the database.  (int)
    effective_hsp_length         Effective HSP length.  (int)
    effective_query_length       Effective length of query.  (int)
    effective_database_length    Effective length of database.  (int)
    effective_search_space       Effective search space.  (int)
    effective_search_space_used  Effective search space used.  (int)
    frameshift          Frameshift window.  Tuple of (int, float)
    threshold           Threshold.  (int)
    window_size         Window size.  (int)
    dropoff_1st_pass    Tuple of (score, bits).  (int, float)
    gap_x_dropoff       Tuple of (score, bits).  (int, float)
    gap_x_dropoff_final Tuple of (score, bits).  (int, float)
    gap_trigger         Tuple of (score, bits).  (int, float)
    blast_cutoff        Tuple of (score, bits).  (int, float)
    c                 C   s   d| _ d| _d| _d| _d| _d| _d| _d| _d| _d| _	d| _
d| _d| _d| _d| _d| _d| _d| _d| _d| _d| _d| _d| _d| _d| _d| _d| _d| _dS )r   r   rL   N)matrixgap_penaltiessc_matchsc_mismatchnum_hitsnum_sequencesnum_good_extendsnum_seqs_better_ehsps_no_gaphsps_prelim_gappedhsps_prelim_gapped_attempedhsps_gappedquery_idquery_lengthdatabase_lengtheffective_hsp_lengtheffective_query_lengtheffective_database_lengtheffective_search_spaceeffective_search_space_used
frameshift	thresholdwindow_sizedropoff_1st_passgap_x_dropoffgap_x_dropoff_finalgap_triggerblast_cutoffr   r   r   r   r     s8   
zParameters.__init__Nr   r   r   r   r   r~   o  s    !r~   c                   @   r   )Blasta  Saves the results from a blast search.

    Members:
    descriptions        A list of Description objects.
    alignments          A list of Alignment objects.
    multiple_alignment  A MultipleAlignment object.
    + members inherited from base classes

    c                 C   s4   t |  t|  t|  g | _g | _d| _dS r0   )r   r   rv   r~   descriptionsrs   rt   r   r   r   r   r     s   



zBlast.__init__Nr   r   r   r   r   r     ru   r   c                   @   r   )PSIBlastzSaves the results from a blastpgp search.

    Members:
    rounds       A list of Round objects.
    converged    Whether the search converged.
    + members inherited from base classes

    c                 C   s.   t |  t|  t|  g | _d| _dS )r   r   N)r   r   rv   r~   rounds	convergedr   r   r   r   r     s
   



zPSIBlast.__init__Nr   r   r   r   r   r     s    	r   c                   @   s:   e Zd ZdZdddZdd Zdd Zd	d
 Zdd ZdS )
_XMLparserzGeneric SAX Parser (PRIVATE).

    Just a very basic SAX parser.

    Redefine the methods startElement, characters and endElement.
    r   c                 C   s(   g | _ d| _|| _g | _d| _d| _dS )uInitialize the parser.

        Arguments:
         - debug - integer, amount of debug information to print

        r   r`   N)_tag_value_debug_debug_ignore_list_method_name_level_method_mapr   debugr   r   r   r     s   
z_XMLparser.__init__c                 C   s   | j | t| j dkr| | dS d| | }|| jv r2| j|   | jdkr1td|  n| jdkrH|| jvrHtd|  | j| | j	
 rYtd| j	 d	|d
d| _	dS )zFound XML start tag.

        No real need of attr, BLAST DTD doesn't use them

        Arguments:
         - name -- name of the tag
         - attr -- tag attributes

        r`   Nstart_   NCBIXML: Parsed:     NCBIXML: Ignored: zWhat should we do with z before the z tag?r   )r   r7   r5   _on_root_node_node_method_namer   r   printr   r   strip
ValueError)r   rc   attrmethodr   r   r   startElement  s&   







z_XMLparser.startElementc                 C   s   |  j |7  _ dS )zOFound some text.

        Arguments:
         - ch -- characters read

        N)r   )r   chr   r   r   
characters  s   z_XMLparser.charactersc                 C   s   d|  | }|| jv r#| j|   | jdkr"td| d| j  n| jdkr>|| jvr>td| d| j  | j| d| _| j  dS )	zLFound XML end tag.

        Arguments:
         - name -- tag name

        end_   r   r,   r`   r   r   N)	r   r   r   r   r   r   r7   r   pop)r   rc   r   r   r   r   
endElement!  s   




z_XMLparser.endElementc                 C   s&   | j dkr|S d| j| j  d  S )Nr`   /)r   rH   r   r   rc   r   r   r   r   >  s   
z_XMLparser._node_method_nameNr   )	r    r!   r"   r#   r   r   r   r   r   r   r   r   r   r     s    
(	r   c                   @   s  e Zd ZdZdddZdd Zdd Zd	d
 Zdd Zdd Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd Zdd Zdd  Zd!d" Zd#d$ Zd%d& Zd'd( Zd)d* Zd+d, Zd-d. Zd/d0 Zd1d2 Zd3d4 Zd5d6 Zd7d8 Zd9d: Zd;d< Z d=d> Z!d?d@ Z"dAdB Z#dCdD Z$dEdF Z%dGdH Z&dIdJ Z'dKdL Z(dMdN Z)dOdP Z*dQdR Z+dSdT Z,dUdV Z-dWdX Z.dYdZ Z/d[d\ Z0d]d^ Z1d_d` Z2dadb Z3dcdd Z4dedf Z5dgdh Z6didj Z7dkdl Z8dmdn Z9dodp Z:dqdr Z;dsdt Z<dudv Z=dwdx Z>dydz Z?d{d| Z@d}d~ ZAdd ZBdd ZCdd ZDdS )BlastParsera  Parse XML BLAST data into a Blast object.

    Parses XML output from BLAST (direct use discouraged).
    This (now) returns a list of Blast records.
    Historically it returned a single Blast record.
    You are expected to use this via the parse or read functions.

    All XML 'action' methods are private methods and may be:

    - ``_start_TAG`` called when the start tag is found
    - ``_end_TAG`` called when the end tag is found

    r   c                 C   s   t | | tj | _| j|  | jtjjj	d | jtjjj
d | jtjjjd | jtjjjd d| _|   dS )r   r   r`   N)r   r   xmlsaxmake_parser_parsersetContentHandler
setFeaturehandlerfeature_validationfeature_namespacesfeature_external_pesfeature_external_ges_xml_versionresetr   r   r   r   r   S  s   zBlastParser.__init__c                 C   s"   g | _ t | _t | _d| j_dS )z>Reset all the data allowing reuse of the BlastParser() object.N)_recordsr   _headerr~   _parametersfilterr   r   r   r   r   j  s   zBlastParser.resetc                 C   s4   |dkr
|    d S |dkr|   d S td| )NBlastOutput	BlastXML2zOInvalid root node name: %s. Root node should be either BlastOutput or BlastXML2)_setup_blast_v1_setup_blast_v2r   r   r   r   r   r   q  s   zBlastParser._on_root_nodec                 C   s  i d| j d| jd| jd| jd| jd| jd| jd| jd	| jd
| j	d| j
d| jd| jd| jd| jd| jd| ji d| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd | jd!| j d"| j!i d#| j"d$| j#d%| j$d&| j%d'| j&d(| j'd)| j(d*| j)d+| j*d,| j+d-| j,d.| j-d/| j.d0| j/d1| j0d2| j1| _2d S )3Nstart_Iterationend_Iterationend_BlastOutput_programend_BlastOutput_versionend_BlastOutput_referenceend_BlastOutput_dbzend_BlastOutput_query-IDzend_BlastOutput_query-defzend_BlastOutput_query-lenzend_Iteration_query-IDzend_Iteration_query-defzend_Iteration_query-lenend_BlastOutput_hitsend_Parameters_matrixend_Parameters_expectzend_Parameters_sc-matchzend_Parameters_sc-mismatchzend_Parameters_gap-openzend_Parameters_gap-extendend_Parameters_filter	start_Hitend_Hit
end_Hit_idend_Hit_defend_Hit_accessionend_Hit_len	start_Hspend_Hsp_scorezend_Hsp_bit-scoreend_Hsp_evaluezend_Hsp_query-fromzend_Hsp_query-tozend_Hsp_hit-fromzend_Hsp_hit-tozend_Hsp_query-framezend_Hsp_hit-frameend_Hsp_identityend_Hsp_positiveend_Hsp_gapszend_Hsp_align-lenend_Hsp_qseqend_Hsp_hseqend_Hsp_midlinezend_Statistics_db-numzend_Statistics_db-lenzend_Statistics_hsp-lenzend_Statistics_eff-spaceend_Statistics_kappaend_Statistics_lambdaend_Statistics_entropy)3_start_blast_record_end_blast_record_set_header_application_set_header_version_set_header_reference_set_header_database_set_header_query_id_set_header_query_set_header_query_letters_set_record_query_id_set_record_query_def_set_record_query_letters_set_record_hits_set_parameters_matrix_set_parameters_expect_set_parameters_sc_match_set_parameters_sc_mismatch_set_parameters_gap_penalties_set_parameters_gap_extend_set_parameters_filter
_start_hit_end_hit
set_hit_idset_hit_defset_hit_accessionset_hit_len
_start_hsp_set_hsp_score_set_hsp_bit_score_set_hsp_e_value_set_hsp_query_start_set_hsp_query_end_set_hsp_hit_from_set_hsp_hit_to_set_hsp_query_frame_set_hsp_hit_frame_set_hsp_identity_set_hsp_positive_set_hsp_gaps_set_hsp_align_len_set_hsp_query_seq_set_hsp_subject_seq_set_hsp_midline_set_statistics_db_num_set_statistics_db_len_set_statistics_hsp_len_set_statistics_eff_space_set_statistics_kappa_set_statistics_lambda_set_statistics_entropyr   r   r   r   r   r   |  s   	
 !"#$%&'()*+,-./012zBlastParser._setup_blast_v1c                 C   s  d| _ d| _i d| jd| jd| jd| jd| jd| jd| jd	| j	d
| j
d| jd| jd| jd| jd| jd| jd| jd| ji d| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd| jd | j d!| j!d"| j"d#| j#i d$| j$d%| j%d&| j&d'| j'd(| j(d)| j)d*| j*d+| j+d,| j,d-| j-d.| j.d/| j/d0| j0d1| j1d2| j2d3| j3d4| j4| j5| j6| j7d5| _8d S )6Nr   zstart_report/Reportzend_report/Reportzend_Report/programzend_Report/versionzend_Report/referencezend_Target/dbzend_Search/query-idzend_Search/query-titlezend_Search/query-lenr   zend_Parameters/matrixzend_Parameters/expectzend_Parameters/sc-matchzend_Parameters/sc-mismatchzend_Parameters/gap-openzend_Parameters/gap-extendzend_Parameters/filterzstart_hits/Hitzend_hits/Hitzstart_description/HitDescrzend_description/HitDescrzend_HitDescr/idzend_HitDescr/accessionzend_HitDescr/titlezend_HitDescr/taxidzend_HitDescr/scinamezend_Hit/lenzstart_hsps/Hspzend_hsps/Hspzend_Hsp/scorezend_Hsp/bit-scorezend_Hsp/evaluezend_Hsp/query-fromzend_Hsp/query-tozend_Hsp/hit-fromzend_Hsp/hit-tozend_Hsp/query-framezend_Hsp/hit-framezend_Hsp/query-strandzend_Hsp/hit-strandzend_Hsp/identityzend_Hsp/positivezend_Hsp/gapszend_Hsp/align-lenzend_Hsp/qseqzend_Hsp/hseqzend_Hsp/midlinezend_Statistics/db-numzend_Statistics/db-lenzend_Statistics/hsp-lenzend_Statistics/eff-space)zend_Statistics/kappazend_Statistics/lambdazend_Statistics/entropy)9r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r  r  _start_hit_descr_item_end_hit_descr_item_end_description_id_end_description_accession_end_description_title_end_description_taxid_end_description_scinamer  r  _end_hspr  r  r  r  r  r  r  r  r  _set_hsp_query_strand_set_hsp_hit_strandr  r  r  r  r  r  r  r  r  r  r   r!  r"  r#  r   r   r   r   r   r     s   	
 !"#$%&'()*+,-./01234zBlastParser._setup_blast_v2c                 C      t  | _dS )zStart interaction (PRIVATE).N)r   _blastr   r   r   r   r        zBlastParser._start_blast_recordc                 C   s>  | j j| j_| j j| j_| j j| j_| j j| j_| j j| j_t| jdr(| jjs.| j j| j_t| jdr8| jj	s>| j j	| j_	t| jdrH| jj
sN| j j
| j_
| jj
| j_| jj| j_| jj| j_| jj| j_| jj| j_| jj| j_| jj| j_| jj| j_| jj| j_| jj| j_| j| j d| _| jrtd dS dS )zEnd interaction (PRIVATE).r   r   r   Nz&NCBIXML: Added Blast record to results)r   r   r/  r   r   r   r   hasattrr   r   r   r   ry   r   rz   r   r   r   r   r   r   rM   r   r   r   r7   r   r   r   r   r   r   r     s4   zBlastParser._end_blast_recordc                 C   s   | j  | j_dS )zsBLAST program, e.g., blastp, blastn, etc. (PRIVATE).

        Save this to put on each blast record object
        N)r   upperr   r   r   r   r   r   r   *  s   z#BlastParser._set_header_applicationc                 C   sn   | j  }|d | j_t|dkr5|d d dkr-|d d dkr-|d dd | j_dS |d | j_dS dS )	zVersion number and date of the BLAST engine (PRIVATE).

        e.g. "BLASTX 2.2.12 [Aug-07-2005]" but there can also be
        variants like "BLASTP 2.2.18+" without the date.

        Save this to put on each blast record object
        r`   r   r   r   []N)r   rG   r   r   r5   r   )r   partsr   r   r   r   1  s   
 zBlastParser._set_header_versionc                 C      | j | j_dS )zRecord any article reference describing the algorithm (PRIVATE).

        Save this to put on each blast record object
        N)r   r   r   r   r   r   r   r   H     z!BlastParser._set_header_referencec                 C   r7  )ziRecord the database(s) searched (PRIVATE).

        Save this to put on each blast record object
        N)r   r   r   r   r   r   r   r   O  r8  z BlastParser._set_header_databasec                 C   r7  )zRecord the identifier of the query (PRIVATE).

        Important in old pre 2.2.14 BLAST, for recent versions
        <Iteration_query-ID> is enough
        N)r   r   r   r   r   r   r   r   V     z BlastParser._set_header_query_idc                 C   r7  )zRecord the definition line of the query (PRIVATE).

        Important in old pre 2.2.14 BLAST, for recent versions
        <Iteration_query-def> is enough
        N)r   r   r   r   r   r   r   r   ^  r9  zBlastParser._set_header_queryc                 C      t | j| j_dS )zRecord the length of the query (PRIVATE).

        Important in old pre 2.2.14 BLAST, for recent versions
        <Iteration_query-len> is enough
        N)intr   r   r   r   r   r   r   r   f  s   z%BlastParser._set_header_query_lettersc                 C   r7  )z-Record the identifier of the query (PRIVATE).N)r   r/  r   r   r   r   r   r   n     z BlastParser._set_record_query_idc                 C   r7  )z2Record the definition line of the query (PRIVATE).N)r   r/  r   r   r   r   r   r   r  r<  z!BlastParser._set_record_query_defc                 C   r:  )z)Record the length of the query (PRIVATE).N)r;  r   r/  r   r   r   r   r   r   v  r@   z%BlastParser._set_record_query_lettersc                 C   r:  )zAHits to the database sequences, one for every sequence (PRIVATE).N)r;  r   r/  r   r   r   r   r   r     r@   zBlastParser._set_record_hitsc                 C   r7  )z+Matrix used (-M on legacy BLAST) (PRIVATE).N)r   r   r   r   r   r   r   r     r<  z"BlastParser._set_parameters_matrixc                 C   r7  )zExpect values cutoff (PRIVATE).N)r   r   rM   r   r   r   r   r     s   	z"BlastParser._set_parameters_expectc                 C   r:  )z@Match score for nucleotide-nucleotide comparison (-r) (PRIVATE).N)r;  r   r   r   r   r   r   r   r    r@   z$BlastParser._set_parameters_sc_matchc                 C   r:  )zEMismatch penalty for nucleotide-nucleotide comparison (-r) (PRIVATE).N)r;  r   r   r   r   r   r   r   r    r@   z'BlastParser._set_parameters_sc_mismatchc                 C   r:  )z"Gap existence cost (-G) (PRIVATE).N)r;  r   r   r   r   r   r   r   r    r@   z)BlastParser._set_parameters_gap_penaltiesc                 C   s   | j jt| jf| j _dS )z"Gap extension cose (-E) (PRIVATE).N)r   r   r;  r   r   r   r   r   r    s   z&BlastParser._set_parameters_gap_extendc                 C   r7  )z(Record filtering options (-F) (PRIVATE).N)r   r   r   r   r   r   r   r    r<  z"BlastParser._set_parameters_filterc                 C   s   | j jt  | jdkrt nt | _| j j| j g | j _	| j jd | _
d| j_| j dkr?td| j d| _dS dS )z Start filling records (PRIVATE).r`   r4  r   CREATE_VIEWr   r   N)r/  rs   r7   rA   r   r%   r/   _descrr   rt   _hitr*   r   r   r   r   r   r   r   r    s   
zBlastParser._start_hitc                 C   s   d| j _d| _d| _dS )zClear variables (PRIVATE).N)r/  rt   r?  r>  r   r   r   r   r    s   
zBlastParser._end_hitc                 C   s   | j | j_| j d | j_dS )z9Record the identifier of the database sequence (PRIVATE).r,   N)r   r?  rB   r&   r   r   r   r   r    s   
zBlastParser.set_hit_idc                 C   s,   | j | j_| j j| j 7  _| jj| j_dS )z>Record the definition line of the database sequence (PRIVATE).N)r   r?  rC   r&   r>  r   r   r   r   r	    s   
zBlastParser.set_hit_defc                 C   s   | j | j_| j | j_dS )z>Record the accession value of the database sequence (PRIVATE).N)r   r?  r=   r>  r   r   r   r   r
    s   
zBlastParser.set_hit_accessionc                 C   r:  )zRecord the length of the hit.N)r;  r   r?  rD   r   r   r   r   r    r@   zBlastParser.set_hit_lenc                 C   sR   t  | _d | j_| jj| j | j jd7  _| jj	t
  | jj	d | _d S )Nr`   r4  )rK   _hsprO   r?  rE   r7   r>  r*   r/  rt   r]   _mult_alr   r   r   r   r    s   zBlastParser._start_hspc                 C   s4   | j jrt| j jdkr| j  jd7  _d S d S d S )Nr`   r   )r@  rS   r5   r   r   r   r   r+    s   zBlastParser._end_hspc                 C   0   t | j| j_| jjdu rt | j| j_dS dS )z&Record the raw score of HSP (PRIVATE).N)floatr   r@  r'   r>  r   r   r   r   r       zBlastParser._set_hsp_scorec                 C   rB  )z&Record the Bit score of HSP (PRIVATE).N)rC  r   r@  r(   r>  r   r   r   r   r    rD  zBlastParser._set_hsp_bit_scorec                 C   s0   t | j| j_| jjdu rt | j| j_dS dS )z-Record the expect value of the HSP (PRIVATE).N)rC  r   r@  rM   r>  r)   r   r   r   r   r    rD  zBlastParser._set_hsp_e_valuec                 C   r:  )zEOffset of query at the start of the alignment (one-offset) (PRIVATE).N)r;  r   r@  rT   r   r   r   r   r    r@   z BlastParser._set_hsp_query_startc                 C   r:  )zCOffset of query at the end of the alignment (one-offset) (PRIVATE).N)r;  r   r@  rU   r   r   r   r   r    r@   zBlastParser._set_hsp_query_endc                 C   r:  )zLOffset of the database at the start of the alignment (one-offset) (PRIVATE).N)r;  r   r@  rX   r   r   r   r   r    r@   zBlastParser._set_hsp_hit_fromc                 C   r:  )zJOffset of the database at the end of the alignment (one-offset) (PRIVATE).N)r;  r   r@  rY   r   r   r   r   r    r@   zBlastParser._set_hsp_hit_toc                 C   s>   t | j}|f| j_| jjdkr|dkrdndf| j_dS dS )+Frame of the query if applicable (PRIVATE).BLASTNr   PlusMinusN)r;  r   r@  rS   r   r   rR   r   vr   r   r   r  %  s
   

z BlastParser._set_hsp_query_framec                 C   sl   t | j}t| jjdkrd|f| j_n	| j j|f7  _| jjdkr4| j j|dkr-dndf7  _dS dS )7Frame of the database sequence if applicable (PRIVATE).r   rF  rG  rH  N)r;  r   r5   r@  rS   r   r   rR   rI  r   r   r   r  ,  s   
"zBlastParser._set_hsp_hit_framec                 C   s8   | j f| j_| jjdkr| j dkrdndf| j_dS dS )rE  rF  rG  r`   r4  N)r   r@  rR   r   r   rS   r   r   r   r   r,  6  s   z!BlastParser._set_hsp_query_strandc                 C   sH   | j  j| jf7  _| jjdkr"| j  j| jdkrdndf7  _dS dS )rK  rF  rG  r`   r4  N)r@  rR   r   r   r   rS   r   r   r   r   r-  <  s   $zBlastParser._set_hsp_hit_strandc                 C   s.   t | j}|| j_| jjdu r|| j_dS dS )z;Record the number of identities in the alignment (PRIVATE).N)r;  r   r@  rN   rO   rI  r   r   r   r  B  s
   
zBlastParser._set_hsp_identityc                 C   r:  )zVRecord the number of positive (conservative) substitutions in the alignment (PRIVATE).N)r;  r   r@  rO   r   r   r   r   r  I  r@   zBlastParser._set_hsp_positivec                 C   r:  )z5Record the number of gaps in the alignment (PRIVATE).N)r;  r   r@  rP   r   r   r   r   r  M  r@   zBlastParser._set_hsp_gapsc                 C   r:  )z-Record the length of the alignment (PRIVATE).N)r;  r   r@  rQ   r   r   r   r   r  Q  r@   zBlastParser._set_hsp_align_lenc                 C   r7  )z4Record the alignment string for the query (PRIVATE).N)r   r@  r   r   r   r   r   r  Y  r<  zBlastParser._set_hsp_query_seqc                 C   r7  )z7Record the alignment string for the database (PRIVATE).N)r   r@  rW   r   r   r   r   r  ]  r<  z BlastParser._set_hsp_subject_seqc                 C   sF   | j | j_t| jjt| jjksJ t| jjt| jjks!J dS )zBRecord the middle line as normally seen in BLAST report (PRIVATE).N)r   r@  rV   r5   r   rW   r   r   r   r   r  a  s   
 zBlastParser._set_hsp_midlinec                 C   r:  )z9Record the number of sequences in the database (PRIVATE).N)r;  r   r/  rz   r   r   r   r   r  h  r@   z"BlastParser._set_statistics_db_numc                 C   r:  )z7Record the number of letters in the database (PRIVATE).N)r;  r   r/  ry   r   r   r   r   r  l  r@   z"BlastParser._set_statistics_db_lenc                 C   r:  )z*Record the effective HSP length (PRIVATE).N)r;  r   r/  r   r   r   r   r   r  p  r@   z#BlastParser._set_statistics_hsp_lenc                 C   r:  )z,Record the effective search space (PRIVATE).N)rC  r   r/  r   r   r   r   r   r   t  r@   z%BlastParser._set_statistics_eff_spacec                 C   r:  )z&Karlin-Altschul parameter K (PRIVATE).NrC  r   r/  r{   r   r   r   r   r!  x  r@   z!BlastParser._set_statistics_kappac                 C   s   t | j| jjf| j_dS )z+Karlin-Altschul parameter Lambda (PRIVATE).NrL  r   r   r   r   r"  |  s   z"BlastParser._set_statistics_lambdac                 C   s   | j jt| jf | j _dS )z&Karlin-Altschul parameter H (PRIVATE).N)r/  r{   rC  r   r   r   r   r   r#    s   z#BlastParser._set_statistics_entropyc                 C   r.  z#XML v2. Start hit description item.N)r;   _hit_descr_itemr   r   r   r   r$    r0  z!BlastParser._start_hit_descr_itemc                 C   s.   | j | j | jjst| j| j_d| _dS rM  )r>  r9   rN  r?  r&   r6   r   r   r   r   r%    s   
zBlastParser._end_hit_descr_itemc                 C   s$   | j | j_| jjs| j | j_dS dS )z9XML v2. The identifier of the database sequence(PRIVATE).N)r   rN  r<   r?  rB   r   r   r   r   r&    s   
zBlastParser._end_description_idc                 C   s*   | j | j_t| jdds| j | j_dS dS )z?XML v2. The accession value of the database sequence (PRIVATE).r=   N)r   rN  r=   getattrr?  r   r   r   r   r'    s   
z&BlastParser._end_description_accessionc                 C   r7  )z,XML v2. The hit description title (PRIVATE).N)r   rN  r&   r   r   r   r   r(    r<  z"BlastParser._end_description_titlec                 C   s*   z
t | j| j_W d S  ty   Y d S w ra   )r;  r   rN  r>   r   r   r   r   r   r)    s
   z"BlastParser._end_description_taxidc                 C   s   | j | j_d S ra   )r   rN  r?   r   r   r   r   r*    s   z$BlastParser._end_description_scinameNr   )Er    r!   r"   r#   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r  r  r  r	  r
  r  r  r+  r  r  r  r  r  r  r  r  r  r,  r-  r  r  r  r  r  r  r  r  r  r  r   r!  r"  r#  r$  r%  r&  r'  r(  r)  r*  r   r   r   r   r   D  s    
6<8	

r   c                 C   sV   t | |}zt|}W n ty   tddw zt| td ty*   Y |S w )aO  Return a single Blast record (assumes just one query).

    Uses the BlastParser internally.

    This function is for use when there is one and only one BLAST
    result in your XML file.

    Use the Bio.Blast.NCBIXML.parse() function if you expect more than
    one BLAST record (i.e. if you have more than one query sequence).
    zNo records found in handleNz$More than one record found in handle)parsenextStopIterationr   )handler   iteratorrecordr   r   r   read  s   

rV  c                 c   s   ddl m} d}d}d}d}d}d}| |}	t|	tr%d}d	}d
}d
}|	s+td|	r|	|s>td||	dd f | }
t|}|j	|
_
|j|
_|j|
_|
|	d |jrn|jd }|jdd |_|V  |js[	 || | d}	}|	s|
|d nK| |}|| |	| vr|
|	d |jr|jdV  |jsn)|	| || d\}	}|| }|
|	d |jr|jdV  |js||}	}nqo|rJ |t|jdksJ t|j|	s-|	rJ |	|rJ |t|jdksJ t|jdS )a  Return an iterator a Blast record for each query.

    Incremental parser, this is an iterator that returns
    Blast records.  It uses the BlastParser internally.

    handle - file handle to and XML file to parse
    debug - integer, amount of debug information to print

    This is a generator function that returns multiple Blast records
    objects - one for each query sequence given to blast.  The file
    is read incrementally, returning complete records as they are read
    in.

    Should cope with new BLAST 2.2.14+ which gives a single XML file
    for multiple query records.

    Should also cope with XML output from older versions BLAST which
    gave multiple XML files concatenated together (giving a single file
    which strictly speaking wasn't valid XML).
    r   )expati   
   z<?xmlrF   r   s   <?xml   
    zYour XML file was emptyz5Your XML file did not start with %r... but instead %rN   Fr`   T)xml.parsersrW  rV  
isinstancebytesr   
startswithParserCreater   r   StartElementHandlerr   EndElementHandlerr   CharacterDataHandlerParser   r   rG   r5   )rS  r   rW  BLOCKMARGIN	XML_STARTNEW_LINENULLpendingtextexpat_parserblast_parserrU  r   r   r   rP    sz   





%= rP  )r   r   r   )r#   xml.saxr   xml.sax.handlerr   	Bio.Alignr   Bio.Seqr   Bio.SeqRecordr   r   r   r%   r/   r;   rA   rK   r]   ro   rv   r~   r   r   r   r   rV  rP  r   r   r   r   <module>   s6   	
!h.Cj    
i