o
    Rŀge                     @   s<  d Z ddlZddlmZ ddlmZ ddlmZ ddlmZ ddlmZ dZ	e
d	Zd
Ze
eZe
e Ze
dZe
dZe
dZdefdefdefdefdefdefdefdefdefdefd
ZdZdZdZdZd(ddZdd Zd d! ZG d"d# d#ZG d$d% d%eZed&krdd'l m!Z! e!  dS dS ))a(  Bio.SearchIO support for Bill Pearson's FASTA tools.

This module adds support for parsing FASTA outputs. FASTA is a suite of
programs that finds regions of local or global similarity between protein
or nucleotide sequences, either by searching databases or identifying
local duplications.

Bio.SearchIO.FastaIO was tested on the following FASTA flavors and versions:

    - flavors: fasta, ssearch, tfastx
    - versions: 35, 36

Other flavors and/or versions may introduce some bugs. Please file a bug report
if you see such problems to Biopython's bug tracker.

More information on FASTA are available through these links:

    - Website: http://fasta.bioch.virginia.edu/fasta_www2/fasta_list2.shtml
    - User guide: http://fasta.bioch.virginia.edu/fasta_www2/fasta_guide.pdf


Supported Formats
=================

Bio.SearchIO.FastaIO supports parsing and indexing FASTA outputs triggered by
the -m 10 flag. Other formats that mimic other programs (e.g. the BLAST tabular
format using the -m 8 flag) may be parseable but using SearchIO's other parsers
(in this case, using the 'blast-tab' parser).


fasta-m10
=========

Note that in FASTA -m 10 outputs, HSPs from different strands are considered to
be from different hits. They are listed as two separate entries in the hit
table. FastaIO recognizes this and will group HSPs with the same hit ID into a
single Hit object, regardless of strand.

FASTA also sometimes output extra sequences adjacent to the HSP match. These
extra sequences are discarded by FastaIO. Only regions containing the actual
sequence match are extracted.

The following object attributes are provided:

+-----------------+-------------------------+----------------------------------+
| Object          | Attribute               | Value                            |
+=================+=========================+==================================+
| QueryResult     | description             | query sequence description       |
|                 +-------------------------+----------------------------------+
|                 | id                      | query sequence ID                |
|                 +-------------------------+----------------------------------+
|                 | program                 | FASTA flavor                     |
|                 +-------------------------+----------------------------------+
|                 | seq_len                 | full length of query sequence    |
|                 +-------------------------+----------------------------------+
|                 | target                  | target search database           |
|                 +-------------------------+----------------------------------+
|                 | version                 | FASTA version                    |
+-----------------+-------------------------+----------------------------------+
| Hit             | seq_len                 | full length of the hit sequence  |
+-----------------+-------------------------+----------------------------------+
| HSP             | bitscore                | \*_bits line                     |
|                 +-------------------------+----------------------------------+
|                 | evalue                  | \*_expect line                   |
|                 +-------------------------+----------------------------------+
|                 | ident_pct               | \*_ident line                    |
|                 +-------------------------+----------------------------------+
|                 | init1_score             | \*_init1 line                    |
|                 +-------------------------+----------------------------------+
|                 | initn_score             | \*_initn line                    |
|                 +-------------------------+----------------------------------+
|                 | opt_score               | \*_opt line, \*_s-w opt line     |
|                 +-------------------------+----------------------------------+
|                 | pos_pct                 | \*_sim line                      |
|                 +-------------------------+----------------------------------+
|                 | sw_score                | \*_score line                    |
|                 +-------------------------+----------------------------------+
|                 | z_score                 | \*_z-score line                  |
+-----------------+-------------------------+----------------------------------+
| HSPFragment     | aln_annotation          | al_cons block, if present        |
| (also via HSP)  +-------------------------+----------------------------------+
|                 | hit                     | hit sequence                     |
|                 +-------------------------+----------------------------------+
|                 | hit_end                 | hit sequence end coordinate      |
|                 +-------------------------+----------------------------------+
|                 | hit_start               | hit sequence start coordinate    |
|                 +-------------------------+----------------------------------+
|                 | hit_strand              | hit sequence strand              |
|                 +-------------------------+----------------------------------+
|                 | query                   | query sequence                   |
|                 +-------------------------+----------------------------------+
|                 | query_end               | query sequence end coordinate    |
|                 +-------------------------+----------------------------------+
|                 | query_start             | query sequence start coordinate  |
|                 +-------------------------+----------------------------------+
|                 | query_strand            | query sequence strand            |
+-----------------+-------------------------+----------------------------------+

    N)SearchIndexer)Hit)HSP)HSPFragment)QueryResult)FastaM10ParserFastaM10Indexerz2t?fast[afmsxy]|pr[sf][sx]|lalign|[gs]?[glso]searchz'>>>(.+?)\s+(.*?) *- (\d+) (?:aa|nt)\s*$z^; [a-z]+(_[ \w-]+):\s+(.*)$z^-*z-*$initn_scoreinit1_score	opt_scorez_scorebitscoreevaluesw_score	ident_pctpos_pct)
_initn_init1_optz_s-w optz_z-score_bits_expect_score_ident_sim          c                 C   sP   |D ]#}| dd\}}|| vr%t|| j}t|g}t|g}| | q| S )z;Append Hits without alignments into QueryResults (PRIVATE). r   )splitr   idr   r   append)qresulthit_rowshit_rowhit_id	remainderfraghsphitr   r   H/var/www/html/myenv/lib/python3.10/site-packages/Bio/SearchIO/FastaIO.py_set_qresult_hits   s   


r+   c                 C   s   d}dD ]@}d|vrD|| }t |\}}ttt|d d}ttt|d d}|| }|| | }|d || || d< qt|d d t|d d kritdt|d d t|d d f d| jv r| jd |d	 | jd< t| jd t|d d ksJ |d d
 |d d
 ksJ |d d
 }	|	dkrdnd}
t	| j
d|
 dD ]]}t|| d }t|| d }t	| j
|d t||d  t	| j
|d t|| t	| j
||| d  |
dkr||krt	| j
|d d qt	| j
|d d qt	| j
|d d qd	S )a  Set HSPs sequences (PRIVATE).

    :param hsp: HSP whose properties will be set
    :type hsp: HSP
    :param parsed: parsed values of the HSP attributes
    :type parsed: dictionary {string: object}
    :param program: program name
    :type program: string

    r   )r)   querytfastseqr,   r)   zLength mismatch: %r %r
similarityN_typeDDNAproteinmolecule_type_start_stopr   _end_strand)_get_aln_slice_coordslenresearch_RE_START_EXCgroup_RE_END_EXC
ValueErroraln_annotationsetattrfragmentintminmax)r(   parsedprogramstartseq_typepseqstop	start_adjstop_adjtype_valr4   endr   r   r*   _set_hsp_seqs   sH    
"
rR   c                 C   s   | d }| d}t| d }t| d }t| d }||kr*|| }|| d }n
|| }|| d }||d7 }d|krI||k rI|t|ksStd|||| f ||fS )	a)  Get HSPs sequences (PRIVATE).

    To get the actual pairwise alignment sequences, we must first
    translate the un-gapped sequence based coordinates into positions
    in the gapped sequence (which may have a flanking region shown
    using leading - characters).  To date, I have never seen any
    trailing flanking region shown in the m10 file, but the
    following code should also cope with that.

    Note that this code seems to work fine even when the "sq_offset"
    entries are present as a result of using the -X command line option.
    r.   -_display_startr5   r6   r   r   z.Problem with sequence start/stop,
%s[%i:%i]
%s)striprE   countr;   rA   )
parsed_hspr.   seq_stripped
disp_startrJ   rM   r   r   r*   r:      s$   

r:   c                   @   sB   e Zd ZdZdddZdd Zdd Zd	d
 Zdd Zdd Z	dS )r   z5Parser for Bill Pearson's FASTA suite's -m 10 output.Fc                 C   s   || _ |  | _dS zInitialize the class.N)handle_parse_preamble	_preamble)selfr[    _FastaM10Parser__parse_hit_tabler   r   r*   __init__  s   zFastaM10Parser.__init__c                 c   s"    |   D ]	}|j|_|V  qdS )z8Iterate over FastaM10Parser object yields query results.N)_parse_qresultdescription)r^   r"   r   r   r*   __iter__  s
   zFastaM10Parser.__iter__c                 C   sh   i }	 | j  }|drn!|dr|dd |d< ntt| }|r.|d|d< q|| _	|S )	z@Parse the Fasta preamble for Fasta flavor and version (PRIVATE).TQueryz versionr   r   versionr   rI   )
r[   readline
startswithr   r<   match	_RE_FLAVSlowerr?   line)r^   preamblerk   
flav_matchr   r   r*   r\     s   


zFastaM10Parser._parse_preamblec                 C   s4   g }	 | j  }|r| rn|d q|| _|S )zParse hit table rows.T )r[   rf   rU   r!   rk   )r^   r#   rk   r   r   r*   __parse_hit_table0  s   

z FastaM10Parser.__parse_hit_tablec                 c   s    d}g }d}d}d}d}| j }	 |dr|}n'| dks!|s$|}n|d	s0d	|v r0|}n|d	r>| d
kr>|}nd}|dur||krS|  }| j }n||kr^t||V  n||kr|durlt||V  tt	|}	|	
d}
|	
d}|	
d}t|
d}t||_| j }dd |dD d  |_|dur||_| j D ]
\}}t||| q| j }nN||kr|dd |jsJ || |
D ],\}}|j|_|j|_|j|vr|| q|jD ]}||jksJ ||j | qq| j }n| j }q|| _ dS )zParse query result (PRIVATE).Nr   r         TzThe best scores are:>>>///>>>>>><<<r   )r    c                 S   s   g | ]}|r|qS r   r   ).0xr   r   r*   
<listcomp>p  s    z1FastaM10Parser._parse_qresult.<locals>.<listcomp>r   )rk   rg   rU   r_   r[   rf   r+   r<   r=   _RE_ID_DESC_SEQLENr?   r   rE   seq_lenr   targetrb   r]   itemsrC   r    
_parse_hitquery_descriptionr!   hspsquery_strand)r^   r"   r#   state_QRES_NEWstate_QRES_HITTABstate_QRES_CONTENTstate_QRES_ENDrk   
qres_stateregxquery_idry   desckeyvaluer)   strandr(   r   r   r*   ra   <  sr   












DzFastaM10Parser._parse_qresultc                 c   s   	 | j  }|drnqt}d}g }d}d}d}d}		 | j  | _| j dv s5| jdsrd| jv rr|tkrF|d d  | 7  < n|tkrV|jd  |d	7  < t	||| j
d
  t|}
||
_|	|
_|
|fV  g }dS |dr|rt	||| j
d
  t|}
||
_|	|
_|
|fV  g }z|dd  dd\}}W n ty   |dd  ddd }d}Y nw t||}t|g}|| t}i i d}n-|drt	||| j
d
  t||}t|g}|| t}i i d}n|drF|tkr'||dd dd st|d|t}d|d d< n|tkrE||dd dd s=J t}d|d d< n|drUt}d|jjd< n|drtt| }|d}|d}|tkr|tv rt| \}}|tur||}|dv r|d9 }t||| nl|tkr||d |< n`|tkr|dkrt|}	nQ||d |< nJtd| d|vsJ |tkr|d d  | 7  < n+|tkr|d d  | 7  < n|tkr|jjd  |d	7  < ntd| | j}q)z(Parse hit on query identifier (PRIVATE).Tz>>N)rt   rr   rs   r)   r.   r/   z
rI   r   r   r   r   rn   )r,   r)   z>-->z vs r,   z	; al_cons;)r   r   d   _lenzUnexpected line: %r)r[   rf   rg   _STATE_NONErk   rU   _STATE_HIT_BLOCK_STATE_CONS_BLOCKrB   rR   r]   r   rb   ry   r   rA   r   r   r!   _STATE_QUERY_BLOCKrD   r<   r=   _RE_ATTRr?   _HSP_ATTR_MAPstrrC   rE   )r^   r   rk   stater   hsp_listr(   rW   hit_descry   r)   r%   r'   r   namer   	attr_namecasterr   r   r*   r|     s   






 








"












zFastaM10Parser._parse_hitN)F)
__name__
__module____qualname____doc__r`   rc   r\   r_   ra   r|   r   r   r   r*   r     s    
Sr   c                   @   s,   e Zd ZdZeZdd Zdd Zdd ZdS )	r   z<Indexer class for Bill Pearson's FASTA suite's -m 10 output.c                 C   s   t | | dS rZ   )r   r`   )r^   filenamer   r   r*   r`     s   zFastaM10Indexer.__init__c                 c   s    | j }|d | }d}d}| }	 | }||s6||v r6tt|}|d	 }|t
| }|dur^|sF|||| fV  dS | }||s]||v r]|||| fV  |}n| }q)zXIterate over FastaM10Indexer; yields query results' keys, start offsets, offset lengths.r   N   >>>Tr   )_handleseektellrf   rg   r<   r=   _RE_ID_DESC_SEQLEN_IDXr?   decoder;   )r^   r[   start_offsetqresult_key
query_markrk   
end_offsetr   r   r   r*   rc     s0   
zFastaM10Indexer.__iter__c                 C   s   | j }d}d}|d | }	 ||7 }| }||s#||v r#nq|| | }	 |s5	 |d S ||7 }| }||sK||v rK	 |d S q.)z6Return the raw record from the file as a bytes string.    r   r   Ts   >>><<<
)r   r   rf   rg   )r^   offsetr[   qresult_rawr   rk   r   r   r*   get_raw5  s.   

	zFastaM10Indexer.get_rawN)	r   r   r   r   r   _parserr`   rc   r   r   r   r   r*   r     s    r   __main__)run_doctest)r   )"r   r<   Bio.SearchIO._indexr   Bio.SearchIO._modelr   r   r   r   __all__compileri   _PTR_ID_DESC_SEQLENrx   encoder   r   r>   r@   rE   floatr   r   r   r   r   r+   rR   r:   r   r   r   
Bio._utilsr   r   r   r   r*   <module>   sR   d





?"  G
