o
    Rŀg3                     @   s   d Z ddlZddlmZ ddlmZ ddlmZ ddlm	Z	 ddlm
Z
 ed	Zed
ZdZdd ZG dd de
ZG dd de	ZdS )aU  Bio.AlignIO support for "xmfa" output from Mauve/ProgressiveMauve.

You are expected to use this module via the Bio.AlignIO functions (or the
Bio.SeqIO functions if you want to work directly with the gapped sequences).

For example, consider a progressiveMauve alignment file containing the following::

    #FormatVersion Mauve1
    #Sequence1File	a.fa
    #Sequence1Entry	1
    #Sequence1Format	FastA
    #Sequence2File	b.fa
    #Sequence2Entry	2
    #Sequence2Format	FastA
    #Sequence3File	c.fa
    #Sequence3Entry	3
    #Sequence3Format	FastA
    #BackboneFile	three.xmfa.bbcols
    > 1:0-0 + a.fa
    --------------------------------------------------------------------------------
    --------------------------------------------------------------------------------
    --------------------------------------------------------------------------------
    > 2:5417-5968 + b.fa
    TTTAAACATCCCTCGGCCCGTCGCCCTTTTATAATAGCAGTACGTGAGAGGAGCGCCCTAAGCTTTGGGAAATTCAAGC-
    --------------------------------------------------------------------------------
    CTGGAACGTACTTGCTGGTTTCGCTACTATTTCAAACAAGTTAGAGGCCGTTACCTCGGGCGAACGTATAAACCATTCTG
    > 3:9476-10076 - c.fa
    TTTAAACACCTTTTTGGATG--GCCCAGTTCGTTCAGTTGTG-GGGAGGAGATCGCCCCAAACGTATGGTGAGTCGGGCG
    TTTCCTATAGCTATAGGACCAATCCACTTACCATACGCCCGGCGTCGCCCAGTCCGGTTCGGTACCCTCCATGACCCACG
    ---------------------------------------------------------AAATGAGGGCCCAGGGTATGCTT
    =
    > 2:5969-6015 + b.fa
    -----------------------
    GGGCGAACGTATAAACCATTCTG
    > 3:9429-9476 - c.fa
    TTCGGTACCCTCCATGACCCACG
    AAATGAGGGCCCAGGGTATGCTT

This is a multiple sequence alignment with multiple aligned sections, so you
would probably load this using the Bio.AlignIO.parse() function:

    >>> from Bio import AlignIO
    >>> align = AlignIO.parse("Mauve/simple_short.xmfa", "mauve")
    >>> alignments = list(align)
    >>> for aln in alignments:
    ...     print(aln)
    ...
    Alignment with 3 rows and 240 columns
    --------------------------------------------...--- a.fa
    TTTAAACATCCCTCGGCCCGTCGCCCTTTTATAATAGCAGTACG...CTG b.fa/5416-5968
    TTTAAACACCTTTTTGGATG--GCCCAGTTCGTTCAGTTGTG-G...CTT c.fa/9475-10076
    Alignment with 2 rows and 46 columns
    -----------------------GGGCGAACGTATAAACCATTCTG b.fa/5968-6015
    TTCGGTACCCTCCATGACCCACGAAATGAGGGCCCAGGGTATGCTT c.fa/9428-9476

Additional information is extracted from the XMFA file and available through
the annotation attribute of each record::

    >>> for record in alignments[0]:
    ...     print(record.id, len(record))
    ...     print("  start: %d, end: %d, strand: %d" %(
    ...         record.annotations['start'], record.annotations['end'],
    ...         record.annotations['strand']))
    ...
    a.fa 240
      start: 0, end: 0, strand: 1
    b.fa/5416-5968 240
      start: 5416, end: 5968, strand: 1
    c.fa/9475-10076 240
      start: 9475, end: 10076, strand: -1

    N)MultipleSeqAlignment)Seq)	SeqRecord   )AlignmentIterator)SequentialAlignmentWriterzG> (?P<id>\d+):(?P<start>\d+)-(?P<end>\d+) (?P<strand>[+-]) (?P<name>.*)z]> (?P<id>\d+):(?P<start>\d+)-(?P<end>\d+) (?P<strand>[+-]) (?P<name>[^#]*) # (?P<realname>.*)z<> {seq_name}:{start}-{end} {strand} {filename} # {ugly_hack}c                 C   s8   |  d\}}}tt| d\}}|d8 }||||fS )zDReturn (name, start, end) string tuple from an identifier (PRIVATE).:-r   )splitmapint)
identifieridlocstrandstartend r   G/var/www/html/myenv/lib/python3.10/site-packages/Bio/AlignIO/MauveIO.py_identifier_splitb   s   r   c                       s2   e Zd ZdZ fddZdd Zd	ddZ  ZS )
MauveWriterzMauve/XMFA alignment writer.c                    s"   t  j|i | d| _d| _dS )zInitialize the class.FN)super__init___wrote_header_wrote_first)selfargskwargs	__class__r   r   r   m   s   
zMauveWriter.__init__c                 C   s   t |}| | _|dkrtd| jdkrtd| js=d| _| jd td|d D ]}| jd| d| d	 q-t|D ]\}}| j	||d
 qA| jd dS )zUse this to write (another) single alignment to an open file.

        Note that sequences and their annotation are recorded
        together (rather than having a block of annotation followed
        by a block of aligned sequences).
        r   zMust have at least one sequencez Non-empty sequences are requiredTz#FormatVersion Mauve1
r   z	#SequencezEntry	
)
record_idxz=
N)
lenget_alignment_length_length_of_sequences
ValueErrorr   handlewriterange	enumerate_write_record)r   	alignmentcountiidxrecordr   r   r   write_alignments   s   

zMauveWriter.write_alignmentr   c           	      C   sB  | j t|jkrtd|j}z	tt|j}W n ty'   t|d }Y nw d|jv rxd|jv rxd|jd  d|jd  }d|jd d  d|jd  }|t| d |krd|dt|  }|t| d |krx|dt|  }d|jv rd|jv rd|jv rtj	||jd d |jd |jd dkrd	nd|jd
 |j
d}d}ntj	|ddd	|jd
 |j
d}d}d|v sd|v r|s| jsd| _tj	|ddd	|jd
 |j
d}|dddd}| j|d  dS dS |dddd}| j|d  tdt|jdD ]}| j|j||d   d qdS )z/Write a single SeqRecord to the file (PRIVATE).z%Sequences must all be the same lengthr   r   r   /r	   Nr   +z.fa)seq_namer   r   r   filename	ugly_hackFr   Tz:0-0 z:1-0 r     z

P   )r$   r"   seqr%   namestrr   annotationsID_LINE_FMTformatr   r   replacer&   r'   r(   )	r   r/   r!   r3   suffix0suffix1id_linelacking_annotationsr-   r   r   r   r*      sn    


$zMauveWriter._write_record)r   )__name__
__module____qualname____doc__r   r0   r*   __classcell__r   r   r   r   r   j   s
    !r   c                   @   s*   e Zd ZU dZg Zee ed< dd ZdS )MauveIteratorzMauve xmfa alignment iterator._idsc              	   C   s2  | j }| }|st|r!| dr!| }|r!| dsi }i }d}d}	 |s-n| }|dr7n|drt|}|sOt|}|sOtd||	d}i }	d	D ].}
z#|	|
}|
d
krpt
|}|dkrp|d8 }|
dkrxt
|}||	|
< W qX ty   Y qXw |	||< || jvr| j| ||d |}n|rJ |du rtd||  |7  < | }q*t|t| jksJ | j| _|| _| jr|rtttt| }g }| jD ]}||vst|| dkst|| dkrd| }n|| }|t|krtd||vrq|| d
 dks#|| d dkrNdjdi || }d|| v r;|| d }n|| d }||dkrM||7 }nd|| v r\|| d }n|| d }tt|||d}|| d
 |jd
< || d |jd< || d dkrdnd|jd< || qt|S t)z)Parse the next alignment from the handle.#FNT=>zMalformed header line: %sr   )r   r   r   r   r:   realnamer   r   r   r    z#Saw sequence before definition liner	   z8Sequences have different lengths, or repeated identifierz/{start}-{end}rN   r:   )r   r:   r   r2   r   )r&   readlineStopIterationstrip
startswithXMFA_HEADER_REGEX_BIOPYTHONmatchXMFA_HEADER_REGEXr%   groupr   
IndexErrorrJ   append
setdefaultr"   ids	sequencesmaxr   listvaluesr>   r,   r   r   r<   r   )r   r&   lineseqsseq_regionspassed_end_alignment	latest_idm	parsed_idparsed_datakeyvaluealignment_lengthrecordsr   r9   suffixcorrected_idr/   r   r   r   __next__   s   







0
(

$zMauveIterator.__next__N)	rD   rE   rF   rG   rJ   r_   r;   __annotations__ro   r   r   r   r   rI      s   
 rI   )rG   re	Bio.Alignr   Bio.Seqr   Bio.SeqRecordr   
Interfacesr   r   compilerW   rU   r=   r   r   rI   r   r   r   r   <module>   s"   Iu