o
    RŀgY                     @   sT  d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	m
Z
 ddl	mZ d	Zd
ZdZdZdZdZdd Zdd Zdd Zdd Zdd ZedZ	d8ddZdd edD Zd d! Zd"d# Zg d$Zd%d& Zd'd( Z d)d* Z!G d+d, d,Z"G d-d. d.e
Z#d/d0 Z$G d1d2 d2e#Z%G d3d4 d4eZ&e'd5krdd6l(m)Z) e)dd7 dS dS )9a$  Bio.SeqIO support for the binary Standard Flowgram Format (SFF) file format.

SFF was designed by 454 Life Sciences (Roche), the Whitehead Institute for
Biomedical Research and the Wellcome Trust Sanger Institute. SFF was also used
as the native output format from early versions of Ion Torrent's PGM platform
as well. You are expected to use this module via the Bio.SeqIO functions under
the format name "sff" (or "sff-trim" as described below).

For example, to iterate over the records in an SFF file,

    >>> from Bio import SeqIO
    >>> for record in SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff", "sff"):
    ...     print("%s %i %s..." % (record.id, len(record), record.seq[:20]))
    ...
    E3MFGYR02JWQ7T 265 tcagGGTCTACATGTTGGTT...
    E3MFGYR02JA6IL 271 tcagTTTTTTTTGGAAAGGA...
    E3MFGYR02JHD4H 310 tcagAAAGACAAGTGGTATC...
    E3MFGYR02GFKUC 299 tcagCGGCCGGGCCTCTCAT...
    E3MFGYR02FTGED 281 tcagTGGTAATGGGGGGAAA...
    E3MFGYR02FR9G7 261 tcagCTCCGTAAGAAGGTGC...
    E3MFGYR02GAZMS 278 tcagAAAGAAGTAAGGTAAA...
    E3MFGYR02HHZ8O 221 tcagACTTTCTTCTTTACCG...
    E3MFGYR02GPGB1 269 tcagAAGCAGTGGTATCAAC...
    E3MFGYR02F7Z7G 219 tcagAATCATCCACTTTTTA...

Each SeqRecord object will contain all the annotation from the SFF file,
including the PHRED quality scores.

    >>> print("%s %i" % (record.id, len(record)))
    E3MFGYR02F7Z7G 219
    >>> print("%s..." % record.seq[:10])
    tcagAATCAT...
    >>> print("%r..." % (record.letter_annotations["phred_quality"][:10]))
    [22, 21, 23, 28, 26, 15, 12, 21, 28, 21]...

Notice that the sequence is given in mixed case, the central upper case region
corresponds to the trimmed sequence. This matches the output of the Roche
tools (and the 3rd party tool sff_extract) for SFF to FASTA.

    >>> print(record.annotations["clip_qual_left"])
    4
    >>> print(record.annotations["clip_qual_right"])
    134
    >>> print(record.seq[:4])
    tcag
    >>> print("%s...%s" % (record.seq[4:20], record.seq[120:134]))
    AATCATCCACTTTTTA...CAAAACACAAACAG
    >>> print(record.seq[134:])
    atcttatcaacaaaactcaaagttcctaactgagacacgcaacaggggataagacaaggcacacaggggataggnnnnnnnnnnn

The annotations dictionary also contains any adapter clip positions
(usually zero), and information about the flows. e.g.

    >>> len(record.annotations)
    12
    >>> print(record.annotations["flow_key"])
    TCAG
    >>> print(record.annotations["flow_values"][:10])
    (83, 1, 128, 7, 4, 84, 6, 106, 3, 172)
    >>> print(len(record.annotations["flow_values"]))
    400
    >>> print(record.annotations["flow_index"][:10])
    (1, 2, 3, 2, 2, 0, 3, 2, 3, 3)
    >>> print(len(record.annotations["flow_index"]))
    219

Note that to convert from a raw reading in flow_values to the corresponding
homopolymer stretch estimate, the value should be rounded to the nearest 100:

    >>> print("%r..." % [int(round(value, -2)) // 100
    ...                  for value in record.annotations["flow_values"][:10]])
    ...
    [1, 0, 1, 0, 0, 1, 0, 1, 0, 2]...

If a read name is exactly 14 alphanumeric characters, the annotations
dictionary will also contain meta-data about the read extracted by
interpreting the name as a 454 Sequencing System "Universal" Accession
Number. Note that if a read name happens to be exactly 14 alphanumeric
characters but was not generated automatically, these annotation records
will contain nonsense information.

    >>> print(record.annotations["region"])
    2
    >>> print(record.annotations["time"])
    [2008, 1, 9, 16, 16, 0]
    >>> print(record.annotations["coords"])
    (2434, 1658)

As a convenience method, you can read the file with SeqIO format name "sff-trim"
instead of "sff" to get just the trimmed sequences (without any annotation
except for the PHRED quality scores and anything encoded in the read names):

    >>> from Bio import SeqIO
    >>> for record in SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff", "sff-trim"):
    ...     print("%s %i %s..." % (record.id, len(record), record.seq[:20]))
    ...
    E3MFGYR02JWQ7T 260 GGTCTACATGTTGGTTAACC...
    E3MFGYR02JA6IL 265 TTTTTTTTGGAAAGGAAAAC...
    E3MFGYR02JHD4H 292 AAAGACAAGTGGTATCAACG...
    E3MFGYR02GFKUC 295 CGGCCGGGCCTCTCATCGGT...
    E3MFGYR02FTGED 277 TGGTAATGGGGGGAAATTTA...
    E3MFGYR02FR9G7 256 CTCCGTAAGAAGGTGCTGCC...
    E3MFGYR02GAZMS 271 AAAGAAGTAAGGTAAATAAC...
    E3MFGYR02HHZ8O 150 ACTTTCTTCTTTACCGTAAC...
    E3MFGYR02GPGB1 221 AAGCAGTGGTATCAACGCAG...
    E3MFGYR02F7Z7G 130 AATCATCCACTTTTTAACGT...

Looking at the final record in more detail, note how this differs to the
example above:

    >>> print("%s %i" % (record.id, len(record)))
    E3MFGYR02F7Z7G 130
    >>> print("%s..." % record.seq[:10])
    AATCATCCAC...
    >>> print("%r..." % record.letter_annotations["phred_quality"][:10])
    [26, 15, 12, 21, 28, 21, 36, 28, 27, 27]...
    >>> len(record.annotations)
    4
    >>> print(record.annotations["region"])
    2
    >>> print(record.annotations["coords"])
    (2434, 1658)
    >>> print(record.annotations["time"])
    [2008, 1, 9, 16, 16, 0]
    >>> print(record.annotations["molecule_type"])
    DNA

You might use the Bio.SeqIO.convert() function to convert the (trimmed) SFF
reads into a FASTQ file (or a FASTA file and a QUAL file), e.g.

    >>> from Bio import SeqIO
    >>> from io import StringIO
    >>> out_handle = StringIO()
    >>> count = SeqIO.convert("Roche/E3MFGYR02_random_10_reads.sff", "sff",
    ...                       out_handle, "fastq")
    ...
    >>> print("Converted %i records" % count)
    Converted 10 records

The output FASTQ file would start like this:

    >>> print("%s..." % out_handle.getvalue()[:50])
    @E3MFGYR02JWQ7T
    tcagGGTCTACATGTTGGTTAACCCGTACTGATT...

Bio.SeqIO.index() provides memory efficient random access to the reads in an
SFF file by name. SFF files can include an index within the file, which can
be read in making this very fast. If the index is missing (or in a format not
yet supported in Biopython) the file is indexed by scanning all the reads -
which is a little slower. For example,

    >>> from Bio import SeqIO
    >>> reads = SeqIO.index("Roche/E3MFGYR02_random_10_reads.sff", "sff")
    >>> record = reads["E3MFGYR02JHD4H"]
    >>> print("%s %i %s..." % (record.id, len(record), record.seq[:20]))
    E3MFGYR02JHD4H 310 tcagAAAGACAAGTGGTATC...
    >>> reads.close()

Or, using the trimmed reads:

    >>> from Bio import SeqIO
    >>> reads = SeqIO.index("Roche/E3MFGYR02_random_10_reads.sff", "sff-trim")
    >>> record = reads["E3MFGYR02JHD4H"]
    >>> print("%s %i %s..." % (record.id, len(record), record.seq[:20]))
    E3MFGYR02JHD4H 292 AAAGACAAGTGGTATCAACG...
    >>> reads.close()

You can also use the Bio.SeqIO.write() function with the "sff" format. Note
that this requires all the flow information etc, and thus is probably only
useful for SeqRecord objects originally from reading another SFF file (and
not the trimmed SeqRecord objects from parsing an SFF file as "sff-trim").

As an example, let's pretend this example SFF file represents some DNA which
was pre-amplified with a PCR primers AAAGANNNNN. The following script would
produce a sub-file containing all those reads whose post-quality clipping
region (i.e. the sequence after trimming) starts with AAAGA exactly (the non-
degenerate bit of this pretend primer):

    >>> from Bio import SeqIO
    >>> records = (record for record in
    ...            SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff", "sff")
    ...            if record.seq[record.annotations["clip_qual_left"]:].startswith("AAAGA"))
    ...
    >>> count = SeqIO.write(records, "temp_filtered.sff", "sff")
    >>> print("Selected %i records" % count)
    Selected 2 records

Of course, for an assembly you would probably want to remove these primers.
If you want FASTA or FASTQ output, you could just slice the SeqRecord. However,
if you want SFF output we have to preserve all the flow information - the trick
is just to adjust the left clip position!

    >>> from Bio import SeqIO
    >>> def filter_and_trim(records, primer):
    ...     for record in records:
    ...         if record.seq[record.annotations["clip_qual_left"]:].startswith(primer):
    ...             record.annotations["clip_qual_left"] += len(primer)
    ...             yield record
    ...
    >>> records = SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff", "sff")
    >>> count = SeqIO.write(filter_and_trim(records, "AAAGA"),
    ...                     "temp_filtered.sff", "sff")
    ...
    >>> print("Selected %i records" % count)
    Selected 2 records

We can check the results, note the lower case clipped region now includes the "AAAGA"
sequence:

    >>> for record in SeqIO.parse("temp_filtered.sff", "sff"):
    ...     print("%s %i %s..." % (record.id, len(record), record.seq[:20]))
    ...
    E3MFGYR02JHD4H 310 tcagaaagaCAAGTGGTATC...
    E3MFGYR02GAZMS 278 tcagaaagaAGTAAGGTAAA...
    >>> for record in SeqIO.parse("temp_filtered.sff", "sff-trim"):
    ...     print("%s %i %s..." % (record.id, len(record), record.seq[:20]))
    ...
    E3MFGYR02JHD4H 287 CAAGTGGTATCAACGCAGAG...
    E3MFGYR02GAZMS 266 AGTAAGGTAAATAACAAACG...
    >>> import os
    >>> os.remove("temp_filtered.sff")

For a description of the file format, please see the Roche manuals and:
http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=formats&m=doc&s=formats

    N)StreamModeError)Seq)	SeqRecord   )SequenceIterator)SequenceWriter    s   .sffs   .hshs   .srts   .mft   c                 C   s  d}dt |ksJ | d}|stdt|dk r tdzt ||\}}}}}}}	}
}}}}W n ty@   tddw |tt	t
fv rLtd|tkrWtd|||||fd	kritd
||||f |dkrstd| |dk|	dkA rtd||	f | |d}| |d}|d dksJ || | d }d|  krdk sJ | J || |t|krddl}ddlm} |d| | |||	|
|||fS )a  Read in an SFF file header (PRIVATE).

    Assumes the handle is at the start of the file, will read forwards
    though the header and leave the handle pointing at the first record.
    Returns a tuple of values from the header (header_length, index_offset,
    index_length, number_of_reads, flows_per_read, flow_chars, key_sequence)

    >>> with open("Roche/greek.sff", "rb") as handle:
    ...     values = _sff_file_header(handle)
    ...
    >>> print(values[0])
    840
    >>> print(values[1])
    65040
    >>> print(values[2])
    256
    >>> print(values[3])
    24
    >>> print(values[4])
    800
    >>> values[-1]
    'TCAG'

    z>4s4BQIIHHHB   zEmpty file.z*File too small to hold a valid SFF header.z(SFF files must be opened in binary mode.Nz0Handle seems to be at SFF index block, not startz#SFF file did not start '.sff', but )r   r   r   r   z.Unsupported SFF version in header, %i.%i.%i.%ir   z%Flowgram format code %i not supportedr   z#Index offset %i but index length %iASCII   BiopythonParserWarningzQYour SFF file is invalid, post header %i byte null padding region contained data.)structcalcsizeread
ValueErrorlenunpack	TypeErrorr   _hsh_srt_mft_sffdecodecount_nullwarningsBior   warn)handlefmtdatamagic_numberver0ver1ver2ver3index_offsetindex_lengthnumber_of_readsheader_length
key_lengthnumber_of_flows_per_readflowgram_format
flow_charskey_sequencepaddingr   r    r2   C/var/www/html/myenv/lib/python3.10/site-packages/Bio/SeqIO/SffIO.py_sff_file_header   sz   %



$r4   c                 c   s   |  d t| \}}}}}}}d}t|}	d| }
t|
}dtdks*J dtdks3J dtdks<J |	d dksDJ t|D ]}|  }||krq|| }|d rb|d|d  7 }|d dksjJ |  | |}| |	}t||\}}}}}}}|d	k s|d dkrtd
||f | |	 }||	 | }| |
t|krddl}ddlm} |d| | || |  ksJ |d|  }|  |d |d }|rd| }| |
t|krddl}ddlm} |d| | ||fV  qH|  d dkrtddS )zGenerate an index by scanning though all the reads in an SFF file (PRIVATE).

    This is a slow but generic approach if we can't parse the provided index
    (if present).

    Will use the handle seek/tell functions.
    r   >2HI4H>%iHr   >B>s>cr   
   z,Malformed read header, says length is %i:
%rNr   IYour SFF file is invalid, post name %i byte padding region contained data   LYour SFF file is invalid, post quality %i byte padding region contained dataz4After scanning reads, did not end on a multiple of 8)seekr4   r   r   rangetellr   r   r   r   r   r   r   r   r   r   )r    r+   r(   r)   r*   r-   r/   r0   read_header_fmtread_header_sizeread_flow_fmtread_flow_sizer   record_offsetoffsetr"   read_header_lengthname_lengthseq_lenclip_qual_leftclip_qual_rightclip_adapter_leftclip_adapter_rightnamer1   r   r   sizer2   r2   r3   _sff_do_slow_indexf  s   
	




	rP   c              	   C   s  |  d t| \}}}}}}}|  |ksJ |r|s td|  | d}t|}	| |	}
|
s;td||f t|
|	k rJtd|||
f t||
\}}}}}|t	kr||||fdkrktd||||f d}t|}t|| |\}}||	| | | krtd	||	|||f ||||||	 | |||	 | | |fS |t
kr||||fdkrtd
||||f | d}
|
td krtd||||dd||	 d ||	 d fS |tkrtdtd|d|
)a  Locate any existing Roche style XML meta data and read index (PRIVATE).

    Makes a number of hard coded assumptions based on reverse engineered SFF
    files from Roche 454 machines.

    Returns a tuple of read count, SFF "index" offset and size, XML offset
    and size, and the actual read index offset and size.

    Raises a ValueError for unsupported or non-Roche index blocks.
    r   z!No index present in this SFF filez>4s4BzLPremature end of file? Expected index of size %i at offset %i, found nothingzGPremature end of file? Expected index of size %i at offset %i, found %r)1   .   0   rS   z5Unsupported version in .mft index header, %i.%i.%i.%iz>LLz@Problem understanding .mft index header, %i != %i + %i + %i + %iz5Unsupported version in .srt index header, %i.%i.%i.%i   z3Did not find expected null four bytes in .srt indexzDHash table style indexes (.hsh) in SFF files are not (yet) supportedzUnknown magic number z in SFF index header:
)r>   r4   r@   r   r   r   r   r   r   r   r   r   r   )r    r+   r(   r)   r*   r-   r/   r0   r!   fmt_sizer"   r#   r$   r%   r&   r'   fmt2	fmt2_sizexml_size	data_sizer2   r2   r3   _sff_find_roche_index  s   
	











rZ   c           	      C   s@   t | \}}}}}}}}|r|std| | | | S )a  Read any Roche style XML manifest data in the SFF "index".

    The SFF file format allows for multiple different index blocks, and Roche
    took advantage of this to define their own index block which also embeds
    an XML manifest string. This is not a publicly documented extension to
    the SFF file format, this was reverse engineered.

    The handle should be to an SFF file opened in binary mode. This function
    will use the handle seek/tell functions and leave the handle in an
    arbitrary location.

    Any XML manifest found is returned as a Python string, which you can then
    parse as appropriate, or reuse when writing out SFF files with the
    SffWriter class.

    Returns a string, or raises a ValueError if an Roche manifest could not be
    found.
    zNo XML manifest found)rZ   r   r>   r   r   )	r    r*   r+   r(   r)   
xml_offsetrX   read_index_offsetread_index_sizer2   r2   r3   ReadRocheXmlManifest&  s   
r^   c                 c   s   t | \}}}}}}}}| | d}	t|D ]]}
| d}	 | d}|s+td||7 }|tkr4nq |dd tksEJ |dd |dd  }t|	|dd \}}}}}|d	|  d
|  d|  }|rptd||fV  q| 	 || krtd| 	 || f dS )a  Read any existing Roche style read index provided in the SFF file (PRIVATE).

    Will use the handle seek/tell functions.

    This works on ".srt1.00" and ".mft1.00" style Roche SFF index blocks.

    Roche SFF indices use base 255 not 256, meaning we see bytes in range the
    range 0 to 254 only. This appears to be so that byte 0xFF (character 255)
    can be used as a marker character to separate entries (required if the
    read name lengths vary).

    Note that since only four bytes are used for the read offset, this is
    limited to 255^4 bytes (nearly 4GB). If you try to use the Roche sfffile
    tool to combine SFF files beyond this limit, they issue a warning and
    omit the index (and manifest).
    z>5B   Tr   zPremature end of file!Ni      z,Expected a null terminator to the read name.z#Problem with index length? %i vs %i)
rZ   r>   r?   r   r   _flagr   r   r   r@   )r    r*   r+   r(   r)   r[   rX   r\   r]   r!   r   r"   morerN   off4off3off2off1off0rF   r2   r2   r3   _sff_read_roche_indexJ  sJ   


 rk   z^[a-zA-Z0-9]{14}$Fc              	   C   s  d}t |}d| }t |}t || |\}	}
}}}}}|r&|d8 }|r,|d8 }|	dk s6|	d dkr<td|	 | |
 }|	| |
 }| |t|kreddl}dd	l	m
} |d
| | | |}d| }| |}| |}tt || |}||d  d }|rd| }| |t|krddl}dd	l	m
} |d| | t||}|r|rt||}n
|}n|r|}n|}|r||krddl}dd	l	m
} |d| d}g }n|||  }||| }i }nE||krddl}dd	l	m
} |d| | }n|d|  |||   ||d   }t ||t ||||||||d}tt|rPt||d< t||d< t||d< d|d< tt|||d|d}t|jd| |S )zFParse the next read in the file, return data as a SeqRecord (PRIVATE).r5   r6   r   r:   r   r   (Malformed read header, says length is %iNr   r;   >%iBr<   r=   z9Overlapping clip values in SFF record, trimmed to nothing z%Overlapping clip values in SFF record)flow_values
flow_indexr/   flow_keyrJ   rK   rL   rM   timeregioncoordsDNAmolecule_type)idrN   descriptionannotationsphred_quality)r   r   r   r   r   r   r   r   r   r   r   r   listmaxminupperlowerrematch_valid_UAN_read_name_get_read_time_get_read_region_get_read_xyr   r   dict__setitem___per_letter_annotations)r    r-   r/   r0   trimrA   rB   rC   rD   rG   rH   rI   rJ   rK   rL   rM   rN   r1   r   r   ro   temp_fmtrp   seqquals	clip_left
clip_rightry   recordr2   r2   r3   _sff_read_seq_record  s   











r   c                 C   s   g | ]}d | qS )$   r2   ).0ir2   r2   r3   
<listcomp>  s    r   r_   c                 C   s   d}t | ddd tD ]K\}}dt|  krdkr%n nt|d }n,dt|  kr1dkr:n nt|d }nd	t|  krFd
krOn nt|d	 }nd}||| 7 }q|S )zCInterpret a string as a base-36 number as per 454 manual (PRIVATE).r   Nr`   rS   9      A   Z   a   z   )zip_powers_of_36ord)stringtotalcpowervalr2   r2   r3   _string_as_base_36  s   r   c                 C   s   t | dd }t|dS )zBExtract coordinates from last 5 characters of read name (PRIVATE).	   Ni   )r   divmod)	read_namenumberr2   r2   r3   r   /  s   
r   )i p$i 0* iQ i  <   c                 C   sT   g }t | dd }tD ]}t||\}}|| q|| |d  d7  < |S )z<Extract time from first 6 characters of read name (PRIVATE).Nr_   r   i  )r   _time_denominatorsr   append)r   	time_list	remainderdenominator	this_termr2   r2   r3   r   >  s   
r   c                 C   s   t | d S )z(Extract region from read name (PRIVATE).r   )int)r   r2   r2   r3   r   J  s   r   c                 C   s<  d}t |}d| }t |}| |}t ||\}}}	|dk s(|d dkr.td| || d| 7 }|| d | }
| |
}|t|
kr]ddl}ddlm	} |
d	|
 | ||7 }|| ||	d
  7 }||	d
  d }
|
rd|
 }
| |
}|t|
krddl}ddlm	} |
d|
 | ||7 }|S )zDExtract the next read in the file as a raw (bytes) string (PRIVATE).z>2HIr6   r:   r   r   rl   Nr   r;   r<   r=   )r   r   r   r   r   r   r   r   r   r   r   )r    r-   rA   rB   rC   rD   rawrG   rH   rI   r1   padr   r   r2   r2   r3   _sff_read_raw_recordO  sN   




r   c                   @   s8   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d ZdS )_AddTellHandlea  Wrapper for handles which do not support the tell method (PRIVATE).

    Intended for use with things like network handles where tell (and reverse
    seek) are not supported. The SFF file needs to track the current offset in
    order to deal with the index block.
    c                 C   s   || _ d| _d S )Nr   )_handle_offset)selfr    r2   r2   r3   __init__  s   
z_AddTellHandle.__init__c                 C   s"   | j |}|  jt|7  _|S N)r   r   r   r   )r   lengthr"   r2   r2   r3   r     s   z_AddTellHandle.readc                 C   s   | j S r   )r   r   r2   r2   r3   r@     s   z_AddTellHandle.tellc                 C   s(   || j k r	td| j|| j   d S )NzCan't seek backwards)r   RuntimeErrorr   r   )r   rF   r2   r2   r3   r>     s   
z_AddTellHandle.seekc                 C   s
   | j  S r   )r   closer   r2   r2   r3   r     s   
z_AddTellHandle.closeN)	__name__
__module____qualname____doc__r   r   r@   r>   r   r2   r2   r2   r3   r     s    r   c                       s2   e Zd ZdZd
 fdd	Zdd Zdd	 Z  ZS )SffIteratorz0Parser for Standard Flowgram Format (SFF) files.NFc                    s,   |durt dt j|ddd || _dS )av  Iterate over Standard Flowgram Format (SFF) reads (as SeqRecord objects).

            - source - path to an SFF file, e.g. from Roche 454 sequencing,
              or a file-like object opened in binary mode.
            - alphabet - optional alphabet, unused. Leave as None.
            - trim - should the sequences be trimmed?

        The resulting SeqRecord objects should match those from a paired FASTA
        and QUAL file converted from the SFF file using the Roche 454 tool
        ssfinfo. i.e. The sequence will be mixed case, with the trim regions
        shown in lower case.

        This function is used internally via the Bio.SeqIO functions:

        >>> from Bio import SeqIO
        >>> for record in SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff", "sff"):
        ...     print("%s %i" % (record.id, len(record)))
        ...
        E3MFGYR02JWQ7T 265
        E3MFGYR02JA6IL 271
        E3MFGYR02JHD4H 310
        E3MFGYR02GFKUC 299
        E3MFGYR02FTGED 281
        E3MFGYR02FR9G7 261
        E3MFGYR02GAZMS 278
        E3MFGYR02HHZ8O 221
        E3MFGYR02GPGB1 269
        E3MFGYR02F7Z7G 219

        You can also call it directly:

        >>> with open("Roche/E3MFGYR02_random_10_reads.sff", "rb") as handle:
        ...     for record in SffIterator(handle):
        ...         print("%s %i" % (record.id, len(record)))
        ...
        E3MFGYR02JWQ7T 265
        E3MFGYR02JA6IL 271
        E3MFGYR02JHD4H 310
        E3MFGYR02GFKUC 299
        E3MFGYR02FTGED 281
        E3MFGYR02FR9G7 261
        E3MFGYR02GAZMS 278
        E3MFGYR02HHZ8O 221
        E3MFGYR02GPGB1 269
        E3MFGYR02F7Z7G 219

        Or, with the trim option:

        >>> with open("Roche/E3MFGYR02_random_10_reads.sff", "rb") as handle:
        ...     for record in SffIterator(handle, trim=True):
        ...         print("%s %i" % (record.id, len(record)))
        ...
        E3MFGYR02JWQ7T 260
        E3MFGYR02JA6IL 265
        E3MFGYR02JHD4H 292
        E3MFGYR02GFKUC 295
        E3MFGYR02FTGED 277
        E3MFGYR02FR9G7 256
        E3MFGYR02GAZMS 271
        E3MFGYR02HHZ8O 150
        E3MFGYR02GPGB1 221
        E3MFGYR02F7Z7G 130

        Nz,The alphabet argument is no longer supportedbSFF)moder!   )r   superr   r   )r   sourcealphabetr   	__class__r2   r3   r     s   A
zSffIterator.__init__c                 C   sJ   zd|  krtd|   W n ty   t|}Y nw | |}|S )z9Start parsing the file, and return a SeqRecord generator.r   zNot at start of file, offset %i)r@   r   AttributeErrorr   iterate)r   r    recordsr2   r2   r3   parse  s   
zSffIterator.parsec                 c   s   | j }t|\}}}}}}}	d}
t|
}d| }t|}dtdks(J dtdks1J dtdks:J |d dksBJ t|D ]2}|ro| |kro|| }|d r`|d|d  7 }|d dkshJ || d}t||||	|V  qFt||| d	S )
z.Parse the file and generate SeqRecord objects.r5   r6   r   r7   r8   r9   r   r   N)	r   r4   r   r   r?   r@   r>   r   
_check_eof)r   r    r   r+   r(   r)   r*   r-   r/   r0   rA   rB   rC   rD   r   rF   r2   r2   r3   r     s>   	



zSffIterator.iterate)NF)r   r   r   r   r   r   r   __classcell__r2   r2   r   r3   r     s
    Fr   c                 C   s  |   }d}d}|r>||kr>||k rtd|| ||f | || |  || }||   kr>td||   ||| f |d rMd|d  }| |}|dkra|dd tkratd	||f |ry|syddl}dd
lm} |d| | dS |t	|krddl}dd
lm} |d||f | |   }|d dkrtd||d f | d}|tkrtd| |rtd| dS )zCheck final padding is OK (8 byte alignment) and file ends (PRIVATE).

    Will attempt to spot apparent SFF file concatenation and give an error.

    Will not attempt to seek, only moves the handle forward.
        r   zHGap of %i bytes after final record end %i, before %i where index starts?z$Wanted %i, got %i, index is %i to %ir   rT   NzYour SFF file is invalid, post index %i byte null padding region ended '.sff' which could be the start of a concatenated SFF file? See offset %ir   z]Your SFF file is technically invalid as it is missing a terminal %i byte null padding region.zSYour SFF file is invalid, post index %i byte null padding region contained data: %rz%Wanted offset %i %% 8 = %i to be zerozZAdditional data at end of SFF file, perhaps multiple SFF files concatenated? See offset %iz1Additional data at end of SFF file, see offset %i)
r@   r   r   r   r   r   r   r   r   r   )r    r(   r)   rF   extrar1   r   r   r2   r2   r3   r   $  sv   

r   c                       s    e Zd ZdZ fddZ  ZS )_SffTrimIteratorzFIterate over SFF reads (as SeqRecord objects) with trimming (PRIVATE).c                    s   t  j|dd d S )NT)r   )r   r   )r   r   r   r2   r3   r   x  s   z_SffTrimIterator.__init__)r   r   r   r   r   r   r2   r2   r   r3   r   u  s    r   c                       sB   e Zd ZdZd fdd	Zdd Zdd	 Zd
d Zdd Z  Z	S )	SffWriterzSFF file writer.TNc                    s,   t  |d || _|rg | _dS d| _dS )a~  Initialize an SFF writer object.

        Arguments:
         - target - Output stream opened in binary mode, or a path to a file.
         - index - Boolean argument, should we try and write an index?
         - xml - Optional string argument, xml manifest to be recorded
           in the index block (see function ReadRocheXmlManifest for
           reading this data).

        wbN)r   r   _xml_index)r   targetindexxmlr   r2   r3   r     s
   

zSffWriter.__init__c                 C   s  zt || _W n ty$   d| _t| jdrt| jds"tddY nw | jdurBt| jdr6t| jdsBddl}|d d| _d| _	d| _
t|dsQt|}zt|}W n tyb   d}Y nw |du rktdz|jd	 d
| _|jd d
| _t | j| _W n ty   tddw |   | | d}|D ]}| | |d7 }q| jdkr| j }| jd || _|   | j| n|| jksJ | jdur|   |S )z>Use this to write an entire file containing the given records.r   r>   r@   zA handle with a seek/tell methods is required in order to record the total record count in the file header (once it is known at the end).NzNA handle with a seek/tell methods is required in order to record an SFF index.nextzMust have at least one sequencerq   r   r/   zMissing SFF flow informationr   )r   _number_of_readsr   hasattrr    r   r   r   r   _index_start_index_lengthiterr   StopIterationry   encode_key_sequence_flow_chars_number_of_flows_per_readKeyErrorwrite_headerwrite_recordr@   r>   _write_index)r   r   r   r   r   rF   r2   r2   r3   
write_file  sp   










zSffWriter.write_filec                 C   sr  t | j| jks
J | j}| j  | | _| jd ur"| j }nddl	m
} d| d}|d7 }|d7 }| }t |}d}t|}|t| |  d}d	t|ksYJ | j  d}| jD ]v\}	}
|
}|d
 }||8 }|d }||8 }|d }||8 }|
|| | | krtd|
||||f |d |d |d
 |f\}}}}|d
k r|d
k r|d
k r|d
k std|
||||f ||	t|d||||d
  |t |	d	 7 }qc|| | | _| jd rd| jd  }|t|  nd}| }
|
| j| j | krtd|
| j| j|f || j |t|ddddd|||  |d |   ||
 d S )Nr   )__version__z)<!-- This file was output with Biopython z -->
zD<!-- This XML and index block attempts to mimic Roche SFF files -->
zB<!-- This file may be a combination of multiple SFF files etc -->
z>I4BLLz>6Br_   ra   rb   rc   z%i -> %i %i %i %ir   z%i vs %i + %i + %iitfm.rQ   rR   rS   )r   r   r   r    sortr@   r   r   r   r   r   r   r   writer   r   packr   r>   r   )r   r    r   r   xml_lenr!   rU   rV   	index_lenrN   rF   rg   rj   ri   rh   r1   r2   r2   r3   r     s   




  


zSffWriter._write_indexc                 C   s   t | j}d| j|f }t|d dkrd}n	dt|d  }t|| }|d dks0J t|ddddd| j| j| j||| jd| j	| j}| j
|t|   dS )zWrite the SFF file header.z>I4BQIIHHHB%is%isr   r   iffs.r   N)r   r   r   r   r   r   r   r   r   r   r    r   r   )r   r,   r!   r1   r+   headerr2   r2   r3   r   (  s2   
zSffWriter.write_headerc                 C   s
  |j  }t|}t|j }t|}z|jd }W n ty+   td|j  dw z$|j	d }|j	d }| j
|j	d  ksK| j|j	d  krOtdW n ty`   td	|j  d tyk   td
dw zR|j	d }	|	dk r~td|j  |	r|	d7 }	|j	d }
|
dk rtd|j  |j	d }|dk rtd|j  |r|d7 }|j	d }|dk rtd|j  W n ty   td|j  dw | jdur| j }|dkrddl}|d||f  d| _n| j|| j f d| }t|d dkr
d}n	dt|d  }t|| }|d dks#J t|||||	|
|||	t|  }t||ks=J d| j }t|}d| }|tj|g|R  tj|g|R   | tj|g|R   7 }||d  d }|ryd| }| j|t|   dS )zmWrite a single additional record to the output file.

        This assumes the header has been done.
        rz   z(Missing PHRED qualities information for Nro   rp   rq   r/   z'Records have inconsistent SFF flow dataz!Missing SFF flow information for zHeader not written yet?rJ   r   z&Negative SFF clip_qual_left value for r   rK   z'Negative SFF clip_qual_right value for rL   z)Negative SFF clip_adapter_left value for rM   z*Negative SFF clip_adapter_right value for z%Missing SFF clipping information for l    |x zzRead %s has file offset %i, which is too large to store in the Roche SFF index structure. No index block will be recorded.z	>2HI4H%isr   r6   rm   r<   )rw   r   r   bytesr   r~   letter_annotationsr   r   ry   r   r   r   r   r    r@   r   r   r   r   r   r   r   r   r   )r   r   rN   name_lenr   rI   r   ro   rp   rJ   rK   rL   rM   rF   r   rA   r1   rG   r"   rC   rD   r   r2   r2   r3   r   V  s   















zSffWriter.write_record)TN)
r   r   r   r   r   r   r   r   r   r   r2   r2   r   r3   r   |  s    >Y.r   __main__)run_doctest)verbose)F)*r   r   r   r   r   Bio.Seqr   Bio.SeqRecordr   
Interfacesr   r   r   r   r   r   r   rd   r4   rP   rZ   r^   rk   compiler   r   r?   r   r   r   r   r   r   r   r   r   r   r   r   r   
Bio._utilsr   r2   r2   r2   r3   <module>   sV    dj[e$
9
 	3 Q  _