o
    RŀgCQ                    @   s   d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	 ddl
mZ ddlmZ G dd	 d	ZG d
d deZG dd deZG dd deZdS )a3  Internal code for parsing GenBank and EMBL files (PRIVATE).

This code is NOT intended for direct use.  It provides a basic scanner
(for use with a event consumer such as Bio.GenBank._FeatureConsumer)
to parse a GenBank or EMBL file (with their shared INSDC feature table).

It is used by Bio.GenBank to parse GenBank files
It is also used by Bio.SeqIO to parse GenBank and EMBL files

Feature Table Documentation:

- http://www.insdc.org/files/feature_table.html
- http://www.ncbi.nlm.nih.gov/projects/collab/FT/index.html
- ftp://ftp.ncbi.nih.gov/genbank/docs/
    N)defaultdict)BiopythonParserWarning)	as_handle)Seq)	SeqRecordc                   @   s   e Zd ZdZdZdZdgZdgZdZdZ	dgZ
d*dd	Zd
d Zdd Zdd Zd+ddZdd Zdd Zdd Zdd Zedd Zdd Zd,d d!Zd,d"d#Zd,d$d%Z	'd-d(d)Zd&S ).InsdcScannera  Basic functions for breaking up a GenBank/EMBL file into sub sections.

    The International Nucleotide Sequence Database Collaboration (INSDC)
    between the DDBJ, EMBL, and GenBank.  These organisations all use the
    same "Feature Table" layout in their plain text flat file formats.

    However, the header and sequence sections of an EMBL file are very
    different in layout to those produced by GenBank/DDBJ.
    XXX   zXXX***FEATURES***XXXzXXX***END FEATURES***XXXr    c                 C   sZ   t | j| jks
J | jD ]
}|| ksJ qt | j| jks"J || _d| _d| _	dS )zInitialize the class.N)
lenRECORD_STARTHEADER_WIDTHSEQUENCE_HEADERSrstripFEATURE_QUALIFIER_SPACERFEATURE_QUALIFIER_INDENTdebughandleline)selfr   marker r   G/var/www/html/myenv/lib/python3.10/site-packages/Bio/GenBank/Scanner.py__init__=   s   

zInsdcScanner.__init__c                 C   s   || _ d| _dS )zSet the handle attribute.r
   N)r   r   )r   r   r   r   r   
set_handleG   s   
zInsdcScanner.set_handlec                 C   s   	 | j r| j }d| _ n| j }|s| jrtd dS t|d tr&td|d| j | j	kr<| jdkr;td|  n,|
 }|d	krN| jdkrMtd
 n|dkr\| jdkr[td n| jdkrgtd|  q|| _ |S )zRead in lines until find the ID/LOCUS line, which is returned.

        Any preamble (such as the header used by the NCBI on ``*.seq.gz`` archives)
        will we ignored.
        Tr
   zEnd of fileNr   z,Is this handle in binary mode not text mode?   zFound the start of a record:
//z&Skipping // marking end of last recordz!Skipping blank line before recordz$Skipping header line before record:
)r   r   readliner   print
isinstanceint
ValueErrorr   r   r   r   r   r   r   r   
find_startL   s:   




zInsdcScanner.find_startc                 C   s   | j d| j | jkrtdg }	 | j }|std| }|| jv r.| jr-t	d n"|d| j  | j
v rB| jrAt	d n|dkrJtd|| q|| _ |S )	zReturn list of strings making up the header.

        New line characters are removed.

        Assumes you have just read in the ID/LOCUS line.
        NzNot at start of recordT*Premature end of line during sequence datazFound feature tableFound start of sequencer   z0Premature end of sequence data marker '//' found)r   r   r   r!   r   r   r   FEATURE_START_MARKERSr   r   r   append)r   header_linesr   r   r   r   parse_headerq   s,   


zInsdcScanner.parse_headerFc                 C   sz  | j  | jvr| jrtd g S | j  | jv r'| j | _ | j  | jv sg }| j }	 |s3td|d| j  | j	v rG| jrFtd n| }|dkrStd|| j
v re| jr_td | j }n|d	| j  d
krv| j }q,t|| jk rtd|t | j }q,|r| j }|d| j | jkr| j }|d| j | jksn|| j dkrd|| jd v r|d	d  dd\}}|g}td| dt n|d	| j  }|| jd g}| j }|d| j | jks|d
kr.| d
kr.||| jd   | j }|d| j | jks|d
kr.| d
ks|| || q-|| _ |S )  Return list of tuples for the features (if present).

        Each feature is returned as a tuple (key, location, qualifiers)
        where key and location are strings (e.g. "CDS" and
        "complement(join(490883..490885,1..879))") while qualifiers
        is a list of two string tuples (feature qualifier keys and values).

        Assumes you have already read to the start of the features table.
        Didn't find any feature tableT+Premature end of line during features tableNr%   r   2Premature end of features table, marker '//' foundFound end of features   r
   z%line too short to contain a feature:  r   zOver indented z	 feature?)r   r   r&   r   r   r   r   r!   r   r   FEATURE_END_MARKERSr   stripr   warningswarnr   r   splitr'   parse_feature)r   skipfeaturesr   feature_keyfeature_linesr   r   r   parse_features   s   









@zInsdcScanner.parse_featuresc              	   C   s  dd |D }zLt |}| }|dd dkr+t |}|| 7 }|dd dks|d|dkritdt |dd dksM|d|dkrit |}|| 7 }|dd dksM|d|dksMg }t|D ]\}}|d	kr|dr|| 7 }qo|d	 d
kr!|d}|d| }	||d d }
|r|
dr|
	 drtdt |
	 }
|dkr|dd }	|
|	df qo|
s|
|	df qo|
dkr| jrtd|	 d|
  |
|	|
f qo|
d	 dkr|
g}|d d dkr|
t | |d d dksd|}
|
|	|
f qo|
|	|
f qot|d	ks*J |	|d d	 ks5J |d d du r@t|	|d d d | f|d< qo|||fW S  tyh   td|d|f dw )a
  Parse a feature given as a list of strings into a tuple.

        Expects a feature as a list of strings, returns a tuple (key, location,
        qualifiers)

        For example given this GenBank feature::

             CDS             complement(join(490883..490885,1..879))
                             /locus_tag="NEQ001"
                             /note="conserved hypothetical [Methanococcus jannaschii];
                             COG1583:Uncharacterized ACR; IPR001472:Bipartite nuclear
                             localization signal; IPR002743: Protein of unknown
                             function DUF57"
                             /codon_start=1
                             /transl_table=11
                             /product="hypothetical protein"
                             /protein_id="NP_963295.1"
                             /db_xref="GI:41614797"
                             /db_xref="GeneID:2732620"
                             /translation="MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK
                             EKYFNFTLIPKKDIIENKRYYLIISSPDKRFIEVLHNKIKDLDIITIGLAQFQLRKTK
                             KFDPKLRFPWVTITPIVLREGKIVILKGDKYYKVFVKRLEELKKYNLIKKKEPILEEP
                             IEISLNQIKDGWKIIDVKDRYYDFRNKSFSAFSNWLRDLKEQSLRKYNNFCGKNFYFE
                             EAIFEGFTFYKTVSIRIRINRGEAVYIGTLWKELNVYRKLDKEEREFYKFLYDCGLGS
                             LNSMGFGFVNTKKNSAR"

        Then should give input key="CDS" and the rest of the data as a list of strings
        lines=["complement(join(490883..490885,1..879))", ..., "LNSMGFGFVNTKKNSAR"]
        where the leading spaces and trailing newlines have been removed.

        Returns tuple containing: (key as string, location string, qualifiers as list)
        as follows for this example:

        key = "CDS", string
        location = "complement(join(490883..490885,1..879))", string
        qualifiers = list of string tuples:

        [('locus_tag', '"NEQ001"'),
         ('note', '"conserved hypothetical [Methanococcus jannaschii];\nCOG1583:..."'),
         ('codon_start', '1'),
         ('transl_table', '11'),
         ('product', '"hypothetical protein"'),
         ('protein_id', '"NP_963295.1"'),
         ('db_xref', '"GI:41614797"'),
         ('db_xref', '"GeneID:2732620"'),
         ('translation', '"MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK\nEKYFNFT..."')]

        In the above example, the "note" and "translation" were edited for compactness,
        and they would contain multiple new line characters (displayed above as \n)

        If a qualifier is quoted (in this case, everything except codon_start and
        transl_table) then the quotes are NOT removed.

        Note that no whitespace is removed.
        c                 s   s    | ]}|r|V  qd S )Nr   ).0xr   r   r   	<genexpr>"  s    z-InsdcScanner.parse_feature.<locals>.<genexpr>N,()z;Non-standard feature line wrapping (didn't break on comma)?r   /=r   r0   "z%White space after equals in qualifierr
   zSingle quote :
zProblem with '%s' feature:
%s)nextr2   countr3   r4   r   	enumerate
startswithfindlstripr'   r   r   joinr   StopIterationr!   )r   r9   linesiteratorr   feature_location
qualifiersline_numberikeyvalue
value_listr   r   r   r6      s   9

zInsdcScanner.parse_featurec                 C   s   | j | jv r3| j d| j  | jvr3| j | _ | j s td| j  | _ | j d| j  | jvs| j d| j  | jvrDtd	 | j }|sPtd| }|dkrYnqE|| _ g dfS )GReturn a tuple containing a list of any misc strings, and the sequence.NPremature end of filezNot at start of sequenceTr$   r   r
   )r   r1   r   r   r   r   r   r!   r"   r   r   r   parse_footerq  s&   
zInsdcScanner.parse_footerc                 C      dS )zHandle the LOCUS/ID line, passing data to the consumer (PRIVATE).

        This should be implemented by the EMBL / GenBank specific subclass

        Used by the parse_records() and parse() methods.
        Nr   r   consumerr   r   r   r   _feed_first_line      zInsdcScanner._feed_first_linec                 C   r\   )zHandle the header lines (list of strings), passing data to the consumer (PRIVATE).

        This should be implemented by the EMBL / GenBank specific subclass

        Used by the parse_records() and parse() methods.
        Nr   r   r^   rP   r   r   r   _feed_header_lines  r`   zInsdcScanner._feed_header_linesc              	   C   sh   |    |D ]+\}}}| | | | |D ]\}}|du r&| || q| ||dd qqdS )zHandle the feature table (list of tuples), passing data to the consumer (PRIVATE).

        Used by the parse_records() and parse() methods.
        NrG   r0   )start_feature_tabler9   locationfeature_qualifierreplace)r^   feature_tuplesr9   location_stringrS   q_keyq_valuer   r   r   _feed_feature_table  s   

z InsdcScanner._feed_feature_tablec                 C   r\   )zHandle any lines between features and sequence (list of strings), passing data to the consumer (PRIVATE).

        This should be implemented by the EMBL / GenBank specific subclass

        Used by the parse_records() and parse() methods.
        Nr   ra   r   r   r   _feed_misc_lines  r`   zInsdcScanner._feed_misc_linesTc                 C   s   |  | |  sd|_dS | || j | ||   |r*| || jdd n| jdd | 	 \}}| 
|| || |d | jdksMJ dS )a  Feed a set of data into the consumer.

        This method is intended for use with the "old" code in Bio.GenBank

        Arguments:
         - handle - A handle with the information to parse.
         - consumer - The consumer that should be informed of events.
         - do_features - Boolean, should the features be parsed?
           Skipping the features can be much faster.

        Return values:
         - true  - Passed a record
         - false - Did not find a record

        NF)r7   Tr   )r   r#   datar_   r   rb   r)   rk   r;   r[   rl   sequence
record_end)r   r   r^   do_features
misc_linessequence_stringr   r   r   feed  s   


zInsdcScanner.feedc                 C   s>   ddl m} ddlm} |d| d}| |||r|jS dS )zReturn a SeqRecord (with SeqFeatures if do_features=True).

        See also the method parse_records() for use on multi-record files.
        r   )_FeatureConsumer)FeatureValueCleanerr   )use_fuzzinessfeature_cleanerN)Bio.GenBankrt   Bio.GenBank.utilsru   rs   rm   )r   r   rp   rt   ru   r^   r   r   r   parse  s   zInsdcScanner.parsec                 c   s    t |4}	 | ||}|du rn|jdu rtd|jdkr$td|jdkr-td|V  qW d   dS 1 s<w   Y  dS )a	  Parse records, return a SeqRecord object iterator.

        Each record (from the ID/LOCUS line to the // line) becomes a SeqRecord

        The SeqRecord objects include SeqFeatures if do_features=True

        This method is intended for use in Bio.SeqIO
        TNz1Failed to parse the record's ID. Invalid ID line?z<unknown name>z3Failed to parse the record's name. Invalid ID line?z<unknown description>z(Failed to parse the record's description)r   rz   idr!   namedescription)r   r   rp   recordr   r   r   parse_records  s(   




"zInsdcScanner.parse_recordsN
protein_id	locus_tagproductc                 c   s4   |dur	t dt|}| | |  r|   |  }	 | j }|s*n
|dd dkr3nq"| | _	|D ]\}}}|dkrt
dd}	|	j}
d|
d	< |d
d|
d< |D ]c\}}|durt|d dkrt|d dkrt|dd }|dkr|	jdu sJ dt|dd|	_qZ|dkr|	j| qZ|dur|dd
dd
}z|
|  d
| 7  < W qZ ty   ||
|< Y qZw z	|
|d  |	_W n	 ty   Y nw z	|
|d  |	_W n	 ty   Y nw z	|
|d  |	_W n	 ty   Y nw |	V  q;|  sW d   dS W d   dS 1 sw   Y  dS )aj  Parse CDS features, return SeqRecord object iterator.

        Each CDS feature becomes a SeqRecord.

        Arguments:
         - alphabet - Obsolete, should be left as None.
         - tags2id  - Tuple of three strings, the feature keys to use
           for the record id, name and description,

        This method is intended for use in Bio.SeqIO

        Nz,The alphabet argument is no longer supportedTr/   r   CDS)seqproteinmolecule_typer0   r
   raw_locationr   rE   r?   r   translationzMultiple translations!rG   db_xref  )r!   r   r   r#   r)   r;   r   r   r   r   r   annotationsrf   r   r   dbxrefsr'   KeyErrorr{   r|   r}   )r   r   alphabettags2idrg   r   rV   rh   rS   r~   r   qualifier_namequalifier_datar   r   r   parse_cds_features  s   





$zInsdcScanner.parse_cds_features)r   F)T)Nr   )__name__
__module____qualname____doc__r   r   r&   r1   r   r   r   r   r   r#   r)   r;   r6   r[   r_   rb   staticmethodrk   rl   rs   rz   r   r   r   r   r   r   r   )   s6    

%
!W 	


3
r   c                   @   s   e Zd ZdZdZdZddgZdgZdZdd	ed
   Z	ddgZ
eZd	e Zdd Zdd Zdd Zdd Zdd Zdd Zedd Zdd Zdd ZdS ) EmblScannerz3For extracting chunks of information in EMBL files.zID      (FH   Key             Location/QualifiersFHXX   FTr0   r/   SQCOc                 C   s  | j d| j  | jvrtd| j  dg }| j d| j  | jv rK|| j  | j | _ | j s8td| j  | _ | j d| j  | jv s%| j d| j d| j ksg| j  dksgtd| j g }| j }	 |sstd	| }|s}td
|dkrnD| j d| j d| j krtd| j  |	dd}t
|dkr|d  r||d  n| rntdt || | j }qm|| _ |d|ddfS )rY   NzFooter format unexpected: ''rZ   r0   r   z(Unexpected content after SQ or CO line: T&Premature end of file in sequence dataBlank line in sequence datazDProblem with characters in header line,  or incorrect header width: r   r/   r   z&EMBL sequence line missing coordinatesr
   )r   r   r   r   r!   r'   r   r   r2   rsplitr   isdigitr3   r4   r   rN   rf   )r   rq   	seq_linesr   
linersplitr   r   r   r[   y  sT   

zEmblScanner.parse_footerc                 C   s   |d | j   dksJ || j d  ddkr!| || d S || j d  ddkrD| dr<| || d S | || d S || j d  ddkrX| || d S td| )NID;   r	   z SQr/   z&Did not recognise the ID line layout:
)	r   r   rI   _feed_first_line_newendswith_feed_first_line_patents_feed_first_line_old_feed_first_line_patents_kipor!   r]   r   r   r   r_     s   zEmblScanner._feed_first_linec                 C   sh   dd || j d   d d dD }t|dksJ ||d  ||d  ||d  d S )	Nc                 S      g | ]}|  qS r   r2   r<   rm   r   r   r   
<listcomp>  s    z8EmblScanner._feed_first_line_patents.<locals>.<listcomp>r      r   r   r/   )r   r2   r5   r   locusresidue_typedata_file_divisionr   r^   r   fieldsr   r   r   r     s   z$EmblScanner._feed_first_line_patentsc                 C   s   |d | j   dksJ || j d  d dd g}||| j d  d dd d dd |D }	 ||d  | ||d  d S )Nr   r   r   r   c                 S   r   r   r   r<   entryr   r   r   r         z=EmblScanner._feed_first_line_patents_kipo.<locals>.<listcomp>r	   )r   r   r5   extendr   _feed_seq_lengthr   r   r   r   r     s   &
z)EmblScanner._feed_first_line_patents_kipoc                 C   s   |d | j   dksJ || j d  d dd g}||| j d  d dd d dd |D }	 ||d  ||d  d|d v r]|d ||d dd		  n"d
|d v rv|d
 ||d d
d		  n	||d 	  |
|d  | ||d  d S )Nr   r   r   r   c                 S   r   r   r   r   r   r   r   r     r   z4EmblScanner._feed_first_line_old.<locals>.<listcomp>r/   circularr
   linearr	   r   )r   r   r5   r   r   r   topologyr   rf   r2   r   r   r   r   r   r   r     s    &

z EmblScanner._feed_first_line_oldc                 C   s   |d | j   dksJ dd || j d   dD }t|dks&J 	 ||d  ||d  |d  }t|dkrT|d d	krT|d  rT||d  |	d

|dd  ||d  ||d  ||d  | ||d  d S )Nr   c                 S   r   r   r   r   r   r   r   r     r   z4EmblScanner._feed_first_line_new.<locals>.<listcomp>r      r   r   r/   SVr0   r   r	   r   r   )r   r   r2   r5   r   r   	accessionr   version_suffixr   rN   r   r   r   r   r   r^   r   r   version_partsr   r   r   r     s"   "
z EmblScanner._feed_first_line_newc                 C   sH   |  }t|dksJ d||d  dv sJ | |d  d S )Nr/   zInvalid sequence length string r   )BPzBP.AAzAA.r   )r5   r   uppersize)r^   textlength_partsr   r   r   r   ?  s   zEmblScanner._feed_seq_lengthc           
   	   C   sZ  ddddddddd	}|D ]}|d | j   }|| j d   }|d
kr'q|dkrC|d dkr=|d dkr=|dd }|| q|dkre| dkrNqdd |dD }|dd| d q|dkr|drt|dd  }|dr|d d }|| q|dkr|dd\}}	|	dr|	d d }	|	 }	|d kr|	|	 q|d!kr|
|g q|d"kr|dd}t|dkrtd#t q||d   d$|d    q|d%kr||d q|d&kr|d'r||d q|d(kr||d q||v rt||| | q| jr*td)|  qd S )*Nr   version
definitionconsrtmjournalorganismtaxonomycomment)ACr   DERGRLOSOCCCr   RNr   [r?   ]r   RPz[-]c                 S   s$   g | ]}|  r|d d  qS )-z to )r2   rf   )r<   basesr   r   r   r   o  s    z2EmblScanner._feed_header_lines.<locals>.<listcomp>r@   z(bases z; rB   RTrE   z";RXr   .PUBMEDr   DRzMalformed DR line in EMBL file.rF   RAPRzProject:KWzIgnoring EMBL header line:
)EMBL_INDENTr2   reference_numr5   reference_basesrN   rK   r   title	pubmed_idr   r   r   r3   r4   r   dblinkauthorsprojectkeywordsgetattrr   r   )
r   r^   rP   consumer_dictr   	line_typerm   partsrV   rW   r   r   r   rb   F  s|   




&


zEmblScanner._feed_header_linesc                 C   s   | d t|}zU|D ]O}|drA|dd   }|}	 t|}|s%n|dr5||dd   7 }ntd| q|| |dr[| ||dd   d	dd	d
  qW d S  t
yj   tdd w )Nr
   zCO   r   Tz-Expected CO (contig) continuation line, got:
zSQ   Sequence    r   r   r   %Problem in misc lines before sequence)r'   iterrK   r2   rH   r!   contig_locationr   r   r5   rO   r   r^   rP   	line_iterr   r   r   r   r   rl     s6   




"
zEmblScanner._feed_misc_linesN)r   r   r   r   r   r   r&   r1   r   r   r   r   EMBL_SPACERr[   r_   r   r   r   r   r   r   rb   rl   r   r   r   r   r   k  s,    4!0
 r   c                   @   s*   e Zd ZdZg dZdd Zd	ddZdS )
_ImgtScannera  For extracting chunks of information in IMGT (EMBL like) files (PRIVATE).

    IMGT files are like EMBL files but in order to allow longer feature types
    the features should be indented by 25 characters not 21 characters. In
    practice the IMGT flat files tend to use either 21 or 25 characters, so we
    must cope with both.

    This is private to encourage use of Bio.SeqIO rather than Bio.GenBank.
    )r   z4FH   Key             Location/Qualifiers (from EMBL)z,FH   Key                 Location/Qualifiersr   c                 C   sf  |d | j   dksJ || j d  ddkr t| ||S dd || j d   dD }t|dks9J 	 ||d  |d  }t|d	kr`|d d
kr`|d 	 r`|
|d  ||d  d|d v r|d ||d dd  n"d|d v r|d ||d dd  n	||d   ||d  | ||d  d S )Nr   r   r   c                 S   r   r   r   r   r   r   r   r     r   z1_ImgtScanner._feed_first_line.<locals>.<listcomp>r   r   r   r/   r   r	   r   r
   r   r   )r   r   rI   r   r_   r2   r5   r   r   r   r   r   r   r   rf   r   r   r   r   r   r   r_     s.   "



z_ImgtScanner._feed_first_lineFc           
      C   sh  | j  | jvr| jrtd g S | j  | jv r'| j | _ | j  | jv std}g }| j }	 |s8t	d|d| j
  | jv rL| jrKtd n| }|dkrXt	d|| jv rj| jrdtd	 | j }n|d
| j  dkr{| j }q1|r| j }|d| j | jkr| j }|d| j | jksn|dd
 dksJ z|d
d   \}}W n t	y   |d
d  }|dd  }Y nw |g}| j }|d| j | jks| dkr|dd
 dksJ ||| jd   | j }|d| j | jks| dks| ||\}}}	d|v r&|d|}||||	f q2|| _ |S )r*   r+   z	([0-9]+)>Tr,   Nr%   r   r-   r.   r/   r
   r      >z>\1)r   r   r&   r   r   r   r   recompiler!   r   r   r1   r   r2   r   r5   r'   r6   sub)
r   r7   bad_position_rer8   r   r9   location_startr:   rd   rS   r   r   r   r;   3  s   










?z_ImgtScanner.parse_featuresNr   )r   r   r   r   r&   r_   r;   r   r   r   r   r    s
    
6r  c                   @   s~   e Zd ZU dZdZdZddgZg Zee	 e
d< dZde Zg d	ZeZde Zd
ZdZdZdd Zdd Zdd Zdd ZdS )GenBankScannerz6For extracting chunks of information in GenBank files.LOCUS          z(FEATURES             Location/QualifiersFEATURESr1   r   r0   )CONTIGORIGIN
BASE COUNTWGSTSATLSz-START##z-END##z :: c                 C   s  | j d| j  | jvrtd| j  dg }| j d| j  | jv s;| j d| j d| j ks;d| j dd krs|| j   | j | _ | j sPtd| j d| j  | jv s;| j d| j d| j ks;d| j dd ks;| j d| j  | jv rtd| j  dg }| j }	 |st	d
t
 d}n\| }|st	dt
 | j }q|dkrnE|drn?t|dkr|dd dkrt	dt
 |dd }t|dkr|dd dkrtd| d||dd  | j }q|| _ |d|ddfS )rY   NzFooter format unexpected:  'r   r0   r  r	   rZ   zEh? 'Tr   r   r   r  	   
   z%Invalid indentation for sequence liner   zSequence line mal-formed, 'r
   )r   r   r   r   r!   r'   r   r   r3   r4   r   rK   r   rN   rf   )r   rq   r   r   r   r   r   r[     sZ   


zGenBankScanner.parse_footerc           
      C   s  |d| j  dkrtd| |dd dv r|dd d	kr|d
d dkr/td| |dd  dvr?td| |dd dkrMtd| |dd  rq|dd dkrctd| |dd dkrqtd| || j d }d|v r|dd}d|v s||d}t|dkrtd| t|dkrtd| |\}}t|d krtd!t |	| |
| |dd  d"kr|dd d#kr|d$ n||dd   ||dd
   ||dd   ||dd  |dd  r||dd  d>S d>S |d%d& dv r|d'd  dv rt|d(k rGtd)|t d(t| }d| }||7 }|d%d& dvrVtd*| |d&d+ d,vretd-| |d+d'  d"ksd.|d+d'   v sd/|d+d'   v std0| |d'd dkrtd1| |dd2  dvrtd| |d2d dkrtd3| |d4d dkrtd5| |dd(  r|d6d7 dkrtd8| |d9d: dkrtd;| || j d% }d|v r|dd}d|v s|d}t|dkrtd| t|dkr+td| |	|d  |
|d  |d&d'  d"kr[|d%d& d#kr[|d<|d'd2    n||d&d2   ||d&d'   ||dd2   |dd=  r||dd4  |dd(  r||dd(  d>S d>S || j d>  ddkr|| j d>  d"kr|	|| j d>   d>S td?|t d>S t| d@krS| dA dBv rS| dC dDv rS| }	|	|	d  t|	d tjkrtdE|	d tjf |
|	d  ||	dF  ||	dC  ||	dG  ||	dH  t|dIk rQtdJ||	d |	d |	dF f t d>S d>S t| dHkr| dA dBv r| }	|	|	d  |
|	d  ||	dF  ||	dC  ||	dG  d>S t| dFkr| dA dBv rtdK|t |	| d  |
| d  d>S t| dFkr| dL dBv rtdK|t |	|dCd> d>dd   |
| dM  d>S tdN| )Oa  Scan over and parse GenBank LOCUS line (PRIVATE).

        This must cope with several variants, primarily the old and new column
        based standards from GenBank. Additionally EnsEMBL produces GenBank
        files where the LOCUS line is space separated rather that following
        the column based layout.

        We also try to cope with GenBank like files with partial LOCUS lines.

        As of release 229.0, the columns are no longer strictly in a given
        position. See GenBank format release notes:

            "Historically, the LOCUS line has had a fixed length and its
            elements have been presented at specific column positions...
            But with the anticipated increases in the lengths of accession
            numbers, and the advent of sequences that are gigabases long,
            maintaining the column positions will not always be possible and
            the overall length of the LOCUS line could exceed 79 characters."

        r   r  z%LOCUS line does not start correctly:
   !   )z bp  aa z rc 7   >   z       )   *   r0   z2LOCUS line does not contain space at position 42:
3   )r
   r   r   zALOCUS line does not contain valid entry (linear, circular, ...):
4   z2LOCUS line does not contain space at position 52:
I   @   A   r   z6LOCUS line does not contain - at position 65 in date:
D   E   z6LOCUS line does not contain - at position 69 in date:
r   r/   z4Cannot parse the name and length in the LOCUS line:
r   z+Name and length collide in the LOCUS line:
   z0GenBank LOCUS line identifier over 16 charactersr
   r  PROTEIN(   ,   6   O   z/Truncated LOCUS line found - is this correct?
:z=LOCUS line does not contain size units at expected position:
/   )z   zss-zds-zms-zCLOCUS line does not have valid strand type (Single stranded, ...):
DNARNAzALOCUS line does not contain valid sequence type (DNA, RNA, ...):
z2LOCUS line does not contain space at position 55:
?   z2LOCUS line does not contain space at position 64:
C   z2LOCUS line does not contain space at position 68:
F   G   z6LOCUS line does not contain - at position 71 in date:
J   K   z6LOCUS line does not contain - at position 75 in date:
zPROTEIN L   Nz-Minimal LOCUS line found - is this correct?
:   r	   )aabpr   )r   r   zlTried to load a sequence with a length %s, your installation of python can only load sesquences of length %sr   r   r   P   zmAttempting to parse malformed locus line:
%r
Found locus %r size %r residue_type %r
Some fields may be wrong.z/Malformed LOCUS line found - is this correct?
:r?   r   z)Did not recognise the LOCUS line layout:
)GENBANK_INDENTr!   r2   rf   r5   r   r3   r4   r   r   r   r   r   r   r   dater   rI   r    sysmaxsizer   )
r   r^   r   name_and_length_strname_and_lengthr|   lengthpadding_lenpadding	splitliner   r   r   r_     s  $


$(


($	$$"zGenBankScanner._feed_first_linec                 C   s  i ddddddddd	d
dddddddddddddddddddddd }d!d" |D }| d# t|}z!t|}	 |sNW d S |d | j  }|| jd   }|d%krd&|v rr|d&d'}d&|v shd(|vr||| n-| jrtd)|	d(d*  d+ |	d(d,  d-  ||	d(d*  |
|	d(d,  t|}n|d.kr||  	 t|}|d | j | jkr||| jd    nnqn|d/kr| jd,krtd0| d-  | }	 t|}|d | j | jkr|d'|| jd   7 }| jd,krtd1| d-  nnqd&|v r,|d&d'}d&|v s!d'|vrF| jd2kr?td3| d4  || n| jd2krhtd3|d |d'  d5 ||d'd, d   d4  ||d |d'  |||d'd, d   n|d6kr|}d#}		 t|}|d*| j | jkr|	sd7|v s|| jd   d8v r|	d'|| jd   7 }	n|| jd   d9krn|d'|| jd    7 }nnq|| |	 d#kr| jd,krtd: ||	  ~~	nh|d;kr|| jd  }| jd,krtd< g }
tt}d=| j d>}t||}|d ur:|d,}| jd,kr9td? n|
 | 	 t|}|| jd  }|d*| j | jkr | j|v rzd=| j d>}t||}|d urt|d,}n|
 | n|d ur| j |v rtd@| j  dA|}|d2|| |d,< | jd2krtdB| d-  nR|d ur| j|vr||vrtdC| t q?|| |d, }|d' |  || |d,< n| j|v rd }n|
 | | jd2krtdD| d-  nnq@|
r||
 |r|| ~
~~nM||v rV	 t|}|d*| j | jkr8|d'|| jd   7 }n|dkrI|d9rI|d dE }t ||| | nqn| jr`tdF|  t|}qI t!yq   t"dGd w )HN
DEFINITIONr   	ACCESSIONr   NIDnidPIDpidDBSOURCE	db_sourceKEYWORDSr   SEGMENTsegmentSOURCEsourceAUTHORSr   CONSRTMr   PROJECTr   TITLEr   JOURNALr   MEDLINE
medline_idr   r   REMARKremarkc                 S   s   g | ]}|r|qS r   r   )r<   _fr   r   r   r   ?  r   z5GenBankScanner._feed_header_lines.<locals>.<listcomp>r
   TVERSIONr   r0   z GI:z	Version [r   z], gi [r   r   DBLINK	REFERENCEzFound reference [zExtended reference text [r/   zReference number "rE   z", "ORGANISMr   )z	Bacteria.zArchaea.z
Eukaryota.zUnclassified.zViruses.zcellular organisms.zother sequences.zunclassified sequences.r   z!Taxonomy line(s) missing or blankCOMMENTzFound commentz([^#]+)$zFound Structured Commentz(.+?)\s*z\s*(.*)z!Structured Comment continuation [z8Structured comment not parsed on malformed header line: zComment continuation [r?   zIgnoring GenBank header line:
zProblem in header)#r'   r   rH   r8  r2   rf   r   r   r   r5   gir   GENBANK_SPACERr   rL   r   r   r   r   dictSTRUCTURED_COMMENT_STARTr  searchgroupSTRUCTURED_COMMENT_DELIMSTRUCTURED_COMMENT_ENDr3   r4   r   r   structured_commentr   r   rO   r!   )r   r^   rP   r   r   r   r   rm   organism_datalineage_datacomment_liststructured_comment_dictregexstructured_comment_keymatchprevious_value_liner   r   r   rb   !  s  	






 
	











<


	  w
z!GenBankScanner._feed_header_linesc                 C   s  | d t|}z|D ]}|dr+|dd   }|r+| jr&td|  || |drH|dd   }|rH| jrCtd|  || |drZ|d	d   }|| |d
rl|d	d   }|	| |dr~|d	d   }|
| |dr|dd   }|| |dr|dd   }|}	 t|}|sn2|d | j | jkr||| jd   7 }n|dr|dd   }|r|| ntd| q|| qW d S  ty   tdd w )Nr
   r  r  zbase_count = r  r   zorigin_name = zTLS r	   zTSA zWGS 
WGS_SCAFLDr  Tz(Expected CONTIG continuation line, got:
r   )r'   r   rK   r2   r   r   
base_countorigin_nametlstsawgsadd_wgs_scafldrH   r8  r`  r   r!   r   rO   r   r   r   r   rl   =  sh   

















zGenBankScanner._feed_misc_linesN)r   r   r   r   r   r   r&   r1   liststr__annotations__r   r   r   r8  r`  rb  rf  re  r[   r_   rb   rl   r   r   r   r   r
    s,   
 	6  H  r
  )r   r  r:  r3   collectionsr   Bior   Bio.Filer   Bio.Seqr   Bio.SeqRecordr   r   r   r  r
  r   r   r   r   <module>   s*       F    "