o
    Rŀg7(                  
   @   s   d Z ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddd	d	d	d
d
d
dd	Z	G dd deZ
G dd deZedkrMddlmZ edd dS dS )a  Bio.SeqIO support for the "pir" (aka PIR or NBRF) file format.

This module is for reading and writing PIR or NBRF format files as
SeqRecord objects.

You are expected to use this module via the Bio.SeqIO functions, or if
the file contains a sequence alignment, optionally via Bio.AlignIO instead.

This format was introduced for the Protein Information Resource (PIR), a
project of the National Biomedical Research Foundation (NBRF).  The PIR
database itself is now part of UniProt.

The file format is described online at:
http://www.ebi.ac.uk/help/pir_frame.html
http://www.cmbi.kun.nl/bioinf/tools/crab_pir.html (currently down)

An example file in this format would be::

  >P1;CRAB_ANAPL
  ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
    MDITIHNPLI RRPLFSWLAP SRIFDQIFGE HLQESELLPA SPSLSPFLMR
    SPIFRMPSWL ETGLSEMRLE KDKFSVNLDV KHFSPEELKV KVLGDMVEIH
    GKHEERQDEH GFIAREFNRK YRIPADVDPL TITSSLSLDG VLTVSAPRKQ
    SDVPERSIPI TREEKPAIAG AQRK*

  >P1;CRAB_BOVIN
  ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
    MDIAIHHPWI RRPFFPFHSP SRLFDQFFGE HLLESDLFPA STSLSPFYLR
    PPSFLRAPSW IDTGLSEMRL EKDRFSVNLD VKHFSPEELK VKVLGDVIEV
    HGKHEERQDE HGFISREFHR KYRIPADVDP LAITSSLSSD GVLTVNGPRK
    QASGPERTIP ITREEKPAVT AAPKK*

Or, an example of a multiple sequence alignment::

  >P1;S27231
  rhodopsin - northern leopard frog
  MNGTEGPNFY IPMSNKTGVV RSPFDYPQYY LAEPWKYSVL AAYMFLLILL GLPINFMTLY
  VTIQHKKLRT PLNYILLNLG VCNHFMVLCG FTITMYTSLH GYFVFGQTGC YFEGFFATLG
  GEIALWSLVV LAIERYIVVC KPMSNFRFGE NHAMMGVAFT WIMALACAVP PLFGWSRYIP
  EGMQCSCGVD YYTLKPEVNN ESFVIYMFVV HFLIPLIIIS FCYGRLVCTV KEAAAQQQES
  ATTQKAEKEV TRMVIIMVIF FLICWVPYAY VAFYIFTHQG SEFGPIFMTV PAFFAKSSAI
  YNPVIYIMLN KQFRNCMITT LCCGKNPFGD DDASSAATSK TEATSVSTSQ VSPA*

  >P1;I51200
  rhodopsin - African clawed frog
  MNGTEGPNFY VPMSNKTGVV RSPFDYPQYY LAEPWQYSAL AAYMFLLILL GLPINFMTLF
  VTIQHKKLRT PLNYILLNLV FANHFMVLCG FTVTMYTSMH GYFIFGPTGC YIEGFFATLG
  GEVALWSLVV LAVERYIVVC KPMANFRFGE NHAIMGVAFT WIMALSCAAP PLFGWSRYIP
  EGMQCSCGVD YYTLKPEVNN ESFVIYMFIV HFTIPLIVIF FCYGRLLCTV KEAAAQQQES
  LTTQKAEKEV TRMVVIMVVF FLICWVPYAY VAFYIFTHQG SNFGPVFMTV PAFFAKSSAI
  YNPVIYIVLN KQFRNCLITT LCCGKNPFGD EDGSSAATSK TEASSVSSSQ VSPA*

  >P1;JN0120
  rhodopsin - Japanese lamprey
  MNGTEGDNFY VPFSNKTGLA RSPYEYPQYY LAEPWKYSAL AAYMFFLILV GFPVNFLTLF
  VTVQHKKLRT PLNYILLNLA MANLFMVLFG FTVTMYTSMN GYFVFGPTMC SIEGFFATLG
  GEVALWSLVV LAIERYIVIC KPMGNFRFGN THAIMGVAFT WIMALACAAP PLVGWSRYIP
  EGMQCSCGPD YYTLNPNFNN ESYVVYMFVV HFLVPFVIIF FCYGRLLCTV KEAAAAQQES
  ASTQKAEKEV TRMVVLMVIG FLVCWVPYAS VAFYIFTHQG SDFGATFMTL PAFFAKSSAL
  YNPVIYILMN KQFRNCMITT LCCGKNPLGD DE-SGASTSKT EVSSVSTSPV SPA*


As with the FASTA format, each record starts with a line beginning with ">"
character.  There is then a two letter sequence type (P1, F1, DL, DC, RL,
RC, or XX), a semi colon, and the identification code.  The second like is
free text description.  The remaining lines contain the sequence itself,
terminating in an asterisk.  Space separated blocks of ten letters as shown
above are typical.

Sequence codes and their meanings:
 - P1 - Protein (complete)
 - F1 - Protein (fragment)
 - D1 - DNA (e.g. EMBOSS seqret output)
 - DL - DNA (linear)
 - DC - DNA (circular)
 - RL - RNA (linear)
 - RC - RNA (circular)
 - N3 - tRNA
 - N1 - Other functional RNA
 - XX - Unknown

    )Seq)	SeqRecord   )_get_seq_string)SequenceIterator)SequenceWriterproteinDNARNAN)	P1F1D1DLDCRLRCN3XXc                       s0   e Zd ZdZ fddZdd Zdd Z  ZS )PirIteratorzParser for PIR files.c                    s   t  j|ddd dS )a  Iterate over a PIR file and yield SeqRecord objects.

        source - file-like object or a path to a file.

        Examples
        --------
        >>> with open("NBRF/DMB_prot.pir") as handle:
        ...    for record in PirIterator(handle):
        ...        print("%s length %i" % (record.id, len(record)))
        HLA:HLA00489 length 263
        HLA:HLA00490 length 94
        HLA:HLA00491 length 94
        HLA:HLA00492 length 80
        HLA:HLA00493 length 175
        HLA:HLA01083 length 188

        tPir)modefmtN)super__init__)selfsource	__class__ C/var/www/html/myenv/lib/python3.10/site-packages/Bio/SeqIO/PirIO.pyr   q   s   zPirIterator.__init__c                 C   s   |  |}|S )z9Start parsing the file, and return a SeqRecord generator.)iterate)r   handlerecordsr   r   r    parse   s   
zPirIterator.parsec           	      c   s   |D ]
}|d dkr nqdS 	 |dd }|t vs!|d dkr%td|d	d  }|  }g }|D ]}|d dkrA n|| d
d q7d}d|}|d dkr^tdtt	|dd |||d}||j
d< t | r|t | |j
d< |V  |du rdS q)z)Iterate over the records in the PIR file.r   >NTr      ;zBRecords should start with '>XX;' where XX is a valid sequence type     *z5Sequences in PIR files should include a * terminator!)idnamedescriptionzPIR-typemolecule_type)_pir_mol_type
ValueErrorstripreadlineappendrstripreplacejoinr   r   annotations)	r   r"   linepir_type
identifierr/   linesseqrecordr   r   r    r!      sF   

zPirIterator.iterate)__name__
__module____qualname____doc__r   r$   r!   __classcell__r   r   r   r    r   n   s
    r   c                       s*   e Zd ZdZd fdd	Zdd Z  ZS )		PirWriterz Class to write PIR format files.<   Nc                    s<   t  | d| _|r|dk rtd|| _|| _|| _dS )a  Create a PIR writer.

        Arguments:
         - handle - Handle to an output file, e.g. as returned
           by open(filename, "w")
         - wrap - Optional line length used to wrap sequence lines.
           Defaults to wrapping the sequence at 60 characters
           Use zero (or None) for no wrapping, giving a single
           long line for the sequence.
         - record2title - Optional function to return the text to be
           used for the title line of each record.  By default
           a combination of the record.id, record.name and
           record.description is used.
         - code - Optional sequence code must be one of P1, F1,
           D1, DL, DC, RL, RC, N3 and XX. By default None is used,
           which means auto detection based on the molecule type
           in the record annotation.

        You can either use::

            handle = open(filename, "w")
            writer = PirWriter(handle)
            writer.write_file(myRecords)
            handle.close()

        Or, follow the sequential file writer system, for example::

            handle = open(filename, "w")
            writer = PirWriter(handle)
            writer.write_header() # does nothing for PIR files
            ...
            Multiple writer.write_record() and/or writer.write_records() calls
            ...
            writer.write_footer() # does nothing for PIR files
            handle.close()

        Nr   z-wrap should be None, 0, or a positive integer)r   r   wrapr2   record2titlecode)r   r"   rG   rH   rI   r   r   r    r      s   &
zPirWriter.__init__c           	   	   C   s  | j r| |  |}n| |j}|jr$|jr$| |jd |j }n|jr1|js1| |j}n| |j}| jr>| j}n$|jd}|du rKd}nd|v rRd}nd|v rYd}n	d	|v r`d
}nd}|tvrpt	dt
  d d|vsvJ d|vs|J | jd| d| d| d t|}d|vsJ d|vsJ | jrd}tdt|| jD ]}||||| j  d 7 }q|dd d }| j| dS | j|d  dS )z&Write a single PIR record to the file.z - r0   Nr   r	   r   r
   r   r   r   zSequence code must be one of .
r%   r'   r*   r   r+   z*
)rH   cleanr-   r.   r/   rI   r9   getr1   	TypeErrorkeysr"   writer   rG   rangelen)	r   r?   titler/   rI   r0   datar:   ir   r   r    write_record   sJ    zPirWriter.write_record)rF   NN)r@   rA   rB   rC   r   rW   rD   r   r   r   r    rE      s    /rE   __main__)run_doctest)verbose)rC   Bio.Seqr   Bio.SeqRecordr   
Interfacesr   r   r   r1   r   rE   r@   
Bio._utilsrY   r   r   r   r    <module>   s,   SLg