Source code for prody.database.pfam

# -*- coding: utf-8 -*-
"""This module defines functions for interfacing Pfam database."""

__author__ = 'Anindita Dutta, Ahmet Bakan, Cihan Kaya, James Krieger'

import re
from numbers import Integral

import numpy as np
from os.path import join
from io import BytesIO

from prody import LOGGER, PY3K
from prody.utilities import makePath, gunzip
from prody.utilities import relpath
from prody.proteins import parsePDB

import json

__all__ = ['searchPfam', 'fetchPfamMSA', 'parsePfamPDBs']

FASTA = 'fasta'
SELEX = 'selex'
STOCKHOLM = 'stockholm'

DOWNLOAD_FORMATS = set(['seed', 'full', 'uniprot', 
                        #'ncbi', 'metagenomics',
                        #'rp15', 'rp35', 'rp55', 'rp75'
                        ])
FORMAT_OPTIONS = ({'format': set([FASTA, SELEX, STOCKHOLM]),
                  'order': set(['tree', 'alphabetical']),
                  'inserts': set(['lower', 'upper']),
                  'gaps': set(['mixed', 'dots', 'dashes', 'none'])})

prefix = 'https://www.ebi.ac.uk/interpro/wwwapi/entry/'


def searchPfam(query, **kwargs):
    """Returns Pfam search results in a dictionary. Matching Pfam accessions
    as keys map to dictionaries of family metadata, including a
    ``'locations'`` list with alignment start and end residue positions
    and scores.

    :arg query: UniProt ID or PDB identifier with or without a chain
        identifier, e.g. ``'1mkp'`` or ``'1mkpA'``. The UniProt ID of the
        specified chain, or of the first protein chain, will be used for
        searching the Pfam database
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds,
        default is 60
    :type timeout: int
    """

    import requests

    seq = ''.join(query.split())

    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))
    if len(seq) <= 5:
        accession = None
        from prody import parsePDBHeader
        try:
            polymers = parsePDBHeader(seq[:4], 'polymers')
        except Exception as err:
            raise ValueError('failed to parse header for {0} ({1})'
                             .format(seq[:4], str(err)))
        else:
            chid = seq[4:].upper()

        for poly in polymers:
            if chid and poly.chid != chid:
                continue
            for dbref in poly.dbrefs:
                if dbref.database != 'UniProt':
                    continue
                accession = dbref.accession
                LOGGER.info('UniProt accession {0} for {1} chain '
                            '{2} will be used.'
                            .format(accession, seq[:4], poly.chid))
                break
            if accession is not None:
                break
        if accession is None:
            raise ValueError('A UniProt accession for PDB {0} could not be '
                             'parsed.'.format(repr(seq)))
        else:
            url = prefix + "all/protein/uniprot/" + accession
    else:
        url = prefix + "all/protein/uniprot/" + seq

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    response = None
    sleep = 2
    while LOGGER.timing('_pfam') < timeout:
        try:
            response = requests.get(url, verify=False).content
        except Exception:
            pass
        else:
            if response not in ['PEND', 'RUN']:
                break
        sleep = 20 if int(sleep * 1.5) >= 20 else int(sleep * 1.5)
        LOGGER.sleep(int(sleep), '. Trying to reconnect...')

    if not response:
        raise IOError('Pfam search timed out or failed to return results, '
                      'check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if PY3K:
        response = response.decode()

    if response.find('There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None
    elif response.find('No valid UniProt accession or ID') > 0:
        raise ValueError('No valid UniProt accession or ID for: ' + seq)

    try:
        root = json.loads(response)
    except Exception:
        raise ValueError('failed to parse results JSON, check URL: ' + url)

    matches = dict()
    for entry in root["results"]:
        try:
            metadata = entry["metadata"]
            accession = metadata["accession"]
            if isinstance(metadata["member_databases"], dict):
                accessions2 = [list(value.keys()) for value in
                               metadata["member_databases"].values()]
            else:
                accessions2 = []
        except KeyError:
            raise ValueError('failed to parse accessions from results, '
                             'check URL: ' + url)

        pfamAccessions = []
        if re.search('PF[0-9]{5}$', accession):
            pfamAccessions.append(accession)
        else:
            # look for Pfam accessions among the member database entries
            for accession in np.array(accessions2).flatten():
                if (re.search('PF[0-9]{5}$', accession) and
                        entry["proteins"][0]["entry_protein_locations"]
                        is not None):
                    pfamAccessions.append(accession)

        if len(pfamAccessions) == 0:
            continue

        for accession in pfamAccessions:
            match = matches.setdefault(accession, dict(metadata.items()))

            other_data = entry["proteins"]
            locations = match.setdefault("locations", [])
            for item1 in other_data:
                for key, value in item1.items():
                    if key == "entry_protein_locations":
                        for item2 in value:
                            new_dict = {}
                            for item3 in value[0]["fragments"]:
                                new_dict["start"] = item3["start"]
                                new_dict["end"] = item3["end"]
                                new_dict["score"] = item2["score"]
                            locations.append(new_dict)

    query = 'Query ' + repr(query)

    if matches:
        LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(query + ' did not match any Pfam families.')

    return matches
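
# A minimal usage sketch for searchPfam (illustrative only: 'P19491' is just
# an example UniProt accession, and live results depend on the InterPro web
# service being reachable):
#
#     matches = searchPfam('P19491')
#     for acc, info in matches.items():
#         # each value holds the family metadata plus a 'locations' list of
#         # dicts with 'start', 'end' and 'score' keys
#         print(acc, info['locations'])
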

def fetchPfamMSA(acc, alignment='seed', compressed=False, **kwargs):
    """Returns a path to the downloaded Pfam MSA file.

    :arg acc: Pfam ID or Accession Code
    :type acc: str

    :arg alignment: alignment type, one of ``'full'``, ``'seed'`` (default)
        or ``'uniprot'``. InterPro Pfam seems to only have seed alignments
        easily accessible in most cases.
    :type alignment: str

    :arg compressed: gzip the downloaded MSA file, default is **False**
    :type compressed: bool

    :arg timeout: timeout for blocking connection attempt in seconds,
        default is 60
    :type timeout: int

    :arg outname: out filename, default is input ``'acc_alignment.format'``
    :type outname: str

    :arg folder: output folder, default is ``'.'``
    :type folder: str
    """

    import requests

    if not re.search('(?<=PF)[0-9]{5}$', acc):
        raise ValueError('{0} is not a valid Pfam ID or Accession Code'
                         .format(repr(acc)))

    if alignment not in DOWNLOAD_FORMATS:
        raise ValueError('alignment must be one of full, seed, or uniprot')

    url = (prefix + "pfam/" + acc + '/?annotation=alignment:' +
           alignment + '&download')
    extension = '.sth'

    LOGGER.timeit('_pfam')
    timeout = kwargs.get('timeout', 60)
    response = None
    sleep = 2
    while LOGGER.timing('_pfam') < timeout:
        try:
            response = requests.get(url, verify=False).content
        except Exception:
            pass
        else:
            break
        sleep = 20 if int(sleep * 1.5) >= 20 else int(sleep * 1.5)
        LOGGER.sleep(int(sleep), '. Trying to reconnect...')

    if response is None:
        raise IOError('Pfam MSA download timed out, check URL: ' + url)

    outname = kwargs.get('outname', None)
    if not outname:
        outname = acc
    folder = str(kwargs.get('folder', '.'))
    filepath = join(makePath(folder), outname + '_' + alignment + extension)
    if compressed:
        # the InterPro download is already gzipped, so write it as is
        filepath = filepath + '.gz'
        f_out = open(filepath, 'wb')
        f_out.write(response)
        f_out.close()
    else:
        gunzip(response, filepath)

    filepath = relpath(filepath)
    LOGGER.info('Pfam MSA for {0} is written as {1}.'
                .format(acc, filepath))

    return filepath
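
# A minimal usage sketch for fetchPfamMSA (illustrative only: 'PF00069' is an
# example Pfam accession; the file is written to the current directory and
# can be parsed with ProDy's parseMSA):
#
#     filepath = fetchPfamMSA('PF00069', alignment='seed', folder='.')
#     msa = parseMSA(filepath)
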

def parsePfamPDBs(query, data=[], **kwargs):
    """Returns a list of :class:`.AtomGroup` objects containing sections of
    chains that correspond to a particular PFAM domain family. These are
    defined by alignment start and end residue numbers.

    :arg query: Pfam ID, UniProt ID or PDB ID.
        If a PDB ID is provided the corresponding UniProt ID is used.
        If this returns multiple matches then start or end must also be
        provided. This query is also used for label refinement of the
        Pfam domain MSA.
    :type query: str

    :arg data: If given, the data list from the Pfam mapping table will
        be output through this argument.
    :type data: list

    :keyword start: Residue number for defining the start of the domain.
        The PFAM domain that starts closest to this will be selected.
        Default is **1**
    :type start: int

    :keyword end: Residue number for defining the end of the domain.
        The PFAM domain that ends closest to this will be selected.
    :type end: int

    :keyword header: If **True**, the corresponding PDB headers are
        returned alongside the :class:`.AtomGroup` objects.
        Default is **False**
    :type header: bool

    :keyword only_parse: If **True**, the matching structures are returned
        without extracting the Pfam domains. Default is **False**
    :type only_parse: bool
    """

    only_parse = kwargs.pop('only_parse', False)

    start = kwargs.pop('start', 1)
    end = kwargs.pop('end', None)

    if len(query) > 4 and query.startswith('PF'):
        pfam_acc = query
    else:
        if not isinstance(start, Integral) and not isinstance(end, Integral):
            raise ValueError('Please provide an integer for start or end '
                             'when using a UniProt ID or PDB ID.')

        pfam_matches = searchPfam(query, **kwargs)
        keys = list(pfam_matches.keys())

        if isinstance(start, Integral):
            try:
                # select the Pfam family whose alignment starts closest
                # to the requested start residue
                start_diff = []
                for i, key in enumerate(pfam_matches):
                    start_diff.append(
                        int(pfam_matches[key]['locations'][0]['start']) - start)
                start_diff = np.array(start_diff)
                pfam_acc = keys[np.where(abs(start_diff) ==
                                         min(abs(start_diff)))[0][0]]
            except KeyError:
                # fall back to the older Pfam-style field name
                start_diff = []
                for i, key in enumerate(pfam_matches):
                    start_diff.append(
                        int(pfam_matches[key]['locations']['ali_start']) - start)
                start_diff = np.array(start_diff)
                pfam_acc = keys[np.where(abs(start_diff) ==
                                         min(abs(start_diff)))[0][0]]

        if isinstance(end, Integral):
            end_diff = []
            for i, key in enumerate(pfam_matches):
                end_diff.append(
                    int(pfam_matches[key]['locations'][0]['end']) - end)
            end_diff = np.array(end_diff)
            pfam_acc = keys[np.where(abs(end_diff) ==
                                     min(abs(end_diff)))[0][0]]

    from ftplib import FTP
    from .uniprot import queryUniprot

    # fetch the gzipped Pfam-to-PDB mapping table from the EBI FTP site
    data_stream = BytesIO()
    ftp_host = 'ftp.ebi.ac.uk'
    ftp = FTP(ftp_host)
    ftp.login()
    ftp.cwd('pub/databases/Pfam/current_release')
    ftp.retrbinary('RETR pdbmap.gz', data_stream.write)
    ftp.quit()
    zip_data = data_stream.getvalue()
    data_stream.close()

    rawdata = gunzip(zip_data)
    if PY3K:
        rawdata = rawdata.decode()

    fields = ['PDB_ID', 'chain', 'nothing', 'PFAM_Name', 'PFAM_ACC',
              'UniprotAcc', 'UniprotResnumRange']

    data_dicts = []
    for line in rawdata.split('\n'):
        if line.find(pfam_acc) != -1:
            data_dicts.append({})
            for j, entry in enumerate(line.strip().split('\t')):
                data_dicts[-1][fields[j]] = entry.strip(';')

    pdb_ids = [data_dict['PDB_ID'] for data_dict in data_dicts]
    chains = [data_dict['chain'] for data_dict in data_dicts]

    header = kwargs.pop('header', False)
    model = kwargs.get('model', None)
    results = parsePDB(pdb_ids, chain=chains, header=True, **kwargs)

    ags, headers = results
    ags, headers = list(ags), list(headers)

    if model == 0:
        LOGGER.info('only header is requested and returned')
        return results

    if header:
        results = (ags, headers)
    else:
        results = ags

    if only_parse:
        return [result for result in results if result is not None]

    LOGGER.progress('Extracting Pfam domains...', len(ags))
    comma_splitter = re.compile(r'\s*,\s*').split
    no_info = []
    for i, ag in enumerate(ags):
        if ag is None:
            continue
        LOGGER.update(i)

        data_dict = data_dicts[i]
        pfamRange = data_dict['UniprotResnumRange'].split('-')
        uniprotAcc = data_dict['UniprotAcc']
        try:
            uniData = queryUniprot(uniprotAcc)
        except Exception:
            LOGGER.warn('No Uniprot record found for {0}'
                        .format(data_dict['PDB_ID']))
            continue

        resrange = None
        found = False
        for key, value in uniData.items():
            if not key.startswith('dbReference'):
                continue
            try:
                pdbid = value['PDB']
            except KeyError:
                continue
            if pdbid.lower() != data_dict['PDB_ID'].lower():
                continue
            pdbchains = value['chains']

            # example chain strings: "A=27-139, B=140-150" or "A/B=27-150"
            pdbchains = comma_splitter(pdbchains)
            for chain in pdbchains:
                chids, resrange = chain.split('=')
                chids = [chid.strip() for chid in chids.split('/')]
                if data_dict['chain'] in chids:
                    resrange = resrange.split('-')
                    found = True
                    break
            if found:
                break

        if found:
            # use a separate name here so the header flag above
            # is not overwritten
            pdb_header = headers[i]
            chain_accessions = [dbref.accession for dbref in
                                pdb_header[data_dict['chain']].dbrefs]
            try:
                if len(chain_accessions) > 0:
                    right_part = np.where(np.array(chain_accessions) ==
                                          data_dict['UniprotAcc'])[0][0]
                else:
                    raise ValueError('There is no accession for a chain '
                                     'in the header')
            except Exception:
                LOGGER.warn('Could not map domains in {0}'
                            .format(data_dict['PDB_ID']
                                    + data_dict['chain']))
                no_info.append(i)
                continue

            right_dbref = pdb_header[data_dict['chain']].dbrefs[right_part]
            chain = ag.select('chain {0}'.format(data_dict['chain']))
            if chain is None:
                continue
            chainStart = chain.getResnums()[0]
            missing = chainStart - right_dbref.first[0]
            partStart = ag.getResindices()[np.where(ag.getResnums() ==
                                           right_dbref.first[0] + missing)][0]
            pfStart, pfEnd = int(pfamRange[0]), int(pfamRange[1])
            uniStart, uniEnd = int(resrange[0]), int(resrange[1])

            # map the UniProt residue range of the Pfam domain onto
            # residue indices of the parsed structure
            resiStart = pfStart - uniStart + partStart - missing
            resiEnd = pfEnd - uniStart + partStart - missing
            ags[i] = ag.select('resindex {0} to {1}'.format(
                resiStart, resiEnd))
        else:
            no_info.append(i)
    LOGGER.finish()

    for i in reversed(no_info):
        ags.pop(i)
        if header:
            headers.pop(i)

    if isinstance(data, list):
        data.extend(data_dicts)
    else:
        LOGGER.warn('data should be a list in order to get output')

    return [result for result in results if result is not None]
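
# A minimal usage sketch for parsePfamPDBs (illustrative only: 'PF00074' is
# an example Pfam accession; this downloads the Pfam pdbmap table over FTP
# and parses every matching PDB entry, so it can take some time):
#
#     data = []
#     domains = parsePfamPDBs('PF00074', data=data)
#     # each returned AtomGroup selection spans one Pfam domain, and data is
#     # filled with the matching rows of the pdbmap table
#     print(domains[0], data[0]['UniprotResnumRange'])
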