Source code for prody.database.pfam

# -*- coding: utf-8 -*-
"""This module defines functions for interfacing Pfam database."""

__author__ = 'Anindita Dutta, Ahmet Bakan, Cihan Kaya, James Krieger'

import re
from numbers import Integral

import numpy as np
from os.path import join
from io import BytesIO

from prody import LOGGER, PY3K
from prody.utilities import makePath, gunzip
from prody.utilities import relpath
from prody.proteins import parsePDB

import json

__all__ = ['searchPfam', 'fetchPfamMSA', 'parsePfamPDBs']

FASTA = 'fasta'
SELEX = 'selex'
STOCKHOLM = 'stockholm'

DOWNLOAD_FORMATS = set(['seed', 'full', 'uniprot', 
                        #'ncbi', 'metagenomics',
                        #'rp15', 'rp35', 'rp55', 'rp75'
                        ])
FORMAT_OPTIONS = ({'format': set([FASTA, SELEX, STOCKHOLM]),
                  'order': set(['tree', 'alphabetical']),
                  'inserts': set(['lower', 'upper']),
                  'gaps': set(['mixed', 'dots', 'dashes', 'none'])})

prefix = 'https://www.ebi.ac.uk/interpro/wwwapi/entry/'


def searchPfam(query, **kwargs):
    """Returns Pfam search results in a dictionary. Matching Pfam accessions
    as keys map to dictionaries of family metadata, including a
    ``'locations'`` list with alignment start and end residue positions
    and scores.

    :arg query: UniProt ID or PDB identifier with or without a chain
        identifier, e.g. ``'1mkp'`` or ``'1mkpA'``. The UniProt ID of the
        specified chain, or of the first protein chain, will be used for
        searching the Pfam database
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds,
        default is 60
    :type timeout: int
    """

    import requests

    seq = ''.join(query.split())

    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))
    if len(seq) <= 5:
        accession = None
        from prody import parsePDBHeader
        try:
            polymers = parsePDBHeader(seq[:4], 'polymers')
        except Exception as err:
            raise ValueError('failed to parse header for {0} ({1})'
                             .format(seq[:4], str(err)))
        else:
            chid = seq[4:].upper()

        for poly in polymers:
            if chid and poly.chid != chid:
                continue
            for dbref in poly.dbrefs:
                if dbref.database != 'UniProt':
                    continue
                accession = dbref.accession
                LOGGER.info('UniProt accession {0} for {1} chain '
                            '{2} will be used.'
                            .format(accession, seq[:4], poly.chid))
                break
            if accession is not None:
                break
        if accession is None:
            raise ValueError('A UniProt accession for PDB {0} could not be '
                             'parsed.'.format(repr(seq)))
        else:
            url = prefix + "all/protein/uniprot/" + accession
    else:
        url = prefix + "all/protein/uniprot/" + seq

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    response = None
    sleep = 2
    while LOGGER.timing('_pfam') < timeout:
        try:
            response = requests.get(url, verify=False).content
        except Exception:
            pass
        else:
            if response not in ['PEND', 'RUN']:
                break
        sleep = 20 if int(sleep * 1.5) >= 20 else int(sleep * 1.5)
        LOGGER.sleep(int(sleep), '. Trying to reconnect...')

    if not response:
        raise IOError('Pfam search timed out or failed to return results, '
                      'check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if PY3K:
        response = response.decode()

    if response.find('There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None
    elif response.find('No valid UniProt accession or ID') > 0:
        raise ValueError('No valid UniProt accession or ID for: ' + seq)

    try:
        root = json.loads(response)
    except Exception:
        raise ValueError('failed to parse results JSON, check URL: ' + url)

    matches = dict()
    for entry in root["results"]:
        try:
            metadata = entry["metadata"]
            accession = metadata["accession"]
            if isinstance(metadata["member_databases"], dict):
                accessions2 = [list(value.keys()) for value in
                               metadata["member_databases"].values()]
            else:
                accessions2 = []
        except KeyError:
            raise ValueError('failed to parse accessions from results, '
                             'check URL: ' + url)

        pfamAccessions = []
        if re.search('PF[0-9]{5}$', accession):
            pfamAccessions.append(accession)
        else:
            # look for Pfam accessions among the member database entries
            for accession in np.array(accessions2).flatten():
                if (re.search('PF[0-9]{5}$', accession) and
                        entry["proteins"][0]["entry_protein_locations"]
                        is not None):
                    pfamAccessions.append(accession)

        if len(pfamAccessions) == 0:
            continue

        for accession in pfamAccessions:
            match = matches.setdefault(accession, dict(metadata.items()))

            other_data = entry["proteins"]
            locations = match.setdefault("locations", [])
            for item1 in other_data:
                for key, value in item1.items():
                    if key == "entry_protein_locations":
                        for item2 in value:
                            new_dict = {}
                            for item3 in value[0]["fragments"]:
                                new_dict["start"] = item3["start"]
                                new_dict["end"] = item3["end"]
                                new_dict["score"] = item2["score"]
                            locations.append(new_dict)

    query = 'Query ' + repr(query)

    if matches:
        LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(query + ' did not match any Pfam families.')

    return matches
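
# A minimal usage sketch for searchPfam (illustrative only: 'P19491' is just
# an example UniProt accession, and live results depend on the InterPro web
# service being reachable):
#
#     matches = searchPfam('P19491')
#     for acc, info in matches.items():
#         # each value holds the family metadata plus a 'locations' list of
#         # dicts with 'start', 'end' and 'score' keys
#         print(acc, info['locations'])
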

def fetchPfamMSA(acc, alignment='seed', compressed=False, **kwargs):
    """Returns a path to the downloaded Pfam MSA file.

    :arg acc: Pfam ID or Accession Code
    :type acc: str

    :arg alignment: alignment type, one of ``'full'``, ``'seed'`` (default)
        or ``'uniprot'``. InterPro Pfam seems to only have seed alignments
        easily accessible in most cases.
    :type alignment: str

    :arg compressed: gzip the downloaded MSA file, default is **False**
    :type compressed: bool

    :arg timeout: timeout for blocking connection attempt in seconds,
        default is 60
    :type timeout: int

    :arg outname: out filename, default is input ``'acc_alignment.format'``
    :type outname: str

    :arg folder: output folder, default is ``'.'``
    :type folder: str
    """

    import requests

    if not re.search('(?<=PF)[0-9]{5}$', acc):
        raise ValueError('{0} is not a valid Pfam ID or Accession Code'
                         .format(repr(acc)))

    if alignment not in DOWNLOAD_FORMATS:
        raise ValueError('alignment must be one of full, seed, or uniprot')

    url = (prefix + "pfam/" + acc + '/?annotation=alignment:' +
           alignment + '&download')
    extension = '.sth'

    LOGGER.timeit('_pfam')
    timeout = kwargs.get('timeout', 60)
    response = None
    sleep = 2
    while LOGGER.timing('_pfam') < timeout:
        try:
            response = requests.get(url, verify=False).content
        except Exception:
            pass
        else:
            break
        sleep = 20 if int(sleep * 1.5) >= 20 else int(sleep * 1.5)
        LOGGER.sleep(int(sleep), '. Trying to reconnect...')

    if response is None:
        raise IOError('Pfam MSA download timed out, check URL: ' + url)

    outname = kwargs.get('outname', None)
    if not outname:
        outname = acc
    folder = str(kwargs.get('folder', '.'))
    filepath = join(makePath(folder), outname + '_' + alignment + extension)
    if compressed:
        # the InterPro download is already gzipped, so write it as is
        filepath = filepath + '.gz'
        f_out = open(filepath, 'wb')
        f_out.write(response)
        f_out.close()
    else:
        gunzip(response, filepath)

    filepath = relpath(filepath)
    LOGGER.info('Pfam MSA for {0} is written as {1}.'
                .format(acc, filepath))

    return filepath
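
# A minimal usage sketch for fetchPfamMSA (illustrative only: 'PF00069' is an
# example Pfam accession; the file is written to the current directory and
# can be parsed with ProDy's parseMSA):
#
#     filepath = fetchPfamMSA('PF00069', alignment='seed', folder='.')
#     msa = parseMSA(filepath)
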

def parsePfamPDBs(query, data=[], **kwargs):
    """Returns a list of :class:`.AtomGroup` objects containing sections of
    chains that correspond to a particular PFAM domain family. These are
    defined by alignment start and end residue numbers.

    :arg query: Pfam ID, UniProt ID or PDB ID.
        If a PDB ID is provided the corresponding UniProt ID is used.
        If this returns multiple matches then start or end must also be
        provided. This query is also used for label refinement of the
        Pfam domain MSA.
    :type query: str

    :arg data: If given, the data list from the Pfam mapping table will
        be output through this argument.
    :type data: list

    :keyword start: Residue number for defining the start of the domain.
        The PFAM domain that starts closest to this will be selected.
        Default is **1**
    :type start: int

    :keyword end: Residue number for defining the end of the domain.
        The PFAM domain that ends closest to this will be selected.
    :type end: int

    :keyword header: If **True**, the corresponding PDB headers are
        returned alongside the :class:`.AtomGroup` objects.
        Default is **False**
    :type header: bool

    :keyword only_parse: If **True**, the matching structures are returned
        without extracting the Pfam domains. Default is **False**
    :type only_parse: bool
    """

    only_parse = kwargs.pop('only_parse', False)

    start = kwargs.pop('start', 1)
    end = kwargs.pop('end', None)

    if len(query) > 4 and query.startswith('PF'):
        pfam_acc = query
    else:
        if not isinstance(start, Integral) and not isinstance(end, Integral):
            raise ValueError('Please provide an integer for start or end '
                             'when using a UniProt ID or PDB ID.')

        pfam_matches = searchPfam(query, **kwargs)
        keys = list(pfam_matches.keys())

        if isinstance(start, Integral):
            try:
                # select the Pfam family whose alignment starts closest
                # to the requested start residue
                start_diff = []
                for i, key in enumerate(pfam_matches):
                    start_diff.append(
                        int(pfam_matches[key]['locations'][0]['start']) - start)
                start_diff = np.array(start_diff)
                pfam_acc = keys[np.where(abs(start_diff) ==
                                         min(abs(start_diff)))[0][0]]
            except KeyError:
                # fall back to the older Pfam-style field name
                start_diff = []
                for i, key in enumerate(pfam_matches):
                    start_diff.append(
                        int(pfam_matches[key]['locations']['ali_start']) - start)
                start_diff = np.array(start_diff)
                pfam_acc = keys[np.where(abs(start_diff) ==
                                         min(abs(start_diff)))[0][0]]

        if isinstance(end, Integral):
            end_diff = []
            for i, key in enumerate(pfam_matches):
                end_diff.append(
                    int(pfam_matches[key]['locations'][0]['end']) - end)
            end_diff = np.array(end_diff)
            pfam_acc = keys[np.where(abs(end_diff) ==
                                     min(abs(end_diff)))[0][0]]

    from ftplib import FTP
    from .uniprot import queryUniprot

    # fetch the gzipped Pfam-to-PDB mapping table from the EBI FTP site
    data_stream = BytesIO()
    ftp_host = 'ftp.ebi.ac.uk'
    ftp = FTP(ftp_host)
    ftp.login()
    ftp.cwd('pub/databases/Pfam/current_release')
    ftp.retrbinary('RETR pdbmap.gz', data_stream.write)
    ftp.quit()
    zip_data = data_stream.getvalue()
    data_stream.close()

    rawdata = gunzip(zip_data)
    if PY3K:
        rawdata = rawdata.decode()

    fields = ['PDB_ID', 'chain', 'nothing', 'PFAM_Name', 'PFAM_ACC',
              'UniprotAcc', 'UniprotResnumRange']

    data_dicts = []
    for line in rawdata.split('\n'):
        if line.find(pfam_acc) != -1:
            data_dicts.append({})
            for j, entry in enumerate(line.strip().split('\t')):
                data_dicts[-1][fields[j]] = entry.strip(';')

    pdb_ids = [data_dict['PDB_ID'] for data_dict in data_dicts]
    chains = [data_dict['chain'] for data_dict in data_dicts]

    header = kwargs.pop('header', False)
    model = kwargs.get('model', None)
    results = parsePDB(pdb_ids, chain=chains, header=True, **kwargs)

    ags, headers = results
    ags, headers = list(ags), list(headers)

    if model == 0:
        LOGGER.info('only header is requested and returned')
        return results

    if header:
        results = (ags, headers)
    else:
        results = ags

    if only_parse:
        return [result for result in results if result is not None]

    LOGGER.progress('Extracting Pfam domains...', len(ags))
    comma_splitter = re.compile(r'\s*,\s*').split
    no_info = []
    for i, ag in enumerate(ags):
        if ag is None:
            continue
        LOGGER.update(i)

        data_dict = data_dicts[i]
        pfamRange = data_dict['UniprotResnumRange'].split('-')
        uniprotAcc = data_dict['UniprotAcc']
        try:
            uniData = queryUniprot(uniprotAcc)
        except Exception:
            LOGGER.warn('No Uniprot record found for {0}'
                        .format(data_dict['PDB_ID']))
            continue

        resrange = None
        found = False
        for key, value in uniData.items():
            if not key.startswith('dbReference'):
                continue
            try:
                pdbid = value['PDB']
            except KeyError:
                continue
            if pdbid.lower() != data_dict['PDB_ID'].lower():
                continue
            pdbchains = value['chains']

            # example chain strings: "A=27-139, B=140-150" or "A/B=27-150"
            pdbchains = comma_splitter(pdbchains)
            for chain in pdbchains:
                chids, resrange = chain.split('=')
                chids = [chid.strip() for chid in chids.split('/')]
                if data_dict['chain'] in chids:
                    resrange = resrange.split('-')
                    found = True
                    break
            if found:
                break

        if found:
            # use a separate name here so the header flag above
            # is not overwritten
            pdb_header = headers[i]
            chain_accessions = [dbref.accession for dbref in
                                pdb_header[data_dict['chain']].dbrefs]
            try:
                if len(chain_accessions) > 0:
                    right_part = np.where(np.array(chain_accessions) ==
                                          data_dict['UniprotAcc'])[0][0]
                else:
                    raise ValueError('There is no accession for a chain '
                                     'in the header')
            except Exception:
                LOGGER.warn('Could not map domains in {0}'
                            .format(data_dict['PDB_ID']
                                    + data_dict['chain']))
                no_info.append(i)
                continue

            right_dbref = pdb_header[data_dict['chain']].dbrefs[right_part]
            chain = ag.select('chain {0}'.format(data_dict['chain']))
            if chain is None:
                continue
            chainStart = chain.getResnums()[0]
            missing = chainStart - right_dbref.first[0]
            partStart = ag.getResindices()[np.where(ag.getResnums() ==
                                           right_dbref.first[0] + missing)][0]
            pfStart, pfEnd = int(pfamRange[0]), int(pfamRange[1])
            uniStart, uniEnd = int(resrange[0]), int(resrange[1])

            # map the UniProt residue range of the Pfam domain onto
            # residue indices of the parsed structure
            resiStart = pfStart - uniStart + partStart - missing
            resiEnd = pfEnd - uniStart + partStart - missing
            ags[i] = ag.select('resindex {0} to {1}'.format(
                resiStart, resiEnd))
        else:
            no_info.append(i)
    LOGGER.finish()

    for i in reversed(no_info):
        ags.pop(i)
        if header:
            headers.pop(i)

    if isinstance(data, list):
        data.extend(data_dicts)
    else:
        LOGGER.warn('data should be a list in order to get output')

    return [result for result in results if result is not None]
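
# A minimal usage sketch for parsePfamPDBs (illustrative only: 'PF00074' is
# an example Pfam accession; this downloads the Pfam pdbmap table over FTP
# and parses every matching PDB entry, so it can take some time):
#
#     data = []
#     domains = parsePfamPDBs('PF00074', data=data)
#     # each returned AtomGroup selection spans one Pfam domain, and data is
#     # filled with the matching rows of the pdbmap table
#     print(domains[0], data[0]['UniprotResnumRange'])
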