# -*- coding: utf-8 -*-
"""This module defines functions for parsing and writing `PDB files`_.
.. _PDB files: http://www.wwpdb.org/documentation/format32/v3.2.html"""
from collections import defaultdict
import os.path
import time
from numbers import Integral
import numpy as np
from prody.atomic import AtomGroup, Atom, Selection
from prody.atomic import flags, AAMAP
from prody.atomic import ATOMIC_FIELDS
from prody.utilities import openFile, isListLike
from prody.utilities.misctools import decToHybrid36, hybrid36ToDec, packmolRenumChains
from prody import LOGGER, SETTINGS
from .header import getHeaderDict, buildBiomolecules, assignSecstr, isHelix, isSheet
from .localpdb import fetchPDB
from .ciffile import parseMMCIF
from .emdfile import parseEMD
__all__ = ['parsePDBStream', 'parsePDB', 'parseChainsList', 'parsePQR',
'writePDBStream', 'writePDB', 'writeChainsList', 'writePQR',
'writePQRStream']
MAX_N_ATOM = 99999
MAX_N_RES = 9999
class PDBParseError(Exception):
pass
_parsePQRdoc = """
:arg title: title of the :class:`.AtomGroup` instance, default is the
PDB filename or PDB identifier
:type title: str
:arg ag: :class:`.AtomGroup` instance for storing data parsed from PDB
file, number of atoms in *ag* and number of atoms parsed from the PDB
file must be the same and atoms in *ag* and those in PDB file must
be in the same order. Non-coordinate data stored in *ag* will be
overwritten with those parsed from the file.
:type ag: :class:`.AtomGroup`
:arg chain: chain identifiers for parsing specific chains, e.g.
``chain='A'``, ``chain='B'``, ``chain='DE'``, by default all
chains are parsed
:type chain: str
:arg subset: a predefined keyword to parse subset of atoms, valid keywords
are ``'calpha'`` (``'ca'``), ``'backbone'`` (``'bb'``), or **None**
(read all atoms), e.g. ``subset='bb'``
:type subset: str
"""
_parsePDBdoc = _parsePQRdoc + """
:arg model: model index or None (read all models), e.g. ``model=10``
:type model: int, list
:arg header: if **True** PDB header content will be parsed and returned
:type header: bool
:arg altloc: if a location indicator is passed, such as ``'A'`` or ``'B'``,
only indicated alternate locations will be parsed as the single
coordinate set of the AtomGroup, if *altloc* is set ``'all'`` then all
alternate locations will be parsed and each will be appended as a
distinct coordinate set, default is ``"A"``
:type altloc: str
:arg biomol: if **True**, biomolecules are obtained by transforming the
coordinates using information from header section will be returned.
This option uses :func:`.buildBiomolecules` and as noted there,
atoms in biomolecules are ordered according to the original chain
IDs. Chains may have the same chain ID, in which case they are given
different segment names.
Default is **False**
:type biomol: bool
:arg secondary: if **True**, secondary structure information from header
section will be assigned to atoms.
Default is **False**
:type secondary: bool
If ``model=0`` and ``header=True``, return header dictionary only.
"""
_PDBSubsets = {'ca': 'ca', 'calpha': 'ca', 'bb': 'bb', 'backbone': 'bb'}
[docs]def parsePDB(*pdb, **kwargs):
"""Returns an :class:`.AtomGroup` and/or dictionary containing header data
parsed from a PDB file.
This function extends :func:`.parsePDBStream`.
See :ref:`parsepdb` for a detailed usage example.
:arg pdb: one PDB identifier or filename, or a list of them.
If needed, PDB files are downloaded using :func:`.fetchPDB()` function.
You can also provide arguments that you would like passed on to fetchPDB().
:arg extend_biomol: whether to extend the list of results with a list
rather than appending, which can create a mixed list,
especially when biomol=True.
Default value is False to reproduce previous behaviour.
This value is ignored when result is not a list (header=True or model=0).
:type extend_biomol: bool
Please note that resnames are only taken as 3 characters and chids can be 2.
Hence, TIP3S is split into resname TIP and chid 3S.
"""
extend_biomol = kwargs.pop('extend_biomol', False)
n_pdb = len(pdb)
if n_pdb == 0:
raise ValueError('Please provide a PDB ID or filename')
if n_pdb == 1:
if isListLike(pdb[0]) or isinstance(pdb[0], dict):
pdb = pdb[0]
n_pdb = len(pdb)
if n_pdb == 1:
return _parsePDB(pdb[0], **kwargs)
else:
results = []
lstkwargs = {}
for key in kwargs:
argval = kwargs.get(key)
if np.isscalar(argval):
argval = [argval]*n_pdb
lstkwargs[key] = argval
start = time.time()
LOGGER.progress('Retrieving {0} PDB structures...'
.format(n_pdb), n_pdb, '_prody_parsePDB')
for i, p in enumerate(pdb):
kwargs = {}
for key in lstkwargs:
kwargs[key] = lstkwargs[key][i]
c = kwargs.get('chain','')
LOGGER.update(i, 'Retrieving {0}...'.format(p+c),
label='_prody_parsePDB')
result = _parsePDB(p, **kwargs)
if not isinstance(result, tuple):
if isinstance(result, dict):
result = (None, result)
else:
result = (result, None)
results.append(result)
results = list(zip(*results))
LOGGER.finish()
for i in reversed(range(len(results))):
if all(j is None for j in results[i]):
results.pop(i)
if len(results) == 1:
results = results[0]
results = list(results)
model = kwargs.get('model')
header = kwargs.get('header', False)
if model != 0 and header:
numPdbs = len(results[0])
else:
numPdbs = len(results)
if extend_biomol:
results_old = results
results = []
for entry in results_old:
if isinstance(entry, AtomGroup):
results.append(entry)
else:
results.extend(entry)
LOGGER.info('{0} PDBs were parsed in {1:.2f}s.'
.format(numPdbs, time.time()-start))
return results
def _getPDBid(pdb):
l = len(pdb)
if l == 4:
pdbid, chain = pdb, ''
elif l == 5:
pdbid = pdb[:4]; chain = pdb[-1]
elif ':' in pdb:
i = pdb.find(':')
pdbid = pdb[:i]; chain = pdb[i+1:]
else:
raise IOError('{0} is not a valid filename or a valid PDB '
'identifier.'.format(pdb))
if not pdbid.isalnum():
raise IOError('{0} is not a valid filename or a valid PDB '
'identifier.'.format(pdb))
if chain != '' and not chain.isalnum():
raise IOError('{0} is not a valid chain identifier.'.format(chain))
return pdbid, chain
def _parsePDB(pdb, **kwargs):
title = kwargs.get('title', None)
chain = ''
if not os.path.isfile(pdb):
pdb, chain = _getPDBid(pdb)
if title is None:
title = pdb
kwargs['title'] = title
filename = fetchPDB(pdb, **kwargs)
if filename is None:
try:
LOGGER.warn("Trying to parse mmCIF file instead")
chain = kwargs.pop('chain', chain)
return parseMMCIF(pdb+chain, **kwargs)
except OSError:
try:
LOGGER.warn("Trying to parse EMD file instead")
chain = kwargs.pop('chain', chain)
return parseEMD(pdb+chain, **kwargs)
except:
raise IOError('PDB file for {0} could not be parsed.'
.format(pdb))
pdb = filename
if title is None:
title, ext = os.path.splitext(os.path.split(pdb)[1])
if ext == '.gz':
title, ext = os.path.splitext(title)
if len(title) == 7 and title.startswith('pdb'):
title = title[3:]
kwargs['title'] = title
if pdb.endswith('.pdb') or pdb.endswith('.pdb.gz'):
stream = openFile(pdb, 'rt')
if chain != '':
kwargs['chain'] = chain
result = parsePDBStream(stream, **kwargs)
stream.close()
return result
else:
try:
LOGGER.warn("Trying to parse as mmCIF file instead")
return parseMMCIF(pdb, **kwargs)
except KeyError:
try:
LOGGER.warn("Trying to parse as EMD file instead")
return parseEMD(pdb, **kwargs)
except ValueError:
LOGGER.warn("Could not parse anything so returning None")
return None
except:
raise IOError('PDB file for {0} could not be downloaded.'
.format(pdb))
parsePDB.__doc__ += _parsePDBdoc
[docs]def parsePDBStream(stream, **kwargs):
"""Returns an :class:`.AtomGroup` and/or dictionary containing header data
parsed from a stream of PDB lines.
:arg stream: Anything that implements the method ``readlines``
(e.g. :class:`file`, buffer, stdin)"""
model = kwargs.get('model')
header = kwargs.get('header', False)
assert isinstance(header, bool), 'header must be a boolean'
chain = kwargs.get('chain')
subset = kwargs.get('subset')
altloc = kwargs.get('altloc', 'A')
packmol = kwargs.get('packmol', False)
auto_bonds = SETTINGS.get('auto_bonds')
get_bonds = kwargs.get('bonds', auto_bonds)
long_resname = kwargs.get('long_resname')
long_chid = kwargs.get('long_chid')
if model is not None:
if isinstance(model, Integral):
if model < 0:
raise ValueError('model must be greater than 0')
else:
raise TypeError('model must be an integer, {0} is invalid'
.format(str(model)))
title_suffix = ''
if subset:
try:
subset = _PDBSubsets[subset.lower()]
except AttributeError:
raise TypeError('subset must be a string')
except KeyError:
raise ValueError('{0} is not a valid subset'
.format(repr(subset)))
title_suffix = '_' + subset
if chain is not None:
if not isinstance(chain, str):
raise TypeError('chain must be a string')
elif len(chain) == 0:
raise ValueError('chain must not be an empty string')
title_suffix = chain + title_suffix
ag = kwargs.pop('ag', None)
if ag is not None:
if not isinstance(ag, AtomGroup):
raise TypeError('ag must be an AtomGroup instance')
n_csets = ag.numCoordsets()
elif model != 0:
ag = AtomGroup(str(kwargs.get('title', 'Unknown')) + title_suffix)
n_csets = 0
biomol = kwargs.get('biomol', False)
auto_secondary = SETTINGS.get('auto_secondary')
secondary = kwargs.get('secondary', auto_secondary)
split = 0
hd = None
if model != 0:
LOGGER.timeit()
try:
lines = stream.readlines()
except AttributeError as err:
try:
lines = stream.read().split('\n')
except AttributeError:
raise err
if not len(lines):
raise ValueError('empty PDB file or stream')
if header or biomol or secondary:
hd, split = getHeaderDict(lines)
bonds = [] if get_bonds else None
_parsePDBLines(ag, lines, split, model, chain, subset, altloc, bonds=bonds,
long_resname=long_resname, long_chid=long_chid)
if bonds:
try:
ag.setBonds(bonds)
except ValueError:
LOGGER.warn('Bonds read from CONECT records do not apply to subset so were not added')
if ag.numAtoms() > 0:
LOGGER.report('{0} atoms and {1} coordinate set(s) were '
'parsed in %.2fs.'.format(ag.numAtoms(),
ag.numCoordsets() - n_csets))
else:
ag = None
LOGGER.warn('Atomic data could not be parsed, please '
'check the input file.')
elif header:
hd, split = getHeaderDict(stream)
if ag is not None and isinstance(hd, dict):
if secondary:
if auto_secondary:
try:
ag = assignSecstr(hd, ag)
except ValueError:
pass
else:
ag = assignSecstr(hd, ag)
if biomol:
ag = buildBiomolecules(hd, ag)
if isinstance(ag, list):
LOGGER.info('Biomolecular transformations were applied, {0} '
'biomolecule(s) are returned.'.format(len(ag)))
else:
LOGGER.info('Biomolecular transformations were applied to the '
'coordinate data.')
if packmol:
ag = packmolRenumChains(ag)
if model != 0:
if header:
return ag, hd
else:
return ag
else:
return hd
parsePDBStream.__doc__ += _parsePDBdoc
[docs]def parsePQR(filename, **kwargs):
"""Returns an :class:`.AtomGroup` containing data parsed from PDB lines.
:arg filename: a PQR filename
:type filename: str"""
title = kwargs.get('title', kwargs.get('name'))
chain = kwargs.get('chain')
subset = kwargs.get('subset')
long_resname = kwargs.get('long_resname')
long_chid = kwargs.get('long_chid')
if not os.path.isfile(filename):
raise IOError('No such file: {0}'.format(repr(filename)))
if title is None:
fn, ext = os.path.splitext(os.path.split(filename)[1])
if ext == '.gz':
fn, ext = os.path.splitext(fn)
title = fn.lower()
title_suffix = ''
if subset:
try:
subset = _PDBSubsets[subset.lower()]
except AttributeError:
raise TypeError('subset must be a string')
except KeyError:
raise ValueError('{0} is not a valid subset'
.format(repr(subset)))
title_suffix = '_' + subset
if chain is not None:
if not isinstance(chain, str):
raise TypeError('chain must be a string')
elif len(chain) == 0:
raise ValueError('chain must not be an empty string')
title_suffix = '_' + chain + title_suffix
if 'ag' in kwargs:
ag = kwargs['ag']
if not isinstance(ag, AtomGroup):
raise TypeError('ag must be an AtomGroup instance')
n_csets = ag.numCoordsets()
else:
ag = AtomGroup(title + title_suffix)
n_csets = 0
pqr = openFile(filename, 'rt')
lines = pqr.readlines()
pqr.close()
LOGGER.timeit()
ag = _parsePDBLines(ag, lines, split=0, model=1, chain=chain,
subset=subset, altloc_torf=False, format='pqr',
long_resname=long_resname, long_chid=long_chid)
if ag.numAtoms() > 0:
LOGGER.report('{0} atoms and {1} coordinate sets were '
'parsed in %.2fs.'.format(ag.numAtoms(),
ag.numCoordsets() - n_csets))
return ag
else:
return None
parsePQR.__doc__ += _parsePQRdoc
def _parsePDBLines(atomgroup, lines, split, model, chain, subset,
altloc_torf, format='PDB', bonds=None,
long_resname=False, long_chid=False):
"""Returns an AtomGroup. See also :func:`.parsePDBStream()`.
:arg lines: PDB/PQR lines
:arg split: starting index for coordinate data lines"""
format = format.upper()
isPDB = format == 'PDB'
num_ters = 0
if subset:
if subset == 'ca':
subset = set(('CA',))
elif subset in 'bb':
subset = flags.BACKBONE
only_subset = True
protein_resnames = flags.AMINOACIDS
else:
only_subset = False
if chain is None:
only_chains = False
else:
only_chains = True
onlycoords = False
n_atoms = atomgroup.numAtoms()
if n_atoms > 0:
asize = n_atoms
else:
asize = len(lines) - split
addcoords = False
if atomgroup.numCoordsets() > 0:
addcoords = True
alength = asize
coordinates = np.zeros((asize, 3), dtype=float)
atomnames = np.zeros(asize, dtype=ATOMIC_FIELDS['name'].dtype)
resnames = np.zeros(asize, dtype=ATOMIC_FIELDS['resname'].dtype)
resnums = np.zeros(asize, dtype=ATOMIC_FIELDS['resnum'].dtype)
chainids = np.zeros(asize, dtype=ATOMIC_FIELDS['chain'].dtype)
hetero = np.zeros(asize, dtype=bool)
termini = np.zeros(asize, dtype=bool)
altlocs = np.zeros(asize, dtype=ATOMIC_FIELDS['altloc'].dtype)
icodes = np.zeros(asize, dtype=ATOMIC_FIELDS['icode'].dtype)
serials = np.zeros(asize, dtype=ATOMIC_FIELDS['serial'].dtype)
charges = np.zeros(asize, dtype=ATOMIC_FIELDS['charge'].dtype)
if isPDB:
segnames = np.zeros(asize, dtype=ATOMIC_FIELDS['segment'].dtype)
elements = np.zeros(asize, dtype=ATOMIC_FIELDS['element'].dtype)
bfactors = np.zeros(asize, dtype=ATOMIC_FIELDS['beta'].dtype)
occupancies = np.zeros(asize, dtype=ATOMIC_FIELDS['occupancy'].dtype)
siguij = None
else:
radii = np.zeros(asize, dtype=ATOMIC_FIELDS['radius'].dtype)
anisou = None
asize = 2000 # increase array length by this much when needed
start = split
stop = len(lines)
nmodel = 0
bonds_serial = []
# if a specific model is requested, skip lines until that one
if isPDB and model is not None and model != 1:
for i in range(split, len(lines)):
if lines[i][:5] == 'MODEL':
nmodel += 1
if model == nmodel:
start = i+1
stop = len(lines)
break
if nmodel != model:
raise PDBParseError('model {0} is not found'.format(model))
if isinstance(altloc_torf, str):
if altloc_torf == 'all':
which_altlocs = 'all'
elif altloc_torf.strip() != 'A':
LOGGER.info('Parsing alternate locations {0}.'
.format(altloc_torf))
which_altlocs = ' ' + ''.join(altloc_torf.split())
else:
which_altlocs = ' A'
altloc_torf = False
else:
which_altlocs = ' A'
altloc_torf = True
acount = 0
coordsets = None
altloc = defaultdict(list)
i = start
END = False
warned_5_digit = False
warned_long_chid = False
dec = True
while i < stop:
line = lines[i]
startswith = line[0:6].strip()
if startswith == 'ATOM' or startswith == 'HETATM':
if isPDB:
atomname = line[12:16].strip()
if long_resname:
resname = line[17:21].strip()
else:
resname = line[17:20].strip()
else:
fields = line.split()
if fields[5].find('.') != -1:
# coords too early as no chid
fields.insert(4, '')
if len(fields) != 11:
try:
fields = fields[:6] + [line[30:38].strip(), line[38:46].strip(), line[46:54].strip()] + line[54:].split()
except:
LOGGER.warn('wrong number of fields for PQR format at line %d'%i)
i += 1
continue
atomname= fields[2]
resname = fields[3]
if only_subset:
if not (atomname in subset and resname in protein_resnames):
i += 1
continue
if isPDB:
if long_chid:
chid = line[20:22].strip()
if len(chid) > 1 and not warned_long_chid:
LOGGER.warn('Parsed 2-character chid {0} continuous with resnum {1} from {2}. Please check if this was intended.'.format(
chid, line[17:20], line[17:22]
))
warned_long_chid = True
else:
chid = line[21].strip()
else:
chid = fields[4]
if only_chains:
if not chid in chain:
i += 1
continue
if isPDB:
alt = line[16]
if alt not in which_altlocs and which_altlocs != 'all':
altloc[alt].append((line, i))
i += 1
continue
else:
alt = ' '
try:
if isPDB:
coordinates[acount, 0] = line[30:38]
coordinates[acount, 1] = line[38:46]
coordinates[acount, 2] = line[46:54]
else:
coordinates[acount, 0] = fields[6]
coordinates[acount, 1] = fields[7]
coordinates[acount, 2] = fields[8]
except:
if acount >= n_atoms > 0:
if nmodel == 0:
raise ValueError(format + 'file and AtomGroup ag must '
'have same number of atoms')
LOGGER.warn('Discarding model {0}, which contains {1} more '
'atoms than first model does.'
.format(nmodel+1,acount-n_atoms+1))
acount = 0
nmodel += 1
coordinates = np.zeros((n_atoms, 3), dtype=float)
if isPDB:
while lines[i][:6] != 'ENDMDL':
i += 1
else:
raise PDBParseError('invalid or missing coordinate(s) at '
'line {0}'.format(i+1))
if onlycoords:
acount += 1
i += 1
continue
serial_str = line[6:11] if isPDB else fields[1]
try:
serials[acount] = int(serial_str)
except ValueError:
try:
isnumeric = np.alltrue([x.isdigit() for x in serial_str])
if not isnumeric and serial_str == serial_str.upper():
serials[acount] = hybrid36ToDec(serial_str)
else:
# lower case is found in hexadecimal PDB files
serials[acount] = int(serial_str, 16)
except ValueError:
if acount > 0:
LOGGER.warn('failed to parse serial number in line {0}. Assigning it by incrementing.'
.format(i))
serials[acount] = serials[acount-1]+1
else:
LOGGER.warn('failed to parse serial number in line {0}. Assigning it as 1.'
.format(i))
serials[acount] = 1
altlocs[acount] = alt
atomnames[acount] = atomname
resnames[acount] = resname
chainids[acount] = chid
if isPDB:
resnum_str = line[22:26]
icode = line[26]
if dec:
try:
resnum = int(resnum_str)
except ValueError:
dec = False
if icode.isdigit():
if not warned_5_digit:
LOGGER.warn('parsed 5 digit residue number including numeric insertion code')
warned_5_digit = True
resnum = int(str(resnum) + icode)
else:
icodes[acount] = icode
if dec and acount > 2 and resnums[acount-2] > resnum and resnums[acount-2] >= MAX_N_RES:
dec = False
if not dec:
resnum = resnum_str
try:
isnumeric = np.alltrue([x.isdigit() or x==' ' for x in resnum_str])
if not isnumeric and resnum_str == resnum_str.upper():
resnum = hybrid36ToDec(resnum_str, resnum=True)
else:
# lower case is found in hexadecimal PDB files
resnum = int(resnum_str, 16)
except ValueError:
if acount > 0:
LOGGER.warn('failed to parse residue number in line {0}. Assigning it by incrementing.'
.format(i))
resnum = resnums[acount-1]+1
else:
LOGGER.warn('failed to parse residue number in line {0}. Assigning it as 1.'
.format(i))
resnum = 1
resnums[acount] = resnum
else:
resnum = fields[5]
if resnum[-1].isalpha():
icode = resnum[-1]
else:
icode = ' '
try:
resnums[acount] = int(resnum)
except ValueError:
try:
resnums[acount] = int(resnum, 16)
except ValueError:
LOGGER.warn('failed to parse residue number in line {0}. Assigning it by incrementing.'
.format(i))
resnums[acount] = resnums[acount-1]+1
icodes[acount] = icode
if isPDB:
try:
occupancies[acount] = line[54:60]
except:
LOGGER.warn('failed to parse occupancy at line {0}'
.format(i))
try:
bfactors[acount] = line[60:66]
except:
LOGGER.warn('failed to parse beta-factor at line {0}'
.format(i))
hetero[acount] = startswith[0] == 'H'
segnames[acount] = line[72:76]
elements[acount] = line[76:78]
try:
charges[acount] = int(line[79] + line[78])
except:
charges[acount] = 0
else:
try:
charges[acount] = fields[9]
except:
LOGGER.warn('failed to parse charge at line {0}'
.format(i))
try:
radii[acount] = fields[10]
except:
LOGGER.warn('failed to parse radius at line {0}'
.format(i))
acount += 1
if n_atoms == 0 and acount >= alength:
# if arrays are short extend them with zeros
alength += asize
coordinates = np.concatenate(
(coordinates, np.zeros((asize, 3), float)))
atomnames = np.concatenate((atomnames,
np.zeros(asize, ATOMIC_FIELDS['name'].dtype)))
resnames = np.concatenate((resnames,
np.zeros(asize, ATOMIC_FIELDS['resname'].dtype)))
resnums = np.concatenate((resnums,
np.zeros(asize, ATOMIC_FIELDS['resnum'].dtype)))
chainids = np.concatenate((chainids,
np.zeros(asize, ATOMIC_FIELDS['chain'].dtype)))
hetero = np.concatenate((hetero, np.zeros(asize, bool)))
termini = np.concatenate((termini, np.zeros(asize, bool)))
altlocs = np.concatenate((altlocs,
np.zeros(asize, ATOMIC_FIELDS['altloc'].dtype)))
icodes = np.concatenate((icodes,
np.zeros(asize, ATOMIC_FIELDS['icode'].dtype)))
serials = np.concatenate((serials,
np.zeros(asize, ATOMIC_FIELDS['serial'].dtype)))
if isPDB:
bfactors = np.concatenate((bfactors,
np.zeros(asize, ATOMIC_FIELDS['beta'].dtype)))
occupancies = np.concatenate((occupancies,
np.zeros(asize, ATOMIC_FIELDS['occupancy'].dtype)))
segnames = np.concatenate((segnames,
np.zeros(asize, ATOMIC_FIELDS['segment'].dtype)))
elements = np.concatenate((elements,
np.zeros(asize, ATOMIC_FIELDS['element'].dtype)))
if anisou is not None:
anisou = np.concatenate((anisou, np.zeros((asize, 6), float)))
if siguij is not None:
siguij = np.concatenate((siguij, np.zeros((asize, 6),
ATOMIC_FIELDS['siguij'].dtype)))
else:
charges = np.concatenate((charges,
np.zeros(asize, ATOMIC_FIELDS['charge'].dtype)))
radii = np.concatenate((radii,
np.zeros(asize, ATOMIC_FIELDS['radius'].dtype)))
elif startswith == 'CONECT':
if bonds is not None:
atom_serial = line[6:11]
bonded1_serial = line[11:16]
bonds_serial.append([int(atom_serial), int(bonded1_serial)])
bonded2_serial = line[16:21]
if len(bonded2_serial.strip()):
bonds_serial.append([int(atom_serial), int(bonded2_serial)])
bonded3_serial = line[21:26]
if len(bonded3_serial.strip()):
bonds_serial.append([int(atom_serial), int(bonded3_serial)])
bonded4_serial = line[26:31] # fixed typo
if len(bonded4_serial.strip()):
bonds_serial.append([int(atom_serial), int(bonded4_serial)])
elif not onlycoords and (startswith == 'TER ' or
startswith.strip() == 'TER'):
termini[acount - 1] = True
num_ters += 1
elif startswith == 'ENDMDL' or startswith[:3] == 'END':
if acount == 0:
# If there is no atom record between ENDMDL & END skip to next
i += 1
continue
if model is not None:
if bonds is None:
i += 1
break
else:
i += 1
for j in range(i, stop):
if lines[j].startswith('CONECT'):
i = j
break
continue
diff = stop - i - 1
END = diff < acount
if coordsets is not None:
END = END or nmodel >= coordsets.shape[0]
if onlycoords:
if acount < n_atoms:
LOGGER.warn('Discarding model {0}, which contains '
'{1} fewer atoms than the first model '
'does.'.format(nmodel+1, n_atoms-acount))
else:
coordsets[nmodel] = coordinates
nmodel += 1
acount = 0
if not END:
coordinates = coordsets[nmodel]
else:
if acount != n_atoms > 0:
raise ValueError('PDB file and AtomGroup ag must have '
'same number of atoms')
# this is where to decide if more coordsets should be expected
if END:
coordinates.resize((acount, 3), refcheck=False)
if addcoords:
atomgroup.addCoordset(coordinates, anisous=anisou)
else:
atomgroup._setCoords(coordinates)
if isPDB and anisou is not None:
anisou.resize((acount, 6), refcheck=False)
atomgroup.setAnisous(anisou / 10000)
else:
coordsets = np.zeros((int(diff//acount+1), acount, 3))
coordsets[0] = coordinates[:acount]
onlycoords = True
atomnames.resize(acount, refcheck=False)
resnames.resize(acount, refcheck=False)
resnums.resize(acount, refcheck=False)
chainids.resize(acount, refcheck=False)
hetero.resize(acount, refcheck=False)
termini.resize(acount, refcheck=False)
altlocs.resize(acount, refcheck=False)
icodes.resize(acount, refcheck=False)
serials.resize(acount, refcheck=False)
if not only_subset:
atomnames = np.char.strip(atomnames)
resnames = np.char.strip(resnames)
atomgroup.setNames(atomnames)
atomgroup.setResnames(resnames)
atomgroup.setResnums(resnums)
atomgroup.setChids(chainids)
atomgroup.setFlags('hetatm', hetero)
atomgroup.setFlags('pdbter', termini)
atomgroup.setFlags('selpdbter', termini)
atomgroup.setAltlocs(altlocs)
atomgroup.setIcodes(np.char.strip(icodes))
atomgroup.setSerials(serials)
if isPDB:
bfactors.resize(acount, refcheck=False)
occupancies.resize(acount, refcheck=False)
segnames.resize(acount, refcheck=False)
elements.resize(acount, refcheck=False)
atomgroup.setBetas(bfactors)
atomgroup.setOccupancies(occupancies)
atomgroup.setSegnames(np.char.strip(segnames))
atomgroup.setElements(np.char.strip(elements))
from prody.utilities.misctools import getMasses
atomgroup.setMasses(getMasses(np.char.strip(elements)))
if anisou is not None:
anisou.resize((acount, 6), refcheck=False)
atomgroup.setAnisous(anisou / 10000)
if siguij is not None:
siguij.resize((acount, 6), refcheck=False)
atomgroup.setAnistds(siguij / 10000)
else:
radii.resize(acount, refcheck=False)
atomgroup.setRadii(radii)
charges.resize(acount, refcheck=False)
atomgroup.setCharges(charges)
nmodel += 1
n_atoms = acount
acount = 0
coordinates = np.zeros((n_atoms, 3), dtype=float)
if altloc and altloc_torf:
_evalAltlocs(atomgroup, altloc, chainids, resnums,
resnames, atomnames)
altloc = defaultdict(list)
if END:
break
elif isPDB and startswith == 'ANISOU':
if anisou is None:
anisou = True
anisou = np.zeros((alength, 6), dtype=float)
alt = line[16]
if alt not in which_altlocs and which_altlocs != 'all':
altloc[alt].append((line, i))
i += 1
continue
try:
index = acount - 1
anisou[index, 0] = line[28:35]
anisou[index, 1] = line[35:42]
anisou[index, 2] = line[43:49]
anisou[index, 3] = line[49:56]
anisou[index, 4] = line[56:63]
anisou[index, 5] = line[63:70]
except:
LOGGER.warn('failed to parse anisotropic temperature '
'factors at line {0}'.format(i))
elif isPDB and startswith =='SIGUIJ':
if siguij is None:
siguij = np.zeros((alength, 6),
dtype=ATOMIC_FIELDS['siguij'].dtype)
try:
index = acount - 1
siguij[index, 0] = line[28:35]
siguij[index, 1] = line[35:42]
siguij[index, 2] = line[43:49]
siguij[index, 3] = line[49:56]
siguij[index, 4] = line[56:63]
siguij[index, 5] = line[63:70]
except:
LOGGER.warn('failed to parse standard deviations of '
'anisotropic temperature factors at line {0}'.format(i))
elif startswith =='SIGATM':
pass
i += 1
if onlycoords:
if acount == atomgroup.numAtoms():
coordsets[nmodel] = coordinates
nmodel += 1
del coordinates
coordsets.resize((nmodel, atomgroup.numAtoms(), 3), refcheck=False)
if addcoords:
atomgroup.addCoordset(coordsets)
else:
atomgroup._setCoords(coordsets)
elif not END:
# this means last line was an ATOM line, so atomgroup is not decorated
coordinates.resize((acount, 3), refcheck=False)
if addcoords:
atomgroup.addCoordset(coordinates, anisous=anisou)
else:
atomgroup._setCoords(coordinates)
atomnames.resize(acount, refcheck=False)
resnames.resize(acount, refcheck=False)
resnums.resize(acount, refcheck=False)
chainids.resize(acount, refcheck=False)
hetero.resize(acount, refcheck=False)
termini.resize(acount, refcheck=False)
altlocs.resize(acount, refcheck=False)
icodes.resize(acount, refcheck=False)
serials.resize(acount, refcheck=False)
if not only_subset:
atomnames = np.char.strip(atomnames)
resnames = np.char.strip(resnames)
atomgroup.setNames(atomnames)
atomgroup.setResnames(resnames)
atomgroup.setResnums(resnums)
atomgroup.setChids(chainids)
atomgroup.setFlags('hetatm', hetero)
atomgroup.setFlags('pdbter', termini)
atomgroup.setFlags('selpdbter', termini)
atomgroup.setAltlocs(altlocs)
atomgroup.setIcodes(np.char.strip(icodes))
atomgroup.setSerials(serials)
if isPDB:
if anisou is not None:
anisou.resize((acount, 6), refcheck=False)
atomgroup.setAnisous(anisou / 10000)
if siguij is not None:
siguij.resize((acount, 6), refcheck=False)
atomgroup.setAnistds(siguij / 10000)
bfactors.resize(acount, refcheck=False)
occupancies.resize(acount, refcheck=False)
segnames.resize(acount, refcheck=False)
elements.resize(acount, refcheck=False)
atomgroup.setSegnames(np.char.strip(segnames))
atomgroup.setElements(np.char.strip(elements))
from prody.utilities.misctools import getMasses
atomgroup.setMasses(getMasses(np.char.strip(elements)))
atomgroup.setBetas(bfactors)
atomgroup.setOccupancies(occupancies)
else:
radii.resize(acount, refcheck=False)
atomgroup.setRadii(radii)
charges.resize(acount, refcheck=False)
atomgroup.setCharges(charges)
# rematch the bond-atom serial numbers to atom ids
serial_to_id = {int(serial): aidx for aidx, serial in enumerate(serials)}
for bond in bonds_serial:
try:
bonds.append([serial_to_id[bond[0]], serial_to_id[bond[1]]])
except KeyError:
LOGGER.warn("Bond connecting atom serial numbers {0}"
" and {1} contains atoms not included in the model"
.format(bond[0], bond[1])
)
if altloc and altloc_torf:
_evalAltlocs(atomgroup, altloc, chainids, resnums, resnames, atomnames)
return atomgroup
def _evalAltlocs(atomgroup, altloc, chainids, resnums, resnames, atomnames):
altloc_keys = list(altloc)
altloc_keys.sort()
indices = {}
for key in altloc_keys:
xyz = atomgroup.getCoords()
anisou = atomgroup.getAnisous()
success = 0
lines = altloc[key]
for line, i in lines:
aan = line[12:16].strip()
arn = line[17:21].strip()
ach = line[21]
ari = int(line[22:26].split()[0])
rn, ids, ans = indices.get((ach, ari), (None, None, None))
if ids is None:
ids = indices.get(ach, None)
if ids is None:
ids = (chainids == ach).nonzero()[0]
indices[ach] = ids
ids = ids[resnums[ids] == ari]
if len(ids) == 0:
LOGGER.warn("failed to parse altloc {0} at line {1}, "
"residue not present for altloc 'A'".format(
repr(key), i+1))
continue
rn = resnames[ids[0]]
ans = atomnames[ids]
indices[(ach, ari)] = (rn, ids, ans)
if rn != arn:
LOGGER.warn("failed to parse altloc {0} at line {1}, "
"residue name mismatch (expected {2}, "
"parsed {3})".format(repr(key), i+1, repr(rn),
repr(arn)))
continue
index = ids[(ans == aan).nonzero()[0]]
if len(index) != 1:
LOGGER.warn("failed to parse altloc {0} at line {1}, atom"
" {2} not found in the residue"
.format(repr(key), i+1, repr(aan)))
continue
if line.startswith('ATOM '):
try:
xyz[index[0], 0] = float(line[30:38])
xyz[index[0], 1] = float(line[38:46])
xyz[index[0], 2] = float(line[46:54])
except:
LOGGER.warn('failed to parse altloc {0} at line {1}, could'
' not read coordinates'.format(repr(key), i+1))
continue
success += 1
elif line.startswith('ANISOU'):
try:
anisou[index[0], 0] = line[28:35]
anisou[index[0], 1] = line[35:42]
anisou[index[0], 2] = line[43:49]
anisou[index[0], 3] = line[49:56]
anisou[index[0], 4] = line[56:63]
anisou[index[0], 5] = line[63:70]
except:
LOGGER.warn('failed to parse altloc {0} at line {1}, could'
' not read anisous'.format(repr(key), i+1))
success += 1
LOGGER.info('{0} out of {1} altloc {2} lines were parsed.'
.format(success, len(lines), repr(key)))
if success > 0:
LOGGER.info('Altloc {0} is appended as a coordinate set to '
'atomgroup {1}.'.format(repr(key), atomgroup.getTitle()))
atomgroup.addCoordset(xyz, label='altloc ' + key, anisous=anisou)
#HELIXLINE = ('HELIX %3d %3s %-3s %1s %4d%1s %-3s %1s %4d%1s%2d'
# ' %5d\n')
HELIXLINE = ('HELIX {serNum:3d} {helixID:>3s} '
'{initResName:<3s}{initChainID:>2s} {initSeqNum:4d}{initICode:1s} '
'{endResName:<3s}{endChainID:>2s} {endSeqNum:4d}{endICode:1s}'
'{helixClass:2d} {length:5d} \n')
SHEETLINE = ('SHEET {strand:3d} {sheetID:>3s}{numStrands:2d} '
'{initResName:3s}{initChainID:>2s}{initSeqNum:4d}{initICode:1s} '
'{endResName:3s}{endChainID:>2s}{endSeqNum:4d}{endICode:1s}{sense:2d} \n')
PDBLINE_LT100K = ('%-6s%5d %-4s%1s%-3s%2s%4d%1s '
'%8.3f%8.3f%8.3f%6.2f%6.2f '
'%4s%2s%2s\n')
# Residue number
PDBLINE_GE10K = ('%-6s%5d %-4s%1s%-3s%2s%4x%1s '
'%8.3f%8.3f%8.3f%6.2f%6.2f '
'%4s%2s%2s\n')
# Serial number
PDBLINE_GE100K = ('%-6s%5x %-4s%1s%-3s%2s%4d%1s '
'%8.3f%8.3f%8.3f%6.2f%6.2f '
'%4s%2s%2s\n')
# Both
PDBLINE_GE100K_GE10K = ('%-6s%5x %-4s%1s%-3s%2s%4x%1s '
'%8.3f%8.3f%8.3f%6.2f%6.2f '
'%4s%2s%2s\n')
# All cases
PDBLINE_H36 = ('%-6s%5s %-4s%1s%-3s%2s%4s%1s '
'%8.3f%8.3f%8.3f%6.2f%6.2f '
'%4s%2s%2s\n')
ANISOULINE_LT100K = ('%-6s%5d %-4s%1s%-3s%2s%4d%1s '
'%7d%7d%7d%7d%7d%7d '
'%4s%2s%2s\n')
# Residue number
ANISOULINE_GE10K = ('%-6s%5d %-4s%1s%-3s%2s%4x%1s '
'%7d%7d%7d%7d%7d%7d '
'%4s%2s%2s\n')
# Serial number
ANISOULINE_GE100K = ('%-6s%5x %-4s%1s%-3s%2s%4d%1s '
'%7d%7d%7d%7d%7d%7d '
'%4s%2s%2s\n')
# Both
ANISOULINE_GE100K_GE10K = ('%-6s%5x %-4s%1s%-3s%2s%4x%1s '
'%7d%7d%7d%7d%7d%7d '
'%4s%2s%2s\n')
# All cases
ANISOULINE_H36 = ('%-6s%5s %-4s%1s%-3s%2s%4s%1s '
'%7d%7d%7d%7d%7d%7d '
'%4s%2s%2s\n')
_writePDBdoc = """
:arg atoms: an object with atom and coordinate data
:arg csets: coordinate set indices, default is all coordinate sets
:arg beta: a list or array of number to be outputted in beta column
:arg occupancy: a list or array of number to be outputted in occupancy
column
:arg hybrid36: whether to use hybrid36 format for atoms with serial
greater than 99999. Hexadecimal is used otherwise.
Default is False
:type hybrid36: bool
"""
[docs]def writeChainsList(chains, filename):
"""
Write a text file containing a list of chains that can be parsed.
:arg chains: a list of :class:`.Chain` objects
:type chains: list
:arg filename: the name of the file to be written
:type filename: str
"""
fo = open(filename,'w')
for chain in chains:
fo.write(chain.getTitle() + ' ' + chain.getChid() + '\n')
fo.close()
return
[docs]def parseChainsList(filename):
"""
Parse a set of PDBs and extract chains based on a list in a text file.
:arg filename: the name of the file to be read
:type filename: str
Returns: lists containing an :class:'.AtomGroup' for each PDB,
the headers for those PDBs, and the requested :class:`.Chain` objects
"""
fi = open(filename,'r')
lines = fi.readlines()
fi.close()
pdb_ids = []
ags = []
headers = []
chains = []
num_lines = len(lines)
LOGGER.progress('Starting', num_lines, '_prody_parseChainsList')
for i, line in enumerate(lines):
LOGGER.update(i, 'Parsing lines...', label='_prody_parseChainsList')
pdb_id = line.split()[0].split('_')[0]
if not pdb_id in pdb_ids:
pdb_ids.append(pdb_id)
ag, header = parsePDB(pdb_id, compressed=False, \
subset=line.split()[0].split('_')[1], header=True)
ags.append(ag)
headers.append(header)
chains.append(ag.getHierView()[line.strip().split()[1]])
LOGGER.finish()
LOGGER.info('{0} PDBs have been parsed and {1} chains have been extracted. \
'.format(len(ags),len(chains)))
return ags, headers, chains
[docs]def writePDBStream(stream, atoms, csets=None, **kwargs):
"""Write *atoms* in PDB format to a *stream*.
:arg stream: anything that implements a :meth:`write` method (e.g. file,
buffer, stdout)
:arg renumber: whether to renumber atoms with serial indices
Default is **True**
:type renumber: bool
:arg hybrid36: whether to use hybrid36 format for atom residue numbers
Default is **False**, which means using hexadecimal instead.
NB: ChimeraX seems to prefer hybrid36 and may have problems with hexadecimal.
:type hybrid36: bool
:arg full_ter: whether to write full TER lines with atoms info
Default is **True**
:type full_ter: bool
:arg write_remarks: whether to write REMARK lines
Default is **True**
:type write_remarks: bool
"""
renumber = kwargs.get('renumber', True)
full_ter = kwargs.get('full_ter', True)
write_remarks = kwargs.get('write_remarks', True)
remark = str(atoms)
try:
coordsets = atoms._getCoordsets(csets)
except AttributeError:
try:
coordsets = atoms._getCoords()
except AttributeError:
raise TypeError('atoms must be an object with coordinate sets')
if coordsets is not None:
coordsets = [coordsets]
else:
if coordsets.ndim == 2:
coordsets = [coordsets]
if coordsets is None:
raise ValueError('atoms does not have any coordinate sets')
had_atoms = False
try:
acsi = atoms.getACSIndex()
had_atoms = True
except AttributeError:
try:
atoms = atoms.getAtoms()
except AttributeError:
raise TypeError('atoms must be an Atomic instance or an object '
'with `getAtoms` method')
else:
if atoms is None:
raise ValueError('atoms is not associated with an Atomic '
'instance')
try:
acsi = atoms.getACSIndex()
except AttributeError:
raise TypeError('atoms does not have a valid type')
try:
atoms.getIndex()
except AttributeError:
pass
else:
atoms = atoms.select('all')
n_atoms = atoms.numAtoms()
hybrid36 = kwargs.get('hybrid36', False)
occupancy = kwargs.get('occupancy')
if occupancy is None:
occupancies = atoms._getOccupancies()
if occupancies is None:
occupancies = np.zeros(n_atoms, float)
else:
occupancies = np.array(occupancy)
if len(occupancies) != n_atoms:
raise ValueError('len(occupancy) must be equal to number of atoms')
beta = kwargs.get('beta')
if beta is None:
bfactors = atoms._getBetas()
if bfactors is None:
bfactors = np.zeros(n_atoms, float)
else:
bfactors = np.array(beta)
if len(bfactors) != n_atoms:
raise ValueError('len(beta) must be equal to number of atoms')
atomnames = atoms.getNames()
if atomnames is None:
raise ValueError('atom names are not set')
for i, an in enumerate(atomnames):
if len(an) < 4:
atomnames[i] = ' ' + an
s_or_u = np.array(['a']).dtype.char
altlocs = atoms._getAltlocs()
if altlocs is None:
altlocs = np.zeros(n_atoms, s_or_u + '1')
resnames = atoms._getResnames()
if resnames is None:
resnames = ['UNK'] * n_atoms
chainids = atoms._getChids()
if chainids is None:
chainids = np.zeros(n_atoms, s_or_u + '1')
resnums = atoms._getResnums()
if resnums is None:
resnums = np.ones(n_atoms, int)
serials = atoms._getSerials()
if serials is None or renumber:
serials = np.arange(n_atoms, dtype=int) + 1
icodes = atoms._getIcodes()
if icodes is None:
icodes = np.zeros(n_atoms, s_or_u + '1')
hetero = ['ATOM'] * n_atoms
heteroflags = atoms._getFlags('hetatm')
if heteroflags is None:
heteroflags = atoms._getFlags('hetero')
if heteroflags is not None:
hetero = np.array(hetero, s_or_u + '6')
hetero[heteroflags] = 'HETATM'
elements = atoms._getElements()
if elements is None:
elements = np.zeros(n_atoms, s_or_u + '1')
else:
elements = np.char.rjust(elements, 2)
segments = atoms._getSegnames()
if segments is None:
segments = np.zeros(n_atoms, s_or_u + '6')
charges = atoms._getCharges()
charges2 = np.empty(n_atoms, s_or_u + '2')
if charges is not None:
for i, charge in enumerate(charges):
charges2[i] = str(abs(int(charge)))
if np.sign(charge) == -1:
charges2[i] += '-'
else:
charges2[i] += '+'
if charges2[i] == '0+':
charges2[i] = ' '
if write_remarks:
# write remarks
stream.write('REMARK {0}\n'.format(remark))
# write secondary structures (if any)
secondary = kwargs.get('secondary', True)
secstrs = atoms._getSecstrs()
if secstrs is not None and secondary:
secindices = atoms._getSecindices()
secclasses = atoms._getSecclasses()
secids = atoms._getSecids()
# write helices
for i in range(1,max(secindices)+1):
torf = np.logical_and(isHelix(secstrs), secindices==i)
if torf.any():
helix_resnums = resnums[torf]
helix_chainids = chainids[torf]
helix_resnames = resnames[torf]
helix_secclasses = secclasses[torf]
helix_secids = secids[torf]
helix_icodes = icodes[torf]
L = helix_resnums[-1] - helix_resnums[0] + 1
stream.write(HELIXLINE.format(serNum=i, helixID=helix_secids[0],
initResName=helix_resnames[0][:3], initChainID=helix_chainids[0],
initSeqNum=helix_resnums[0], initICode=helix_icodes[0],
endResName=helix_resnames[-1][:3], endChainID=helix_chainids[-1],
endSeqNum=helix_resnums[-1], endICode=helix_icodes[-1],
helixClass=helix_secclasses[0], length=L))
# write strands
torf_all_sheets = isSheet(secstrs)
sheet_secids = secids[torf_all_sheets]
unique_sheet_secids, indices = np.unique(sheet_secids, return_index=True)
unique_sheet_secids = unique_sheet_secids[indices.argsort()]
for sheet_id in unique_sheet_secids:
torf_strands_in_sheet = np.logical_and(torf_all_sheets, secids==sheet_id)
strand_indices = secindices[torf_strands_in_sheet]
numStrands = len(np.unique(strand_indices))
for i in np.unique(strand_indices):
torf_strand = np.logical_and(torf_strands_in_sheet, secindices==i)
strand_resnums = resnums[torf_strand]
strand_chainids = chainids[torf_strand]
strand_resnames = resnames[torf_strand]
strand_secclasses = secclasses[torf_strand]
strand_icodes = icodes[torf_strand]
stream.write(SHEETLINE.format(strand=i, sheetID=sheet_id, numStrands=numStrands,
initResName=strand_resnames[0][:3], initChainID=strand_chainids[0],
initSeqNum=strand_resnums[0], initICode=strand_icodes[0],
endResName=strand_resnames[-1][:3], endChainID=strand_chainids[-1],
endSeqNum=strand_resnums[-1], endICode=strand_icodes[-1],
sense=strand_secclasses[0]))
pass
# write atoms
multi = len(coordsets) > 1
write = stream.write
num_ter_lines = 0
for m, coords in enumerate(coordsets):
if had_atoms:
atoms.setACSIndex(m)
anisous = atoms._getAnisous()
if anisous is not None:
anisous = np.array(anisous * 10000, dtype=int)
if multi:
write('MODEL{0:9d}\n'.format(m+1))
if not hybrid36:
# We need to check whether serial and residue numbers become hexadecimal
reached_max_n_atom = False
reached_max_n_res = False
pdbline = PDBLINE_LT100K
anisouline = ANISOULINE_LT100K
else:
warned_hybrid36 = False
warned_5_digit = False
warned_long_resname = False
for i, xyz in enumerate(coords):
serial = serials[i] + num_ter_lines
if hybrid36:
pdbline = PDBLINE_H36
anisouline = ANISOULINE_H36
if not warned_hybrid36:
LOGGER.warn('hybrid36 format is being used')
warned_hybrid36 = True
else:
if not (reached_max_n_atom or reached_max_n_res) and (i == MAX_N_ATOM or serial > MAX_N_ATOM):
reached_max_n_atom = True
pdbline = PDBLINE_GE100K
anisouline = ANISOULINE_GE100K
LOGGER.warn('Indices are exceeding 99999 and hexadecimal format is being used for indices')
elif not (reached_max_n_atom or reached_max_n_res) and resnums[i] > MAX_N_RES:
reached_max_n_res = True
pdbline = PDBLINE_GE10K
anisouline = ANISOULINE_GE10K
LOGGER.warn('Resnums are exceeding 9999 and hexadecimal format is being used for resnums')
elif reached_max_n_atom and not reached_max_n_res and resnums[i] > MAX_N_RES:
reached_max_n_res = True
pdbline = PDBLINE_GE100K_GE10K
anisouline = ANISOULINE_GE100K_GE10K
LOGGER.warn('Resnums are exceeding 9999 and hexadecimal format is being used for indices and resnums')
elif reached_max_n_res and not reached_max_n_atom and (i == MAX_N_ATOM or serial > MAX_N_ATOM):
reached_max_n_atom = True
pdbline = PDBLINE_GE100K_GE10K
anisouline = ANISOULINE_GE100K_GE10K
LOGGER.warn('Indices are exceeding 99999 and hexadecimal format is being used for indices and resnums')
if hybrid36:
serial = decToHybrid36(serial)
resnum = decToHybrid36(resnums[i], resnum=True)
else:
resnum = resnums[i]
resname = resnames[i]
if len(resnames[i]) > 3:
resname = resname[:3]
if not warned_long_resname:
LOGGER.warn('Resname {0} too long, cutting resname to 3 characters as {1}'.format(
resnames[i], resname
))
warned_long_resname = True
if pdbline == PDBLINE_LT100K or hybrid36:
if len(str(resnum)) == 5:
if icodes[i] == '':
icodes[i] = str(resnum)[4]
if not warned_5_digit:
LOGGER.warn('Storing 5-digit resnums using insertion codes')
warned_5_digit = True
else:
LOGGER.warn('Truncating 5-digit resnum as insertion code is busy.')
resnum = int(str(resnum)[:4])
elif len(str(resnum)) > 5:
if not warned_5_digit:
LOGGER.warn('Truncating {0}-digit resnum as too long to be '
'supported by insertion code.'.format(len(str(resnum))))
warned_5_digit = True
resnum = int(str(resnum)[:4])
else:
final_resnum = '%4x' % int(resnum)
if len(str(final_resnum)) == 5:
if icodes[i] == '':
icodes[i] = str(final_resnum)[4]
if not warned_5_digit:
LOGGER.warn('Storing 5-digit hex resnums using insertion codes')
warned_5_digit = True
else:
LOGGER.warn('Truncating 5-digit hex resnum as insertion code is busy.')
resnum = int(str(final_resnum)[:4], 16)
elif len(str(final_resnum)) > 5:
if not warned_5_digit:
LOGGER.warn('Truncating {0}-digit hex resnum ({1}) as too long to be '
'supported by insertion code.'.format(len(str(final_resnum)),
final_resnum))
warned_5_digit = True
resnum = int(str(final_resnum)[:4], 16)
write(pdbline % (hetero[i], serial,
atomnames[i], altlocs[i],
resname, chainids[i][:1], resnum,
icodes[i],
xyz[0], xyz[1], xyz[2],
occupancies[i], bfactors[i],
segments[i], elements[i], charges2[i]))
if anisous is not None:
anisou = anisous[i]
write(anisouline % ("ANISOU", serial,
atomnames[i], altlocs[i],
resname, chainids[i], resnum,
icodes[i],
anisou[0], anisou[1], anisou[2],
anisou[3], anisou[4], anisou[5],
segments[i], elements[i], charges2[i]))
if isinstance(atoms, AtomGroup):
if resnames[i] in AAMAP and atoms._getFlags('pdbter') is not None and atoms.getFlags('pdbter')[i]:
if hybrid36:
serial = decToHybrid36(hybrid36ToDec(serial) + 1)
else:
serial += 1
if full_ter:
false_pdbline = pdbline % ("TER ", serial,
"", "",
resname, chainids[i], resnum,
icodes[i],
xyz[0], xyz[1], xyz[2],
occupancies[i], bfactors[i],
segments[i], elements[i], charges2[i])
else:
false_pdbline = "TER" + " "*23
write(false_pdbline[:26] + " "*54 + '\n')
num_ter_lines += 1
else:
if resnames[i] in AAMAP and atoms._getFlags('selpdbter') is not None and atoms.getFlags('selpdbter')[i]:
if hybrid36:
serial = decToHybrid36(hybrid36ToDec(serial) + 1)
else:
serial += 1
if full_ter:
false_pdbline = pdbline % ("TER ", serial,
"", "",
resname, chainids[i], resnum,
icodes[i],
xyz[0], xyz[1], xyz[2],
occupancies[i], bfactors[i],
segments[i], elements[i], charges2[i])
else:
false_pdbline = "TER" + " "*23
write(false_pdbline[:26] + " "*54 + '\n')
num_ter_lines += 1
if multi:
write('ENDMDL' + " "*74 + '\n')
altlocs = np.zeros(n_atoms, s_or_u + '1')
write('END ' + " "*74 + '\n')
if had_atoms:
atoms.setACSIndex(acsi)
writePDBStream.__doc__ += _writePDBdoc
[docs]def writePDB(filename, atoms, csets=None, autoext=True, **kwargs):
"""Write *atoms* in PDB format to a file with name *filename* and return
*filename*. If *filename* ends with :file:`.gz`, a compressed file will
be written.
:arg renumber: whether to renumber atoms with serial indices
Default is **True**
:type renumber: bool
:arg hybrid36: whether to use hybrid36 format for atom residue numbers
Default is **False**, which means using hexadecimal instead.
NB: ChimeraX seems to prefer hybrid36 and may have problems with hexadecimal.
:type hybrid36: bool
Please note that resnames longer than 3 characters will be trimmed.
"""
if not (filename.lower().endswith('.pdb') or filename.lower().endswith('.pdb.gz') or
filename.lower().endswith('.ent') or filename.lower().endswith('.ent.gz')) and autoext:
filename += '.pdb'
out = openFile(filename, 'wt')
writePDBStream(out, atoms, csets, **kwargs)
out.close()
return filename
writePDB.__doc__ += _writePDBdoc + """
:arg autoext: when not present, append extension :file:`.pdb` to *filename*
"""
[docs]def writePQRStream(stream, atoms, **kwargs):
if isinstance(atoms, Atom):
atoms = Selection(atoms.getAtomGroup(), [atoms.getIndex()],
atoms.getACSIndex(),
'index ' + str(atoms.getIndex()))
n_atoms = atoms.numAtoms()
atomnames = atoms.getNames()
if atomnames is None:
raise RuntimeError('atom names are not set')
for i, an in enumerate(atomnames):
lenan = len(an)
if lenan < 4:
atomnames[i] = ' ' + an
elif lenan > 4:
atomnames[i] = an[:4]
s_or_u = np.array(['a']).dtype.char
write = stream.write
calphas = atoms.ca
if calphas is not None:
ssa = calphas.getSecstrs()
else:
ssa = None
helix = []
sheet = []
if ssa is not None:
ss_prev = ssa[0]
ss_start = 0
ss_end = 1
for i, ss in enumerate(ssa):
if ss != ss_prev:
# store prev secstr and prepare for next
ss_end = i-1
init = calphas[ss_start]
end = calphas[ss_end]
length = ss_end - ss_start + 1
entries = [init.getSecindex(), init.getSecid(),
init.getResname(), init.getChid(),
init.getResnum(), init.getIcode(),
end.getResname(), end.getChid(),
end.getResnum(), end.getIcode(),
init.getSecclass()]
if ssa[ss_end] == 'H':
helix.append(["HELIX "] + entries +
['', length])
elif ssa[ss_end] == 'E':
sheet.append(["SHEET "] + entries)
ss_start = i
ss_prev = ss
format_helix = ('{0:6s} {1:3d} {2:3s} ' +
'{3:3s} {4:1s} {5:4d}{6:1s} ' +
'{7:3s} {8:1s} {9:4d}{10:1s} ' +
'{11:2d} {12:30s} {13:5d}\n').format
for line in helix:
write(format_helix(*line))
sheet_prev = 'A'
num_strands_list = []
for i, item1 in enumerate(sheet):
if item1[2] != sheet_prev:
num_strands = sheet[i-1][1]
num_strands_list.append(num_strands)
sheet_prev = item1[2]
for item2 in sheet[i-num_strands:i]:
item2.append(num_strands)
num_strands = len(num_strands_list)
for item2 in sheet[i-num_strands+1:]:
item2.append(num_strands)
format_sheet = ('{0:6s} {1:3d} {2:3s}{12:2d} ' +
'{3:3s} {4:1s}{5:4d}{6:1s}' +
'{7:3s} {8:1s}{9:4d}{10:1s}' +
'{11:2d}\n').format
for i, line in enumerate(sheet):
write(format_sheet(*line))
resnames = atoms._getResnames()
if resnames is None:
resnames = ['UNK'] * n_atoms
resnums = atoms._getResnums()
if resnums is None:
resnums = np.ones(n_atoms, int)
chainids = atoms._getChids()
if chainids is None:
chainids = np.zeros(n_atoms, s_or_u + '1')
charges = atoms._getCharges()
if charges is None:
charges = np.zeros(n_atoms, float)
radii = atoms._getRadii()
if radii is None:
radii = np.zeros(n_atoms, float)
icodes = atoms._getIcodes()
if icodes is None:
icodes = np.zeros(n_atoms, s_or_u + '1')
hetero = ['ATOM'] * n_atoms
heteroflags = atoms._getFlags('hetatm')
if heteroflags is None:
heteroflags = atoms._getFlags('hetero')
if heteroflags is not None:
hetero = np.array(hetero, s_or_u + '6')
hetero[heteroflags] = 'HETATM'
altlocs = atoms._getAltlocs()
if altlocs is None:
altlocs = np.zeros(n_atoms, s_or_u + '1')
format = ('{0:6s} {1:5d} {2:4s} {3:1s}' +
'{4:4s} {5:1s} {6:4d} {7:1s} ' +
'{8:8.3f} {9:8.3f} {10:8.3f}' +
'{11:8.4f} {12:7.4f}\n').format
coords = atoms._getCoords()
for i, xyz in enumerate(coords):
write(format(hetero[i], i+1, atomnames[i], altlocs[i],
resnames[i], chainids[i], int(resnums[i]),
icodes[i], xyz[0], xyz[1], xyz[2], charges[i], radii[i]))
[docs]def writePQR(filename, atoms, **kwargs):
"""Write *atoms* in PQR format to a file with name *filename*. Only
current coordinate set is written. Returns *filename* upon success. If
*filename* ends with :file:`.gz`, a compressed file will be written."""
stream = openFile(filename, 'w')
writePQRStream(stream, atoms, **kwargs)
stream.close()
return filename