Import all the Python modules that we are going to use throughout the script.
import os
import sys
if (sys.platform == 'win32'):
path_to_self = None
try:
path_to_pyflare = os.path.dirname(sys.executable)
except:
pass
if (path_to_pyflare and ('PATH' in os.environ)
and (not path_to_pyflare in os.environ['PATH'].split(os.pathsep))):
os.environ['PATH'] = path_to_pyflare + os.pathsep + os.environ['PATH']
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import rdmolops, Descriptors, \
rdDepictor, rdMolDescriptors, rdFMCS, Lipinski, Crippen
from rdkit.Chem.rdMolDescriptors import Properties
from rdkit.Chem.Draw import IPythonConsole, MolsToGridImage
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from cresset import flare
import urllib.request
from qtconsole.qt import QtGui
import warnings
warnings.filterwarnings('ignore')
from chembl_webresource_client.new_client import new_client;
warnings.filterwarnings('default')
Define a function to download from ChEMBL all ligands with a Ki value for a certain target using chembl_webresource_client
.
def download_chembl_ligands_ki(target_name):
target = new_client.target
activity = new_client.activity
res = target.filter(target_synonym__icontains=target_name)[0]
return activity.filter(
target_chembl_id=res['target_chembl_id'], assay_type='B').filter(
standard_type="Ki")
Define a function to download a PDB structure with ID code
from the Protein Data Bank into the Flare project project
.
def download_from_pdb(project, code):
url = 'http://files.rcsb.org/view/{0:s}.pdb'.format(code)
response = urllib.request.urlopen(url)
text = response.read().decode('utf-8')
project.proteins.extend(flare.read_string(text, 'pdb'))
Define a function to count the number of Lipinski's rule-of-5 violations in the RDKit molecule mol
.
def num_ro5_violations(mol):
n = 0
if (Lipinski.NumHDonors(mol) > 5):
n += 1
if (Lipinski.NumHAcceptors(mol) > 10):
n += 1
if (not (Descriptors.MolWt(mol) < 500)):
n += 1
if (not (Crippen.MolLogP(mol) < 5)):
n += 1
return n
Define a function which returns an RDKit Chem.Mol
if the latter passes our filters, None
if it doesn't.
def get_rdkit_mol_from_chembl_mol(chembl_mol):
try:
float(chembl_mol['pchembl_value'])
except:
return None
if (chembl_mol['relation'] != '='):
return None
mol = Chem.MolFromSmiles(chembl_mol['canonical_smiles'])
if (not mol):
return None
mols = list(rdmolops.GetMolFrags(mol, asMols=True))
if (not mols):
return None
mol = max(mols, key = lambda m: m.GetNumAtoms())
molWt = Descriptors.MolWt(mol)
if (num_ro5_violations(mol) > 0 or molWt < 300 or molWt > 400):
return None
return mol
Define a function to take a screenshot of the Flare window.
def take_screenshot(fmw, filename):
if (not fmw):
return
dw = QtGui.QApplication.desktop()
rect = QtGui.QApplication.desktop().frameGeometry()
rect = fmw.widget().geometry()
pixmap = QtGui.QPixmap.grabWindow(dw.winId())
x_ratio = pixmap.size().width() / dw.size().width()
y_ratio = pixmap.size().height() / dw.size().height()
rect.setLeft(rect.left() * x_ratio)
rect.setRight(rect.right() * x_ratio)
rect.setTop(rect.top() * y_ratio)
rect.setBottom(rect.bottom() * y_ratio)
pixmap = pixmap.copy(rect)
try:
filepath = flare_ipynb_dir()
except:
filepath = os.getcwd()
pixmap.save(os.path.join(filepath, filename))
Download all ChEMBL acetylcholinesterase inhibitors endowed with a Ki value.
chembl_mols = download_chembl_ligands_ki('acetylcholinesterase')
Let's see how many acetylcholinesterase inhibitors were retrieved from ChEMBL.
len(chembl_mols)
Create a new project.
If we are running in Flare GUI, swap the Project
in main_window
with the one we have just created.
fmw = flare.main_window()
p = flare.Project()
if (fmw):
fmw.project = p
Initialise smiles_set
to rule out duplicates.
smiles_set = set()
Get RDKit ComputedProperties
names.
props = Properties()
n_vec = props.GetPropertyNames()
Loop over all chembl_mols
retrieved from ChEMBL.
Filter out all mols that do not satisfy our criteria.
If any mol is a duplicate of a molecule we have seen before, skip it.
Set the molecule name to the molecule_chembl_id
field.
Assign stereochemistry (R
/S
, E
/Z
).
Compute a few RDKit properties.
Skip the mol if it has >5 rotatable bonds.
Set RDKit properties on the mol prefixed with the RDK
tag.
Copy across selected ChEMBL properties.
Compute 2D coordinates for the mol.
Append it to Flare's ligand table.
Append it to the rdk_mols
list.
rdk_mols = []
for chembl_mol in chembl_mols:
mol = get_rdkit_mol_from_chembl_mol(chembl_mol)
if (mol is None):
continue
chembl_properties_to_retain = ['document_chembl_id',
'document_journal', 'document_year', 'ligand_efficiency',
'molecule_chembl_id', 'molecule_pref_name',
'parent_molecule_chembl_id', 'pchembl_value']
smiles = Chem.MolToSmiles(mol)
if (smiles in smiles_set):
continue
smiles_set.add(smiles)
mol.SetProp('_Name', chembl_mol['molecule_chembl_id'])
rdmolops.AssignStereochemistry(mol)
p_vec = props.ComputeProperties(mol)
too_flexible = False
for name, value in zip(n_vec, p_vec):
if (name == 'NumRotatableBonds' and value > 5):
too_flexible = True
break
mol.SetProp('RDK{0:s}'.format(name), '{0:.3f}'.format(value))
if (too_flexible):
continue
for chembl_prop_name in chembl_properties_to_retain:
mol.SetProp(chembl_prop_name.upper(),
str(chembl_mol[chembl_prop_name]))
rdDepictor.Compute2DCoords(mol)
p.ligands.append(mol)
rdk_mols.append(mol)
Let's see how the first ligand looks like.
rdk_mols[0]
Let's see how the first 9 ligands look like.
MolsToGridImage(rdk_mols[:9])
Loop over all ligands in Flare's ligand table.
Generate a 3D structure for each ligand and quickly optimise it.
Add field points.
Set field points to be visible in the GUI.
for l in p.ligands:
l.minimize()
l.add_fields()
l.style.field_points_visible = True
Select the third ligand, center and zoom on it.
if (fmw):
fmw.selected_ligands = [p.ligands[2]]
fmw.camera.center_on(fmw.selected_ligands)
fmw.camera.fit_to_window(fmw.selected_ligands)
This is how the Flare window looks like now: all ligands loaded, converted to 3D and endowed with field points.
take_screenshot(fmw, 'Figure2.png')
Let's define a couple of boilerplate matrix classes.
class Matrix:
def __init__(self, rowNum, colNum):
self.rowNum = rowNum
self.colNum = colNum
self.dataSize = rowNum * colNum
self.data = [0] * self.dataSize
def index(self, y, x):
if ((x >= self.colNum) or (y >= self.rowNum)):
raise IndexError('index out of bounds')
i = y * self.colNum + x
return i
def get_cell(self, y, x):
return self.data[self.index(y, x)]
def set_cell(self, y, x, v):
self.data[self.index(y, x)] = v
class TMatrix(Matrix):
def __init__(self, rowNum):
self.size = rowNum
self.dataSize = int((rowNum + 1) * rowNum / 2)
self.data = [0] * self.dataSize
def index(self, y, x):
if ((x >= self.size) or (y >= self.size)):
raise IndexError('index out of bounds')
if (x < y):
x, y = y, x
i = y * (self.size - 1) + int(y * (1 - y) / 2) + x
return i
Initialize an upper-triangular matrix with the number of molecules that we have.
sim_matrix = TMatrix(len(rdk_mols))
Compute Morgan fingerprints with radius 2 for each molecule.
fps = [rdMolDescriptors.GetMorganFingerprint(m, 2) for m in rdk_mols]
Fill the all-against-all upper-triangular similarity matrix with Tanimoto similarity values.
for y in range(len(rdk_mols)):
for x in range(y, len(rdk_mols)):
sim_matrix.set_cell(y, x,
DataStructs.TanimotoSimilarity(fps[y], fps[x]))
This is the 0-based index of the selected ligand.
sel_ligand_idx = 0
if (fmw):
sel_ligand_idx = p.ligands.index(fmw.selected_ligands[0])
sel_ligand_idx
Sort ligands by decreasing similarity to the selected ligand.
sim = sorted([(sim_matrix.get_cell(sel_ligand_idx, i), i) for i in range(
len(fps)) if i != sel_ligand_idx], reverse=True)
Check similarity and indices of the top-3 most similar ligands.
sim[:3]
Select the 3 most similar ligands to the selected one.
if (fmw):
fmw.selected_ligands = fmw.selected_ligands + [
p.ligands[i] for _, i in sim[:3]]
Enable grid view mode.
if (fmw):
fmw.camera.grid_enabled = True
Get selected ligands as RDKit mols.
sel_ligands = [p.ligands[0]]
if (fmw):
sel_ligands = fmw.selected_ligands
sel_rdk_mols = [Chem.RemoveHs(l.to_rdmol()) for l in sel_ligands]
Compute the MCS.
mcs_result = None
try:
mcs_result = rdFMCS.FindMCS(sel_rdk_mols, timeout = 10)
except:
pass
Create a query mol corresponding to the MCS.
mcs_query = None
if (mcs_result):
mcs_query = Chem.MolFromSmarts(mcs_result.smartsString)
Pick all atoms in the Flare viewport corresponding to the MCS.
picked_atoms = []
if (mcs_query):
for i, mol in enumerate(sel_rdk_mols):
match = mol.GetSubstructMatch(mcs_query)
picked_atoms += [sel_ligands[i].atoms[m] for m in match]
if (fmw):
fmw.picked_atoms = picked_atoms
This is how the Flare window looks like now: four ligands displayed in grid mode, their MCS highlighted by a glowing halo.
take_screenshot(fmw, 'Figure3.png')
Disable grid view mode.
if (fmw):
fmw.camera.grid_enabled = False
Unpick atoms and unselect ligands.
if (fmw):
fmw.picked_atoms = []
fmw.selected_ligands = []
Download 1EVE
from the Protein Data Bank.
download_from_pdb(p, '1eve')
Carry out protein preparation on all proteins (there is only one, anyway) capping chains with ACE
and NME
groups if there are any missing residues.
Wait for protein preparation to be finished.
prep = flare.ProteinPrep()
prep.proteins = p.proteins
prep.cap_chains = True
prep.start()
prep.wait()
Extract all sequences which are of Ligand
type.
ligand_sequences = [s for s in p.proteins[
0].sequences if s.type == flare.Sequence.Type.Ligand]
Check the type of ligand_sequences
.
ligand_sequences
Make sure we have a single ligand sequence.
assert(len(ligand_sequences) == 1)
Extract the only residue in the ligand sequence, which corresponds to the ligand itself.
co_xx_ligand_residue = ligand_sequences[0]
Prepend co_xx_ligand_residue
to the ligand list, such that the co-crystallised ligand will become the first ligand in Flare's ligand table.
p.ligands.insert(0, co_xx_ligand_residue)
Check how many ligands we have.
len(p.ligands)
Dock all ligands but the co-crystallized one to the protein, setting the docking grid in correspondence of the co-crystallized ligand.
dock = flare.Docking()
dock.ligands = p.ligands[1:]
dock.protein = p.proteins[0]
dock.system.grid = co_xx_ligand_residue.atoms
dock.start()
dock.wait()
This is how the Flare window looks like now: all ligands have poses through which it is possible to browse.
take_screenshot(fmw, 'Figure4.png')
Let's do a scatter plot using data from the ligand table.
try:
%matplotlib inline
except:
pass
Plot RDKCrippenClogP
against PCHEMBL_VALUE
(i.e, the pKi)
x = []
y = []
for l in p.ligands:
try:
pchembl_value = float(l.properties['PCHEMBL_VALUE'].value)
except:
continue
try:
alogp = float(l.properties['RDKCrippenClogP'].value)
except:
continue
y.append(pchembl_value)
x.append(alogp)
Add a linear regression trendline to the scatterplot.
z = np.polyfit(x, y, 1)
pz = np.poly1d(z)
plt.scatter(x, y); plt.plot(x, pz(x), 'r--');
Compute a few statistics about this regression.
slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
Check the r2
r_value**2
There is no significant correlation between Crippen ClogP and the pKi.
This notebook shows how to deal with a ChEMBL dataset in a completely automated fashion, including taking screenshots and doing plots for reporting.