Import all the Python modules that we are going to use throughout the script.

import os
import sys
if (sys.platform == 'win32'):
    path_to_self = None
    try:
        path_to_pyflare = os.path.dirname(sys.executable)
    except:
        pass
    if (path_to_pyflare and ('PATH' in os.environ)
        and (not path_to_pyflare in os.environ['PATH'].split(os.pathsep))):
        os.environ['PATH'] = path_to_pyflare + os.pathsep + os.environ['PATH']
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import rdmolops, Descriptors, \
    rdDepictor, rdMolDescriptors, rdFMCS, Lipinski, Crippen
from rdkit.Chem.rdMolDescriptors import Properties
from rdkit.Chem.Draw import IPythonConsole, MolsToGridImage
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from cresset import flare
import urllib.request
from qtconsole.qt import QtGui
import warnings
warnings.filterwarnings('ignore')
from chembl_webresource_client.new_client import new_client;
warnings.filterwarnings('default')

Define a function to download from ChEMBL all ligands with a Ki value for a certain target using chembl_webresource_client.

def download_chembl_ligands_ki(target_name):
    target = new_client.target
    activity = new_client.activity
    res = target.filter(target_synonym__icontains=target_name)[0]
    return activity.filter(
        target_chembl_id=res['target_chembl_id'], assay_type='B').filter(
        standard_type="Ki")

Define a function to download a PDB structure with ID code from the Protein Data Bank into the Flare project project.

def download_from_pdb(project, code):
    url = 'http://files.rcsb.org/view/{0:s}.pdb'.format(code)
    response = urllib.request.urlopen(url)
    text = response.read().decode('utf-8')
    project.proteins.extend(flare.read_string(text, 'pdb'))

Define a function to count the number of Lipinski's rule-of-5 violations in the RDKit molecule mol.

def num_ro5_violations(mol):
    n = 0
    if (Lipinski.NumHDonors(mol) > 5):
        n += 1
    if (Lipinski.NumHAcceptors(mol) > 10):
        n += 1
    if (not (Descriptors.MolWt(mol) < 500)):
        n += 1
    if (not (Crippen.MolLogP(mol) < 5)):
        n += 1
    return n

Define a function which returns an RDKit Chem.Mol if the latter passes our filters, None if it doesn't.

def get_rdkit_mol_from_chembl_mol(chembl_mol):
    try:
        float(chembl_mol['pchembl_value'])
    except:
        return None
    if (chembl_mol['relation'] != '='):
        return None
    mol = Chem.MolFromSmiles(chembl_mol['canonical_smiles'])
    if (not mol):
        return None
    mols = list(rdmolops.GetMolFrags(mol, asMols=True))
    if (not mols):
        return None
    mol = max(mols, key = lambda m: m.GetNumAtoms())
    molWt = Descriptors.MolWt(mol)
    if (num_ro5_violations(mol) > 0 or molWt < 300 or molWt > 400):
        return None
    return mol

Define a function to take a screenshot of the Flare window.

def take_screenshot(fmw, filename):
    if (not fmw):
        return
    dw = QtGui.QApplication.desktop()
    rect = QtGui.QApplication.desktop().frameGeometry()
    rect = fmw.widget().geometry()
    pixmap = QtGui.QPixmap.grabWindow(dw.winId())
    x_ratio = pixmap.size().width() / dw.size().width()
    y_ratio = pixmap.size().height() / dw.size().height()
    rect.setLeft(rect.left() * x_ratio)
    rect.setRight(rect.right() * x_ratio)
    rect.setTop(rect.top() * y_ratio)
    rect.setBottom(rect.bottom() * y_ratio)
    pixmap = pixmap.copy(rect)
    try:
        filepath = flare_ipynb_dir()
    except:
        filepath = os.getcwd()
    pixmap.save(os.path.join(filepath, filename))

Download all ChEMBL acetylcholinesterase inhibitors endowed with a K_i value.

chembl_mols = download_chembl_ligands_ki('acetylcholinesterase')

Let's see how many acetylcholinesterase inhibitors were retrieved from ChEMBL.

len(chembl_mols)

1747

Create a new project.
If we are running in Flare GUI, swap the Project in main_window with the one we have just created.

fmw = flare.main_window()
p = flare.Project()
if (fmw):
    fmw.project = p

Initialise smiles_set to rule out duplicates.

smiles_set = set()

Get RDKit ComputedProperties names.

props = Properties()
n_vec = props.GetPropertyNames()

Loop over all chembl_mols retrieved from ChEMBL.
Filter out all mols that do not satisfy our criteria.
If any mol is a duplicate of a molecule we have seen before, skip it.
Set the molecule name to the molecule_chembl_id field.
Assign stereochemistry (R/S, E/Z).
Compute a few RDKit properties.
Skip the mol if it has >5 rotatable bonds.
Set RDKit properties on the mol prefixed with the RDK tag.
Copy across selected ChEMBL properties.
Compute 2D coordinates for the mol.
Append it to Flare's ligand table.
Append it to the rdk_mols list.

rdk_mols = []
for chembl_mol in chembl_mols:
    mol = get_rdkit_mol_from_chembl_mol(chembl_mol)
    if (mol is None):
        continue
    chembl_properties_to_retain = ['document_chembl_id',
        'document_journal', 'document_year', 'ligand_efficiency',
        'molecule_chembl_id', 'molecule_pref_name',
        'parent_molecule_chembl_id', 'pchembl_value']
    smiles = Chem.MolToSmiles(mol)
    if (smiles in smiles_set):
        continue
    smiles_set.add(smiles)
    mol.SetProp('_Name', chembl_mol['molecule_chembl_id'])
    rdmolops.AssignStereochemistry(mol)
    p_vec = props.ComputeProperties(mol)
    too_flexible = False
    for name, value in zip(n_vec, p_vec):
        if (name == 'NumRotatableBonds' and value > 5):
            too_flexible = True
            break
        mol.SetProp('RDK{0:s}'.format(name), '{0:.3f}'.format(value))
    if (too_flexible):
        continue
    for chembl_prop_name in chembl_properties_to_retain:
        mol.SetProp(chembl_prop_name.upper(),
                    str(chembl_mol[chembl_prop_name]))
    rdDepictor.Compute2DCoords(mol)
    p.ligands.append(mol)
    rdk_mols.append(mol)

Let's see how the first ligand looks like.

rdk_mols[0]

Let's see how the first 9 ligands look like.

MolsToGridImage(rdk_mols[:9])

Loop over all ligands in Flare's ligand table.
Generate a 3D structure for each ligand and quickly optimise it.
Add field points.
Set field points to be visible in the GUI.

for l in p.ligands:
    l.minimize()
    l.add_fields()
    l.style.field_points_visible = True

Select the third ligand, center and zoom on it.

if (fmw):
    fmw.selected_ligands = [p.ligands[2]]
    fmw.camera.center_on(fmw.selected_ligands)
    fmw.camera.fit_to_window(fmw.selected_ligands)

This is how the Flare window looks like now: all ligands loaded, converted to 3D and endowed with field points.

take_screenshot(fmw, 'Figure2.png')

Let's define a couple of boilerplate matrix classes.

class Matrix:
    def __init__(self, rowNum, colNum):
        self.rowNum = rowNum
        self.colNum = colNum
        self.dataSize = rowNum * colNum
        self.data = [0] * self.dataSize
    def index(self, y, x):
        if ((x >= self.colNum) or (y >= self.rowNum)):
            raise IndexError('index out of bounds')
        i = y * self.colNum + x
        return i
    def get_cell(self, y, x):
        return self.data[self.index(y, x)]
    def set_cell(self, y, x, v):
        self.data[self.index(y, x)] = v

class TMatrix(Matrix):
    def __init__(self, rowNum):
        self.size = rowNum
        self.dataSize = int((rowNum + 1) * rowNum / 2)
        self.data = [0] * self.dataSize
    def index(self, y, x):
        if ((x >= self.size) or (y >= self.size)):
            raise IndexError('index out of bounds')
        if (x < y):
            x, y = y, x
        i = y * (self.size - 1) + int(y * (1 - y) / 2) + x
        return i

Initialize an upper-triangular matrix with the number of molecules that we have.

sim_matrix = TMatrix(len(rdk_mols))

Compute Morgan fingerprints with radius 2 for each molecule.

fps = [rdMolDescriptors.GetMorganFingerprint(m, 2) for m in rdk_mols]

Fill the all-against-all upper-triangular similarity matrix with Tanimoto similarity values.

for y in range(len(rdk_mols)):
    for x in range(y, len(rdk_mols)):
        sim_matrix.set_cell(y, x,
                            DataStructs.TanimotoSimilarity(fps[y], fps[x]))

This is the 0-based index of the selected ligand.

sel_ligand_idx = 0
if (fmw):
    sel_ligand_idx = p.ligands.index(fmw.selected_ligands[0])

sel_ligand_idx

2

Sort ligands by decreasing similarity to the selected ligand.

sim = sorted([(sim_matrix.get_cell(sel_ligand_idx, i), i) for i in range(
    len(fps)) if i != sel_ligand_idx], reverse=True)

Check similarity and indices of the top-3 most similar ligands.

sim[:3]

[(0.85, 14), (0.8192771084337349, 3), (0.7831325301204819, 1)]

Select the 3 most similar ligands to the selected one.

if (fmw):
    fmw.selected_ligands = fmw.selected_ligands + [
        p.ligands[i] for _, i in sim[:3]]

Enable grid view mode.

if (fmw):
    fmw.camera.grid_enabled = True

Get selected ligands as RDKit mols.

sel_ligands = [p.ligands[0]]
if (fmw):
    sel_ligands = fmw.selected_ligands

sel_rdk_mols = [Chem.RemoveHs(l.to_rdmol()) for l in sel_ligands]

Compute the MCS.

mcs_result = None
try:
    mcs_result = rdFMCS.FindMCS(sel_rdk_mols, timeout = 10)
except:
    pass

Create a query mol corresponding to the MCS.

mcs_query = None
if (mcs_result):
    mcs_query = Chem.MolFromSmarts(mcs_result.smartsString)

Pick all atoms in the Flare viewport corresponding to the MCS.

picked_atoms = []
if (mcs_query):
    for i, mol in enumerate(sel_rdk_mols):
        match = mol.GetSubstructMatch(mcs_query)
        picked_atoms += [sel_ligands[i].atoms[m] for m in match]
    if (fmw):
        fmw.picked_atoms = picked_atoms

This is how the Flare window looks like now: four ligands displayed in grid mode, their MCS highlighted by a glowing halo.

take_screenshot(fmw, 'Figure3.png')

Disable grid view mode.

if (fmw):
    fmw.camera.grid_enabled = False

Unpick atoms and unselect ligands.

if (fmw):
    fmw.picked_atoms = []
    fmw.selected_ligands = []

Download 1EVE from the Protein Data Bank.

download_from_pdb(p, '1eve')

Carry out protein preparation on all proteins (there is only one, anyway) capping chains with ACE and NME groups if there are any missing residues.
Wait for protein preparation to be finished.

prep = flare.ProteinPrep()
prep.proteins = p.proteins
prep.cap_chains = True
prep.start()
prep.wait()

Extract all sequences which are of Ligand type.

ligand_sequences = [s for s in p.proteins[
    0].sequences if s.type == flare.Sequence.Type.Ligand]

Check the type of ligand_sequences.

ligand_sequences

[<cresset.flare.Sequence 3 at 0x000001A79EA94EB0>]

Make sure we have a single ligand sequence.

assert(len(ligand_sequences) == 1)

Extract the only residue in the ligand sequence, which corresponds to the ligand itself.

co_xx_ligand_residue = ligand_sequences[0]

Prepend co_xx_ligand_residue to the ligand list, such that the co-crystallised ligand will become the first ligand in Flare's ligand table.

p.ligands.insert(0, co_xx_ligand_residue)

[<cresset.flare.Ligand with ID '159' role '1000' at 0x000001A79E858580>]

Check how many ligands we have.

len(p.ligands)

53

Dock all ligands but the co-crystallized one to the protein, setting the docking grid in correspondence of the co-crystallized ligand.

dock = flare.Docking()
dock.ligands = p.ligands[1:]
dock.protein = p.proteins[0]
dock.system.grid = co_xx_ligand_residue.atoms
dock.start()
dock.wait()

This is how the Flare window looks like now: all ligands have poses through which it is possible to browse.

take_screenshot(fmw, 'Figure4.png')

Let's do a scatter plot using data from the ligand table.

try:
    %matplotlib inline
except:
    pass

Plot RDKCrippenClogP against PCHEMBL_VALUE (i.e, the pK_i)

x = []
y = []
for l in p.ligands:
    try:
        pchembl_value = float(l.properties['PCHEMBL_VALUE'].value)
    except:
        continue
    try:
        alogp = float(l.properties['RDKCrippenClogP'].value)
    except:
        continue
    y.append(pchembl_value)
    x.append(alogp)

Add a linear regression trendline to the scatterplot.

z = np.polyfit(x, y, 1)
pz = np.poly1d(z)

plt.scatter(x, y); plt.plot(x, pz(x), 'r--');

Compute a few statistics about this regression.

slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)

Check the r²

r_value**2

0.09269572651720885

There is no significant correlation between Crippen ClogP and the pK_i.
This notebook shows how to deal with a ChEMBL dataset in a completely automated fashion, including taking screenshots and doing plots for reporting.