dlEMG: Deep Learning Based Energetic Molecule Generator

dlEMG is designed to generate energetic molecules by using deep learning. If you use the dlEMG, please cite:

Gaokeng Xiao.dlEMG: Deep Learning Based Energetic Molecule Generator. Guangzhou Molcalx Information & Technology Ltd.

In [1]:
import sys
sys.path.append('/public/gkxiao/work/QBMG')

切换到工作目录

In [2]:
#chage workding directory to QBMG
import os
os.chdir('/public/gkxiao/work/QBMG')

#print the current working directory
#it should be: /public/gkxiao/work/QBMG
os.getcwd()
Out[2]:
'/public/gkxiao/work/QBMG'

从模型中采样

In [3]:
import torch
from torch.utils.data import DataLoader
from rdkit import Chem
from rdkit import rdBase
import data_struct as ds
from data_struct import MolData, Vocabulary
from model import RNN
import sys


def Sample(filename, enumerate_number):
    voc = Vocabulary(init_from_file="./data/zinc_f_voc")
    Prior = RNN(voc)
    print("Prior RNN model:",filename,"output number:", enumerate_number)
    # Can restore from a saved RNN
    Prior.rnn.load_state_dict(torch.load(filename))
    totalsmiles = set()
    enumerate_number = int(enumerate_number)
    molecules_total = 0
    for epoch in range(1, 10000):
        seqs, likelihood, _ = Prior.sample(enumerate_number)
        valid = 0
        for i, seq in enumerate(seqs.cpu().numpy()):
            smile = voc.decode(seq)
            if Chem.MolFromSmiles(smile):
                valid += 1
                totalsmiles.add(smile)

        molecules_total = len(totalsmiles)
        # summary the information 
        #print(("\n{:>4.1f}% valid SMILES".format(100 * valid / len(seqs))))
        #print(valid, molecules_total, epoch)
        if molecules_total > enumerate_number:
            break
    return totalsmiles

计算氧平衡值

In [4]:
#OB score
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
def obscore(struct):
    m = Chem.MolFromSmiles(struct)
    if m is None:
        pass
    m = Chem.AddHs(m)
    wt = round(Descriptors.MolWt(m),2)
    NNum = len(m.GetSubstructMatches(Chem.MolFromSmarts('[n,N]')))
    ONum = len(m.GetSubstructMatches(Chem.MolFromSmarts('[o,O]')))
    CNum = len(m.GetSubstructMatches(Chem.MolFromSmarts('[c,C]')))
    FNum = len(m.GetSubstructMatches(Chem.MolFromSmarts('[F]')))
    ClNum = len(m.GetSubstructMatches(Chem.MolFromSmarts('[Cl]')))
    BrNum = len(m.GetSubstructMatches(Chem.MolFromSmarts('[Br]')))
    INum = len(m.GetSubstructMatches(Chem.MolFromSmarts('[I]')))
    HNum = len(m.GetSubstructMatches(Chem.MolFromSmarts('[H]')))
    np = round(100*NNum*14/wt,2)
    OB = round(1600*(ONum-2*CNum-0.5*(HNum - FNum - ClNum - BrNum - INum))/wt ,2)
    return struct.strip(),wt,NNum,np,OB
    #print("%s %s %s %s %s" %(struct.strip(),str(wt),str(NNum),str(np),str(OB)))

选择模型

考察了训练集化合物对结构生成的影响:一组多含有20个硝基化合物,另一组没有。问题:它们会怎么影响结果?测试表明:虽然老师只用了100多个该化合物调教学生,但是可以看到确实可以生成与训练集类似的化合物;尤其是是通过硝基化合物的加入,生成了更多的含硝基的化合物。这说明模型是可靠的,训练什么东西就得到什么东西。

In [5]:
#Date: 2019-02-26
#Author: Gaokeng Xiao
#Description: model trained with 145 molecules  which includes some nitro- compounds
filename = '/public/gkxiao/work/QBMG/data/20190226_epochs_transfer.ckpt'

#model trained with 128 molecules with ob value > -50.
#filename = '/public/gkxiao/work/QBMG/data/energy_100_epochs_transfer.ckpt'

设定生成的分子数,并打印所用的模型

In [39]:
#the number of molecules to be generated
product=[]
Prior RNN model: /public/gkxiao/work/QBMG/data/20190226_epochs_transfer.ckpt output number: 50
In [62]:
enumerate_number = 100
totalsmiles = Sample(filename,enumerate_number)
Prior RNN model: /public/gkxiao/work/QBMG/data/20190226_epochs_transfer.ckpt output number: 100

打印分子,及其性质:最后一列为氮氧平衡值

In [63]:
#print("SMILES Wt #N N% OB")

for structure in totalsmiles:
     molnew = obscore(structure)
     product.append(molnew)
In [64]:
#product
In [65]:
#print("SMILES Wt #N N% OB")
#product=[]
#for structure in totalsmiles:
#    product.append(obscore(structure))
#    print(structure)
In [66]:
import pandas as pd
df = pd.DataFrame(data=product)
df.columns = ['Structure', 'MolWt', 'nCount','nPercent','obBalance']
In [67]:
#df.sort_values(by="obBalance" , ascending=False) 
df.sort_values(by="obBalance",ascending= False)
Out[67]:
Structure MolWt nCount nPercent obBalance
162 O=[N+]([O-])C([O-])([N+](=O)[O-])[N+](=O)[O-] 166.02 3 25.30 48.19
50 OCC([N+](=O)[O-])([N+](=O)[O-])[O-] 151.05 2 18.54 5.30
134 O=[N+]([O-])N=C1NC(=O)N([N+](=O)[O-])N1 190.08 6 44.19 0.00
23 O=[N+]([O-])C1([N+](=O)[O-])NC(=O)N1 162.06 4 34.56 0.00
105 O=c1[nH][nH][n+]([O-])[n+]1[O-] 118.05 4 47.44 0.00
219 Oc1nonc1[N+](=O)[O-] 131.05 3 32.05 -6.10
186 O=[N+]([O-])N=C1NC(=O)N(O)[C@](O)([N+](=O)[O-])N1 236.10 6 35.58 -6.78
179 O=[N+]([O-])C1([N+](=O)[O-])C(N)(O)N1 164.08 4 34.13 -9.75
191 O=C1NC(=O)N([N+](=O)[O-])N1 146.06 4 38.34 -10.95
196 O=[N+]([O-])C1([N+](=O)[O-])CC([N+](=O)[O-])N1 192.09 4 29.15 -16.66
116 O=C([O-])N1N=NC(=O)N1 129.06 4 43.39 -18.60
228 O=[N+]([O-])N=C1N[C@@H](O)[C@@](O)([N+](=O)[O-... 207.10 5 33.80 -19.31
26 O=[N+]([O-])N=C1N[C@@H](O)[N+](=O)N1 162.09 5 43.19 -19.74
214 O=[N+]([O-])N=C1N[C@H](O)C(C(=O)[O-])([N+](=O)... 234.10 5 29.90 -20.50
64 O=[N+]([O-])C1([N+](=O)[O-])Oc2nocc21 187.07 3 22.45 -21.38
51 O=[N+]([O-])N=C1N[C@@H](O)N1O 148.08 4 37.82 -21.61
135 O=[N+]([O-])[N+]([O-])[C@@H]1N[C@@H]([O-])N1 148.08 4 37.82 -21.61
76 O=C1/C(=O)C(O)([N+](=O)[O-])N1 146.06 2 19.17 -21.91
199 O=[N+]([O-])c1cc([N+](=O)[O-])c(O)n[n+]1[O-] 202.08 4 27.71 -23.75
207 O=[N+]([O-])N=C1NC(=O)N1 130.06 4 43.06 -24.60
209 O=C1NC(=O)[C@](O)([N+](=O)[O-])N1 161.07 3 26.08 -24.83
16 O=[N+]([O-])N=C1N[C@@H](O)[C@@](O)(C([N+](=O)[... 322.15 6 26.07 -24.83
121 O=C1NC(=O)C([N+](=O)[O-])(O)N1 161.07 3 26.08 -24.83
171 O=C1NC(=O)[C@@](O)([N+](=O)[O-])N1 161.07 3 26.08 -24.83
169 O=C1[N-]C(=O)C([N+](=O)[O-])S1 161.12 2 17.38 -24.83
85 O=c1[nH]nnc([O-])c1[N+](=O)[O-] 157.06 4 35.66 -25.47
81 O=[N+]([O-])c1n[nH]c2nnnn12 155.08 7 63.19 -25.79
233 O=c1[nH]c[n+][n+]([O-])c1[N+](=O)[O-] 158.07 4 35.43 -30.37
154 O=c1[nH]cnc([O-])[n+]1[N+](=O)[O-] 158.07 4 35.43 -30.37
35 O=c1[nH]ncc([O-])[n+]1[N+](=O)[O-] 158.07 4 35.43 -30.37
... ... ... ... ... ...
127 O=c1cccc([N+](=O)[O-])c(=O)[nH]1 168.11 2 16.66 -95.18
5 O=Cc1c([O-])c(N)[nH]c(=O)c1O 169.12 2 16.56 -99.34
0 Cc1nonc1-c1nc2nonc2[nH]1 192.14 6 43.72 -99.93
201 O=C([O-])c1ccc(O)c([O-])c1[O-] 167.10 0 0.00 -100.54
231 Nc1ncnc([O-])c1O 126.09 3 33.31 -101.51
184 O=[N+]([O-])N=C1N[C@H](O)[C@]2(N)CC([N+](=O)[O... 273.23 6 30.74 -102.48
112 O=c1[nH]ccc([N+](=O)[O-])c1 140.10 2 19.99 -102.78
102 O=c1[nH][nH]c2ncc[n+]([O-])c12 152.11 4 36.82 -105.19
194 O=C1N[C@@H]2[C@@H](O)C(=NO)[C@@H]2N1 157.13 3 26.73 -106.92
138 O=C1N[C@H]2NC(=O)[C@H]2N1 127.10 3 33.04 -107.00
83 O/N=c1cc[nH]c(=O)[nH]1 127.10 3 33.04 -107.00
41 O=C1N[C@H]2NC(=O)[C@@H]2N1 127.10 3 33.04 -107.00
174 Cn1ccc([O-])c1[N+](=O)[O-] 141.11 2 19.84 -107.72
55 O=c1[nH]cncc1[O-] 111.08 2 25.21 -108.03
178 O=C1N[C@@H]([NH3+])[C@H](O)N1 118.12 3 35.56 -108.36
206 Nc1cn[n+]2[nH]cnn12 125.11 6 67.14 -108.70
197 O=c1[nH]cccc1-c1nnnn1O 179.14 5 39.08 -111.64
71 O=c1[nH]ncn1Oc1ccncc1[O-] 193.14 4 28.99 -111.84
137 Nc1c(N)[nH]c(=O)[nH]1 114.11 4 49.08 -112.17
128 Cc1noc2ncnn12 124.10 4 45.12 -116.04
220 COc1ncc(C=O)[nH]c1=O 154.13 2 18.17 -124.57
72 O=C([O-])N1CC2CCC2C1[N+](=O)[O-] 185.16 2 15.12 -125.30
66 O=[N+]([O-])C1([N+](=O)[O-])CC2CCC1C2 186.17 2 15.04 -128.91
212 O=C1c2c(ncn2[O-])c2ccnnc21 187.14 4 29.92 -132.52
49 O=c1[nH]cccc1-c1nncn1O 178.15 4 31.43 -134.72
144 Nc1nc2[nH]ccn2n1 123.12 5 56.86 -136.45
142 Cc1c(N)[nH]c(=O)n1N 128.13 4 43.71 -137.36
99 O=[N+]([O-])C1CC2CC([N+](=O)[O-])CC2C1 200.19 2 13.99 -143.86
21 O=C1N[C@H]2NCCN[C@H]2N1 142.16 4 39.39 -157.57
187 O=[N+]([O-])C1C2CCC2C1 127.14 1 11.01 -182.48

234 rows × 5 columns

In [68]:
df['obBalance'].max()
Out[68]:
48.19
In [69]:
df['obBalance'].min()
Out[69]:
-182.48
In [70]:
df.describe()
Out[70]:
MolWt nCount nPercent obBalance
count 234.000000 234.000000 234.000000 234.000000
mean 169.730385 3.927350 32.802778 -61.368504
std 42.469304 1.455734 11.311179 30.821087
min 110.080000 0.000000 0.000000 -182.480000
25% 143.087500 3.000000 25.232500 -78.240000
50% 160.580000 4.000000 32.540000 -55.780000
75% 187.120000 5.000000 38.807500 -42.730000
max 348.230000 8.000000 73.140000 48.190000
In [71]:
df2 = df.loc[df.obBalance >= -50].sort_values(by="obBalance",ascending= False)
In [72]:
from rdkit import Chem
from rdkit.Chem import Draw, Descriptors
from matplotlib import pyplot as plt
%matplotlib inline
mols = []
for smiles in df2['Structure']:
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        mols.append(mol)
Draw.MolsToGridImage(mols, molsPerRow=5)        
Out[72]: