dlEMG: Deep Learning Based Energetic Molecule Generator

dlEMG is designed to generate energetic molecules by using deep learning. If you use the dlEMG, please cite:

Gaokeng Xiao.dlEMG: Deep Learning Based Energetic Molecule Generator. Guangzhou Molcalx Information & Technology Ltd.

import sys
sys.path.append('/public/gkxiao/work/QBMG')

切换到工作目录

#chage workding directory to QBMG
import os
os.chdir('/public/gkxiao/work/QBMG')

#print the current working directory
#it should be: /public/gkxiao/work/QBMG
os.getcwd()

'/public/gkxiao/work/QBMG'

从模型中采样

import torch
from torch.utils.data import DataLoader
from rdkit import Chem
from rdkit import rdBase
import data_struct as ds
from data_struct import MolData, Vocabulary
from model import RNN
import sys


def Sample(filename, enumerate_number):
    voc = Vocabulary(init_from_file="./data/zinc_f_voc")
    Prior = RNN(voc)
    print("Prior RNN model:",filename,"output number:", enumerate_number)
    # Can restore from a saved RNN
    Prior.rnn.load_state_dict(torch.load(filename))
    totalsmiles = set()
    enumerate_number = int(enumerate_number)
    molecules_total = 0
    for epoch in range(1, 10000):
        seqs, likelihood, _ = Prior.sample(enumerate_number)
        valid = 0
        for i, seq in enumerate(seqs.cpu().numpy()):
            smile = voc.decode(seq)
            if Chem.MolFromSmiles(smile):
                valid += 1
                totalsmiles.add(smile)

        molecules_total = len(totalsmiles)
        # summary the information 
        #print(("\n{:>4.1f}% valid SMILES".format(100 * valid / len(seqs))))
        #print(valid, molecules_total, epoch)
        if molecules_total > enumerate_number:
            break
    return totalsmiles

计算氧平衡值

#OB score
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
def obscore(struct):
    m = Chem.MolFromSmiles(struct)
    if m is None:
        pass
    m = Chem.AddHs(m)
    wt = round(Descriptors.MolWt(m),2)
    NNum = len(m.GetSubstructMatches(Chem.MolFromSmarts('[n,N]')))
    ONum = len(m.GetSubstructMatches(Chem.MolFromSmarts('[o,O]')))
    CNum = len(m.GetSubstructMatches(Chem.MolFromSmarts('[c,C]')))
    FNum = len(m.GetSubstructMatches(Chem.MolFromSmarts('[F]')))
    ClNum = len(m.GetSubstructMatches(Chem.MolFromSmarts('[Cl]')))
    BrNum = len(m.GetSubstructMatches(Chem.MolFromSmarts('[Br]')))
    INum = len(m.GetSubstructMatches(Chem.MolFromSmarts('[I]')))
    HNum = len(m.GetSubstructMatches(Chem.MolFromSmarts('[H]')))
    np = round(100*NNum*14/wt,2)
    OB = round(1600*(ONum-2*CNum-0.5*(HNum - FNum - ClNum - BrNum - INum))/wt ,2)
    return struct.strip(),wt,NNum,np,OB
    #print("%s %s %s %s %s" %(struct.strip(),str(wt),str(NNum),str(np),str(OB)))

选择模型

考察了训练集化合物对结构生成的影响：一组多含有20个硝基化合物，另一组没有。问题：它们会怎么影响结果？测试表明：虽然老师只用了100多个该化合物调教学生，但是可以看到确实可以生成与训练集类似的化合物；尤其是是通过硝基化合物的加入，生成了更多的含硝基的化合物。这说明模型是可靠的，训练什么东西就得到什么东西。

#Date： 2019-02-26
#Author: Gaokeng Xiao
#Description: model trained with 145 molecules  which includes some nitro- compounds
filename = '/public/gkxiao/work/QBMG/data/20190226_epochs_transfer.ckpt'

#model trained with 128 molecules with ob value > -50.
#filename = '/public/gkxiao/work/QBMG/data/energy_100_epochs_transfer.ckpt'

设定生成的分子数，并打印所用的模型

#the number of molecules to be generated
product=[]

Prior RNN model: /public/gkxiao/work/QBMG/data/20190226_epochs_transfer.ckpt output number: 50

enumerate_number = 100
totalsmiles = Sample(filename,enumerate_number)

Prior RNN model: /public/gkxiao/work/QBMG/data/20190226_epochs_transfer.ckpt output number: 100

打印分子，及其性质：最后一列为氮氧平衡值

#print("SMILES Wt #N N% OB")

for structure in totalsmiles:
     molnew = obscore(structure)
     product.append(molnew)

#product

#print("SMILES Wt #N N% OB")
#product=[]
#for structure in totalsmiles:
#    product.append(obscore(structure))
#    print(structure)

import pandas as pd
df = pd.DataFrame(data=product)
df.columns = ['Structure', 'MolWt', 'nCount','nPercent','obBalance']

#df.sort_values(by="obBalance" , ascending=False) 
df.sort_values(by="obBalance",ascending= False)

df['obBalance'].max()

48.19

df['obBalance'].min()

-182.48

df.describe()

df2 = df.loc[df.obBalance >= -50].sort_values(by="obBalance",ascending= False)

from rdkit import Chem
from rdkit.Chem import Draw, Descriptors
from matplotlib import pyplot as plt
%matplotlib inline
mols = []
for smiles in df2['Structure']:
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        mols.append(mol)
Draw.MolsToGridImage(mols, molsPerRow=5)

	Structure	MolWt	nCount	nPercent	obBalance
162	O=[N+]([O-])C([O-])([N+](=O)[O-])[N+](=O)[O-]	166.02	3	25.30	48.19
50	OCC([N+](=O)[O-])([N+](=O)[O-])[O-]	151.05	2	18.54	5.30
134	O=[N+]([O-])N=C1NC(=O)N([N+](=O)[O-])N1	190.08	6	44.19	0.00
23	O=[N+]([O-])C1([N+](=O)[O-])NC(=O)N1	162.06	4	34.56	0.00
105	O=c1[nH][nH][n+]([O-])[n+]1[O-]	118.05	4	47.44	0.00
219	Oc1nonc1[N+](=O)[O-]	131.05	3	32.05	-6.10
186	O=[N+]([O-])N=C1NC(=O)N(O)[C@](O)([N+](=O)[O-])N1	236.10	6	35.58	-6.78
179	O=[N+]([O-])C1([N+](=O)[O-])C(N)(O)N1	164.08	4	34.13	-9.75
191	O=C1NC(=O)N([N+](=O)[O-])N1	146.06	4	38.34	-10.95
196	O=[N+]([O-])C1([N+](=O)[O-])CC([N+](=O)[O-])N1	192.09	4	29.15	-16.66
116	O=C([O-])N1N=NC(=O)N1	129.06	4	43.39	-18.60
228	O=[N+]([O-])N=C1N[C@@H](O)[C@@](O)([N+](=O)[O-...	207.10	5	33.80	-19.31
26	O=[N+]([O-])N=C1N[C@@H](O)[N+](=O)N1	162.09	5	43.19	-19.74
214	O=[N+]([O-])N=C1N[C@H](O)C(C(=O)[O-])([N+](=O)...	234.10	5	29.90	-20.50
64	O=[N+]([O-])C1([N+](=O)[O-])Oc2nocc21	187.07	3	22.45	-21.38
51	O=[N+]([O-])N=C1N[C@@H](O)N1O	148.08	4	37.82	-21.61
135	O=[N+]([O-])[N+]([O-])[C@@H]1N[C@@H]([O-])N1	148.08	4	37.82	-21.61
76	O=C1/C(=O)C(O)([N+](=O)[O-])N1	146.06	2	19.17	-21.91
199	O=[N+]([O-])c1cc([N+](=O)[O-])c(O)n[n+]1[O-]	202.08	4	27.71	-23.75
207	O=[N+]([O-])N=C1NC(=O)N1	130.06	4	43.06	-24.60
209	O=C1NC(=O)[C@](O)([N+](=O)[O-])N1	161.07	3	26.08	-24.83
16	O=[N+]([O-])N=C1N[C@@H](O)[C@@](O)(C([N+](=O)[...	322.15	6	26.07	-24.83
121	O=C1NC(=O)C([N+](=O)[O-])(O)N1	161.07	3	26.08	-24.83
171	O=C1NC(=O)[C@@](O)([N+](=O)[O-])N1	161.07	3	26.08	-24.83
169	O=C1[N-]C(=O)C([N+](=O)[O-])S1	161.12	2	17.38	-24.83
85	O=c1[nH]nnc([O-])c1[N+](=O)[O-]	157.06	4	35.66	-25.47
81	O=[N+]([O-])c1n[nH]c2nnnn12	155.08	7	63.19	-25.79
233	O=c1[nH]c[n+][n+]([O-])c1[N+](=O)[O-]	158.07	4	35.43	-30.37
154	O=c1[nH]cnc([O-])[n+]1[N+](=O)[O-]	158.07	4	35.43	-30.37
35	O=c1[nH]ncc([O-])[n+]1[N+](=O)[O-]	158.07	4	35.43	-30.37
...	...	...	...	...	...
127	O=c1cccc([N+](=O)[O-])c(=O)[nH]1	168.11	2	16.66	-95.18
5	O=Cc1c([O-])c(N)[nH]c(=O)c1O	169.12	2	16.56	-99.34
0	Cc1nonc1-c1nc2nonc2[nH]1	192.14	6	43.72	-99.93
201	O=C([O-])c1ccc(O)c([O-])c1[O-]	167.10	0	0.00	-100.54
231	Nc1ncnc([O-])c1O	126.09	3	33.31	-101.51
184	O=[N+]([O-])N=C1N[C@H](O)[C@]2(N)CC([N+](=O)[O...	273.23	6	30.74	-102.48
112	O=c1[nH]ccc([N+](=O)[O-])c1	140.10	2	19.99	-102.78
102	O=c1[nH][nH]c2ncc[n+]([O-])c12	152.11	4	36.82	-105.19
194	O=C1N[C@@H]2[C@@H](O)C(=NO)[C@@H]2N1	157.13	3	26.73	-106.92
138	O=C1N[C@H]2NC(=O)[C@H]2N1	127.10	3	33.04	-107.00
83	O/N=c1cc[nH]c(=O)[nH]1	127.10	3	33.04	-107.00
41	O=C1N[C@H]2NC(=O)[C@@H]2N1	127.10	3	33.04	-107.00
174	Cn1ccc([O-])c1[N+](=O)[O-]	141.11	2	19.84	-107.72
55	O=c1[nH]cncc1[O-]	111.08	2	25.21	-108.03
178	O=C1N[C@@H]([NH3+])[C@H](O)N1	118.12	3	35.56	-108.36
206	Nc1cn[n+]2[nH]cnn12	125.11	6	67.14	-108.70
197	O=c1[nH]cccc1-c1nnnn1O	179.14	5	39.08	-111.64
71	O=c1[nH]ncn1Oc1ccncc1[O-]	193.14	4	28.99	-111.84
137	Nc1c(N)[nH]c(=O)[nH]1	114.11	4	49.08	-112.17
128	Cc1noc2ncnn12	124.10	4	45.12	-116.04
220	COc1ncc(C=O)[nH]c1=O	154.13	2	18.17	-124.57
72	O=C([O-])N1CC2CCC2C1[N+](=O)[O-]	185.16	2	15.12	-125.30
66	O=[N+]([O-])C1([N+](=O)[O-])CC2CCC1C2	186.17	2	15.04	-128.91
212	O=C1c2c(ncn2[O-])c2ccnnc21	187.14	4	29.92	-132.52
49	O=c1[nH]cccc1-c1nncn1O	178.15	4	31.43	-134.72
144	Nc1nc2[nH]ccn2n1	123.12	5	56.86	-136.45
142	Cc1c(N)[nH]c(=O)n1N	128.13	4	43.71	-137.36
99	O=[N+]([O-])C1CC2CC([N+](=O)[O-])CC2C1	200.19	2	13.99	-143.86
21	O=C1N[C@H]2NCCN[C@H]2N1	142.16	4	39.39	-157.57
187	O=[N+]([O-])C1C2CCC2C1	127.14	1	11.01	-182.48

	MolWt	nCount	nPercent	obBalance
count	234.000000	234.000000	234.000000	234.000000
mean	169.730385	3.927350	32.802778	-61.368504
std	42.469304	1.455734	11.311179	30.821087
min	110.080000	0.000000	0.000000	-182.480000
25%	143.087500	3.000000	25.232500	-78.240000
50%	160.580000	4.000000	32.540000	-55.780000
75%	187.120000	5.000000	38.807500	-42.730000
max	348.230000	8.000000	73.140000	48.190000