dlEMG is designed to generate energetic molecules using deep learning. If you use dlEMG, please cite:
Gaokeng Xiao. dlEMG: Deep Learning Based Energetic Molecule Generator. Guangzhou Molcalx Information & Technology Ltd.
import sys
# Extend the module search path with the QBMG directory.
# NOTE(review): presumably this is what lets the project modules imported
# further down (data_struct, model) resolve -- confirm they live here.
sys.path.append('/public/gkxiao/work/QBMG')
切换到工作目录
# Change the working directory to QBMG; relative paths used later
# (e.g. "./data/zinc_f_voc" in Sample) are resolved against it.
import os
os.chdir('/public/gkxiao/work/QBMG')
# Show the current working directory (displayed only as notebook cell output);
# it should be: /public/gkxiao/work/QBMG
os.getcwd()
从模型中采样
import torch
from torch.utils.data import DataLoader
from rdkit import Chem
from rdkit import rdBase
import data_struct as ds
from data_struct import MolData, Vocabulary
from model import RNN
import sys
def Sample(filename, enumerate_number):
    """Sample unique, RDKit-parsable SMILES from a saved RNN checkpoint.

    The vocabulary is read from "./data/zinc_f_voc" (relative to the current
    working directory) and the RNN weights are restored from ``filename``
    with ``torch.load``.

    Args:
        filename: path of the checkpoint holding the RNN state dict.
        enumerate_number: requested number of molecules; converted with
            ``int()`` and also used as the size of every sampling batch.

    Returns:
        A set of SMILES strings. Sampling stops as soon as the set holds
        MORE than ``enumerate_number`` entries, so the result is normally
        larger than requested; it can be smaller if 9999 sampling rounds
        are exhausted first.
    """
    vocabulary = Vocabulary(init_from_file="./data/zinc_f_voc")
    prior = RNN(vocabulary)
    print("Prior RNN model:", filename, "output number:", enumerate_number)
    # Restore the weights of a previously trained RNN.
    prior.rnn.load_state_dict(torch.load(filename))

    target = int(enumerate_number)
    unique_smiles = set()
    for _ in range(9999):
        sequences, _likelihood, _ = prior.sample(target)
        decoded = (vocabulary.decode(row) for row in sequences.cpu().numpy())
        # Keep only the strings RDKit can turn into a molecule.
        unique_smiles.update(s for s in decoded if Chem.MolFromSmiles(s))
        if len(unique_smiles) > target:
            break
    return unique_smiles
计算氧平衡值
#OB score
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
def obscore(struct):
    """Compute weight, nitrogen content and oxygen balance of a molecule.

    Args:
        struct: SMILES string of the molecule.

    Returns:
        A tuple ``(smiles, wt, NNum, n_percent, OB)`` where ``smiles`` is
        ``struct`` stripped of surrounding whitespace, ``wt`` the molecular
        weight rounded to 2 decimals, ``NNum`` the number of nitrogen atoms,
        ``n_percent`` the nitrogen mass percentage (14 g/mol per N) and
        ``OB`` the oxygen balance, both rounded to 2 decimals.

    Raises:
        ValueError: if RDKit cannot parse ``struct`` or the parsed molecule
            has zero molecular weight (e.g. an empty SMILES).
    """
    m = Chem.MolFromSmiles(struct)
    if m is None:
        # Previously this case fell through and crashed inside Chem.AddHs
        # with an opaque argument error.
        raise ValueError("obscore: cannot parse SMILES %r" % (struct,))
    # Hydrogens must be explicit so that they can be counted below.
    m = Chem.AddHs(m)
    wt = round(Descriptors.MolWt(m), 2)
    if wt == 0:
        # Both formulas below divide by the molecular weight.
        raise ValueError("obscore: molecule %r has zero molecular weight" % (struct,))

    def count(smarts):
        # Number of atoms matching a single-atom SMARTS pattern.
        return len(m.GetSubstructMatches(Chem.MolFromSmarts(smarts)))

    NNum = count('[n,N]')
    ONum = count('[o,O]')
    CNum = count('[c,C]')
    HNum = count('[H]')
    halogens = sum(count(pattern) for pattern in ('[F]', '[Cl]', '[Br]', '[I]'))

    n_percent = round(100 * NNum * 14 / wt, 2)
    # OB = 1600 * (O - 2*C - H_eff/2) / MW, where H_eff is the hydrogen
    # count reduced by one for every halogen atom.
    OB = round(1600 * (ONum - 2 * CNum - 0.5 * (HNum - halogens)) / wt, 2)
    return struct.strip(), wt, NNum, n_percent, OB
#print("%s %s %s %s %s" %(struct.strip(),str(wt),str(NNum),str(np),str(OB)))
选择模型
考察了训练集化合物对结构生成的影响:一组多含有20个硝基化合物,另一组没有。问题:它们会怎么影响结果?测试表明:虽然老师只用了100多个该化合物调教学生,但是可以看到确实可以生成与训练集类似的化合物;尤其是通过硝基化合物的加入,生成了更多的含硝基的化合物。这说明模型是可靠的,训练什么东西就得到什么东西。
#Date: 2019-02-26
#Author: Gaokeng Xiao
#Description: model trained with 145 molecules which includes some nitro- compounds
# Checkpoint path; it is passed to Sample(), which loads it with torch.load.
filename = '/public/gkxiao/work/QBMG/data/20190226_epochs_transfer.ckpt'
# Alternative checkpoint (disabled): model trained with 128 molecules with ob value > -50.
#filename = '/public/gkxiao/work/QBMG/data/energy_100_epochs_transfer.ckpt'
设定生成的分子数,并打印所用的模型
# product holds the per-molecule property rows that are filled in further down.
product=[]
# The number of molecules to be generated (also printed by Sample()).
enumerate_number = 100
# Sample() prints the model path and the requested count, then returns a set
# of SMILES strings.
totalsmiles = Sample(filename,enumerate_number)
打印分子及其性质:最后一列为氧平衡值
#print("SMILES Wt #N N% OB")
# Rebuild the property table from scratch (one obscore() tuple per sampled
# SMILES) instead of appending to the existing list, so that re-running this
# cell does not add every row a second time.
product = [obscore(structure) for structure in totalsmiles]
#product
#print("SMILES Wt #N N% OB")
#product=[]
#for structure in totalsmiles:
# product.append(obscore(structure))
# print(structure)
import pandas as pd
# Name the columns at construction time: assigning df.columns afterwards
# raises a length-mismatch ValueError when product is empty.
df = pd.DataFrame(
    data=product,
    columns=['Structure', 'MolWt', 'nCount', 'nPercent', 'obBalance'],
)
# The bare expressions below do not modify df; their values are only visible
# as notebook cell output.
df.sort_values(by="obBalance", ascending=False)
df['obBalance'].max()
df['obBalance'].min()
df.describe()
# Keep the molecules whose oxygen balance is at least -50, best first.
df2 = df.loc[df.obBalance >= -50].sort_values(by="obBalance", ascending=False)
from rdkit import Chem
from rdkit.Chem import Draw, Descriptors
from matplotlib import pyplot as plt
%matplotlib inline
# Turn every selected SMILES back into an RDKit molecule, dropping any that
# fail to parse, and draw them in a grid with five molecules per row.
mols = [mol for mol in map(Chem.MolFromSmiles, df2['Structure']) if mol]
Draw.MolsToGridImage(mols, molsPerRow=5)