Author: Gaokeng Xiao
Date:2019-02-06
Data Set: USA EPI. EPI Suite Data http://esc.syrres.com/interkow/EPiSuiteData.htm (accessed Nov 12, 2018).
#Pandas and Numpy
import pandas as pd
import numpy as np
#RDkit for fingerprinting and cheminformatics
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, rdMolDescriptors,Descriptors
from rdkit.Chem.EState import Fingerprinter
#Keras for deep learning
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.callbacks import ReduceLROnPlateau
from keras.regularizers import l2 as l2_reg
from keras.optimizers import SGD
from sklearn.preprocessing import StandardScaler
#Matplotlib for plotting
from matplotlib import pyplot as plt
读入数据
#Read the data
train = pd.read_csv("/public/gkxiao/work/rdkit_deeplearn/bp_train.csv",sep=',')
test = pd.read_csv("/public/gkxiao/work/rdkit_deeplearn/bp_test.csv",sep=',')
预览数据
train.head()
计算描述符,此处没有对描述符进行归一化处理,只是目测将分子量除以500之后大体在数据分布上是一致的。建议在研究中应该进行数据归一化与特征变量的优选研究。
#Define a helper function to append the Molweight to the E-state indices
def fps_plus_mw(mol):
return np.append(Fingerprinter.FingerprintMol(mol)[0],round(Descriptors.MolWt(mol),4)/500)
#Add some new columns
train['Mol'] = train['SMILES'].apply(Chem.MolFromSmiles)
train['Descriptors'] = train['Mol'].apply(fps_plus_mw)
test['Mol'] = test['SMILES'].apply(Chem.MolFromSmiles)
test['Descriptors'] = test['Mol'].apply(fps_plus_mw)
预览数据
train.head()
将数据转为Numpy数组格式
#Convert to Numpy arrays
X_train = np.array(list(train['Descriptors']))
y_train = train['BP-Measured'].values
X_test = np.array(list(test['Descriptors']))
y_test = test['BP-Measured'].values
建立深度学习网络
model = Sequential()
#model.add(Dense(output_dim=10, input_dim=X_train.shape[1]))
model.add(Dense(input_dim=X_train.shape[1],units=50))
model.add(Activation("sigmoid"))
#model.add(Dense(output_dim=1))
model.add(Dense(units=1))
model.add(Activation("linear"))
训练模型
model.compile(loss='mean_squared_error', optimizer=SGD(lr=0.001, momentum=0.9, nesterov=True))
history = model.fit(X_train, y_train, epochs=20000, batch_size=200)
y_pred = model.predict(X_test)
rms = (np.mean((y_test.reshape(-1,1) - y_pred)**2))**0.5
#s = np.std(y_test -y_pred)
print("Neural Network RMS", rms)
回顾训练情况
plt.plot(history.history["loss"], label="Loss")
plt.xlabel('Epoch')
plt.ylabel('Neural Network RMS')
plt.yscale("log")
plt.legend()
保存训练过程
import pickle
file=open("/public/gkxiao/work/rdkit_deeplearn/bp_keras_model_history.pkl","wb")
pickle.dump(history.history, file)
file.close
保存模型
#save model
model.save('/public/gkxiao/work/rdkit_deeplearn/bp_keras_model.h5')
model.save_weights('/public/gkxiao/work/rdkit_deeplearn/bp_keras_model_weights.h5')
绘制训练集与测试集的预测值与观察值散点图
import matplotlib.pyplot as plt
plt.scatter(y_train,model.predict(X_train), label = 'Train', c='blue')
plt.title('Neural Network Predictor')
plt.xlabel('Measured Boiling Point')
plt.ylabel('Predicted Boiling Point')
plt.scatter(y_test,model.predict(X_test),c='lightgreen', label='Test', alpha = 0.8)
plt.legend(loc=4)
plt.show()
模型质量评估
from sklearn.metrics import mean_squared_error,r2_score
y_pred=model.predict(X_train)
train_mse=mean_squared_error(y_train,y_pred)
train_r2 = r2_score(y_train,y_pred)
test_pred=model.predict(X_test)
test_mse=mean_squared_error(y_test,test_pred)
test_r2 = r2_score(y_test,test_pred)
print("Train set: MSE=",round(train_mse,2),"R^2=",round(train_r2,2))
print("Test set: MSE=",round(test_mse,2),"R^2=",round(test_r2,2))
预测新化合物的沸点
#Structure (SMILES code) of new compound
smi = 'FC(F)(F)C=CC(F)(F)F'
#create a prediction set and calculate descriptor
predset=pd.DataFrame({'SMILES':[smi]})
predset['Mol'] = predset['SMILES'].apply(Chem.MolFromSmiles)
predset['Descriptors'] = predset['Mol'].apply(fps_plus_mw)
#load model
from keras.models import load_model
model = load_model("/public/gkxiao/work/rdkit_deeplearn/bp_keras_model.h5")
#predict boiling point
predset['Pred_BP'] = model.predict(np.array(list(predset['Descriptors'])))[0][0]
predset
print("Boiling point:", round(model.predict(np.array(list(predset['Descriptors'])))[0][0],2),"℃")