Author: Gaokeng Xiao

Date:2019-02-06

Data Set: USA EPI. EPI Suite Data http://esc.syrres.com/interkow/EPiSuiteData.htm (accessed Nov 12, 2018).

In [1]:
#Pandas and Numpy
import pandas as pd
import numpy as np
 
#RDkit for fingerprinting and cheminformatics
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, rdMolDescriptors,Descriptors
from rdkit.Chem.EState import Fingerprinter 
 
#Keras for deep learning
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.callbacks import ReduceLROnPlateau
from keras.regularizers import l2 as l2_reg
from keras.optimizers import SGD
 
from sklearn.preprocessing import StandardScaler

 
#Matplotlib for plotting
from matplotlib import pyplot as plt
/public/apps/miniconda3/envs/rdkit/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.

读入数据

In [2]:
#Read the data
train = pd.read_csv("/public/gkxiao/work/rdkit_deeplearn/bp_train.csv",sep=',')
test = pd.read_csv("/public/gkxiao/work/rdkit_deeplearn/bp_test.csv",sep=',')

预览数据

In [169]:
train.head()
Out[169]:
ID SMILES BP-Measured Type
0 1 C=O -19.1 Training
1 2 c1c2ccccc2c2ccc3cccc4ccc1c2c34 495.0 Training
2 3 OC(=O)c1cc(Cl)ccc1Cl 301.0 Training
3 4 C[C@H](N)Cc1ccccc1 203.0 Training
4 5 CCOC(N)=O 185.0 Training

计算描述符,此处没有对描述符进行归一化处理,只是目测将分子量除以500之后大体在数据分布上是一致的。建议在研究中应该进行数据归一化与特征变量的优选研究。

In [4]:
#Define a helper function to append the Molweight to the E-state indices
def fps_plus_mw(mol):
    return np.append(Fingerprinter.FingerprintMol(mol)[0],round(Descriptors.MolWt(mol),4)/500)
#Add some new columns
train['Mol'] = train['SMILES'].apply(Chem.MolFromSmiles)
train['Descriptors'] = train['Mol'].apply(fps_plus_mw)
test['Mol'] = test['SMILES'].apply(Chem.MolFromSmiles)
test['Descriptors'] = test['Mol'].apply(fps_plus_mw)

预览数据

In [172]:
train.head()
Out[172]:
ID SMILES BP-Measured Type Mol Descriptors
0 1 C=O -19.1 Training <rdkit.Chem.rdchem.Mol object at 0x7f6f8dbc93f0> [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...
1 2 c1c2ccccc2c2ccc3cccc4ccc1c2c34 495.0 Training <rdkit.Chem.rdchem.Mol object at 0x7f6f8dbc9210> [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
2 3 OC(=O)c1cc(Cl)ccc1Cl 301.0 Training <rdkit.Chem.rdchem.Mol object at 0x7f6f8dbc9030> [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
3 4 C[C@H](N)Cc1ccccc1 203.0 Training <rdkit.Chem.rdchem.Mol object at 0x7f6f8dbc9260> [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, ...
4 5 CCOC(N)=O 185.0 Training <rdkit.Chem.rdchem.Mol object at 0x7f6f8dbc9d00> [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, ...

将数据转为Numpy数组格式

In [173]:
#Convert to Numpy arrays
X_train = np.array(list(train['Descriptors']))
y_train = train['BP-Measured'].values
X_test = np.array(list(test['Descriptors']))
y_test = test['BP-Measured'].values 

建立深度学习网络

In [49]:
model = Sequential()
#model.add(Dense(output_dim=10, input_dim=X_train.shape[1]))
model.add(Dense(input_dim=X_train.shape[1],units=50))
model.add(Activation("sigmoid"))

#model.add(Dense(output_dim=1))
model.add(Dense(units=1))
model.add(Activation("linear"))

训练模型

In [50]:
model.compile(loss='mean_squared_error', optimizer=SGD(lr=0.001, momentum=0.9, nesterov=True))
 
history = model.fit(X_train, y_train, epochs=20000, batch_size=200)
y_pred = model.predict(X_test)
rms = (np.mean((y_test.reshape(-1,1) - y_pred)**2))**0.5
#s = np.std(y_test -y_pred)
print("Neural Network RMS", rms)
Epoch 1/20000
4074/4074 [==============================] - 0s 77us/step - loss: 13177.3301
Epoch 2/20000
4074/4074 [==============================] - 0s 15us/step - loss: 3416.6138
Epoch 3/20000
4074/4074 [==============================] - 0s 16us/step - loss: 2323.6583
...
4074/4074 [==============================] - 0s 13us/step - loss: 89.0846
Epoch 19999/20000
4074/4074 [==============================] - 0s 12us/step - loss: 89.4495
Epoch 20000/20000
4074/4074 [==============================] - 0s 12us/step - loss: 88.7203
Neural Network RMS 18.97270923138163

回顾训练情况

In [51]:
plt.plot(history.history["loss"], label="Loss")
plt.xlabel('Epoch')
plt.ylabel('Neural Network RMS')
plt.yscale("log")
plt.legend()
Out[51]:
<matplotlib.legend.Legend at 0x7f6f7c138ba8>

保存训练过程

In [52]:
import pickle
file=open("/public/gkxiao/work/rdkit_deeplearn/bp_keras_model_history.pkl","wb")
pickle.dump(history.history, file)
file.close
Out[52]:
<function BufferedWriter.close>

保存模型

In [53]:
#save model
model.save('/public/gkxiao/work/rdkit_deeplearn/bp_keras_model.h5')
model.save_weights('/public/gkxiao/work/rdkit_deeplearn/bp_keras_model_weights.h5')

绘制训练集与测试集的预测值与观察值散点图

In [66]:
import matplotlib.pyplot as plt
plt.scatter(y_train,model.predict(X_train), label = 'Train', c='blue')
plt.title('Neural Network Predictor')
plt.xlabel('Measured Boiling Point')
plt.ylabel('Predicted Boiling Point')
plt.scatter(y_test,model.predict(X_test),c='lightgreen', label='Test', alpha = 0.8)
plt.legend(loc=4)
plt.show()

模型质量评估

In [174]:
from sklearn.metrics import mean_squared_error,r2_score
y_pred=model.predict(X_train)
train_mse=mean_squared_error(y_train,y_pred)
train_r2 = r2_score(y_train,y_pred)
test_pred=model.predict(X_test)
test_mse=mean_squared_error(y_test,test_pred)
test_r2 = r2_score(y_test,test_pred)
print("Train set: MSE=",round(train_mse,2),"R^2=",round(train_r2,2))
print("Test set: MSE=",round(test_mse,2),"R^2=",round(test_r2,2))
Train set: MSE= 86.97 R^2= 0.99
Test set: MSE= 359.96 R^2= 0.95

预测新化合物的沸点

In [9]:
#Structure (SMILES code)  of new compound
smi = 'FC(F)(F)C=CC(F)(F)F'
#create a prediction set and calculate descriptor
predset=pd.DataFrame({'SMILES':[smi]})
predset['Mol'] = predset['SMILES'].apply(Chem.MolFromSmiles)
predset['Descriptors'] = predset['Mol'].apply(fps_plus_mw)
#load model
from keras.models import load_model
model = load_model("/public/gkxiao/work/rdkit_deeplearn/bp_keras_model.h5")
#predict boiling point
predset['Pred_BP'] = model.predict(np.array(list(predset['Descriptors'])))[0][0]
In [10]:
predset
Out[10]:
SMILES Mol Descriptors Pred_BP
0 FC(F)(F)C=CC(F)(F)F <rdkit.Chem.rdchem.Mol object at 0x7fc270b79df0> [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... 0.128204
In [11]:
print("Boiling point:", round(model.predict(np.array(list(predset['Descriptors'])))[0][0],2),"℃")
Boiling point: 0.13 ℃