切换到当前工作目录
import os
# Make the project directory the working directory so that the relative CSV
# file names passed to pd.read_csv below are resolved against it.
# NOTE(review): this is an absolute, machine-specific path -- adjust it when
# running the analysis on another machine.
os.chdir('/public/gkxiao/work/dude_plus_comparison')
其中glide(glide_score.csv)与gold数据(gold_score.csv)来源于文献[10],AutoDock Vina数据(Eberhardt2021.csv)来源于文献[11],Surflex数据(Cleves2020.csv)来源于文献[5],Dock数据(Mysinger2012.csv)来源于文献[12]
import numpy as np
import pandas as pd
# Read one result table per docking method. The generator is consumed left to
# right, so the files are read in the order listed. vina, cleves and chaput
# are loaded here but are not referenced in this section of the analysis.
fred, hybrid_s, hybrid_m, glide, gold, vina, cleves, chaput, dock = (
    pd.read_csv(csv_name)
    for csv_name in (
        'fred_score.csv',
        'hybrid_single_score.csv',
        'hybrid_m_score.csv',
        'glide_score.csv',
        'gold_score.csv',
        'Eberhardt2021.csv',
        'Cleves2020.csv',
        'chaput2016.csv',
        'Mysinger2012.csv',
    )
)

# Extract each method's logAUC column as a plain Python list.
fred_logauc, hybrid_s_logauc, hybrid_m_logauc, glide_logauc, gold_logauc = (
    scores['adjusted_logauc'].tolist()
    for scores in (fred, hybrid_s, hybrid_m, glide, gold)
)
# The DOCK table keeps its values under 'logauc' rather than 'adjusted_logauc'.
dock_logauc = dock['logauc'].tolist()

# Combine the six per-method lists into one array, one row per method in the
# order FRED, HYBRID-S, HYBRID-M, GLIDE, GOLD, DOCK (a 2-D array as long as
# all six lists have the same length).
# NOTE(review): the rows are paired purely by position -- presumably every CSV
# lists the same targets in the same order; confirm before trusting the tests.
data_logauc = np.array([
    fred_logauc,
    hybrid_s_logauc,
    hybrid_m_logauc,
    glide_logauc,
    gold_logauc,
    dock_logauc,
])
六组数据:
0: FRED; 1:HYBRID-S;2:HYBRID-M;3:GLIDE;4:GOLD;5:DOCK
首先用Friedman Test检验在三组或更多组间的同一受试对象是否具有统计学意义上的显著性差异。
如果Friedman test的p值具有统计学显著意义,我们接着进行Nemenyi post-hoc test来精确地决定到底是哪个组显著的区别于其它组。
Friedman Test uses the following null and alternative hypotheses:
The null hypothesis (H0): The distributions of all groups are identical, i.e. no group tends to rank higher or lower than the others.
The alternative hypothesis (Ha): At least one group tends to rank differently from the rest.
from scipy import stats
# Run the Friedman test on the six logAUC lists, one sample per method, passed
# in the order FRED, HYBRID-S, HYBRID-M, GLIDE, GOLD, DOCK.
stats.friedmanchisquare(
    fred_logauc,
    hybrid_s_logauc,
    hybrid_m_logauc,
    glide_logauc,
    gold_logauc,
    dock_logauc,
)
FriedmanchisquareResult(statistic=94.85093167701848, pvalue=6.418379282405478e-19)
在这个例子里,检验统计量statistic=94.851,对应的p-value=6.418e-19。因为p-value小于0.05,所以拒绝零假设H0(H0:六个方法的logAUC分布无差异)。换句话说,有充足的证据得出结论:在所比较的六个方法中,至少有一个方法的logAUC与其它方法存在统计学意义上的显著差异。
接下来用Nemenyi post-hoc test找出到底哪些方法有别于其它。
from scipy import stats
import scikit_posthocs as sp
# Nemenyi post-hoc test on the same data. data_logauc holds one row per
# method, so it is transposed to give one column per method: the function
# reads a bare array as a block design (rows = blocks, columns = groups).
sp.posthoc_nemenyi_friedman(data_logauc.transpose())
| | 0 | 1 | 2 | 3 | 4 | 5 |
---|---|---|---|---|---|---|
0 | 1.000000 | 0.900000 | 0.293288 | 0.253056 | 0.001000 | 0.106990 |
1 | 0.900000 | 1.000000 | 0.384443 | 0.336687 | 0.001000 | 0.071710 |
2 | 0.293288 | 0.384443 | 1.000000 | 0.900000 | 0.001000 | 0.001000 |
3 | 0.253056 | 0.336687 | 0.900000 | 1.000000 | 0.001000 | 0.001000 |
4 | 0.001000 | 0.001000 | 0.001000 | 0.001000 | 1.000000 | 0.012007 |
5 | 0.106990 | 0.071710 | 0.001000 | 0.001000 | 0.012007 | 1.000000 |
0: FRED; 1:HYBRID-S;2:HYBRID-M;3:GLIDE;4:GOLD;5:DOCK