Source code for several regression algorithms and some evaluation metrics


Basic setup

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error, mean_absolute_error
import time
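
The loops below expect a feature matrix x and a target vector y, which are never defined in this post. A minimal sketch of how they could be supplied, assuming a built-in sklearn dataset stands in for the original data:

# Illustrative only: the original data behind x and y is not shown in this post.
from sklearn.datasets import load_diabetes
x, y = load_diabetes(return_X_y=True)  # x: (442, 10) feature matrix, y: (442,) targets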

Evaluation metrics

Each metric is computed by the helper below, and the averaged results are printed.

n = 10000 # number of repetitions for each method
tr = 0.2 # test set fraction for train_test_split
R2scores = np.zeros((n, 4))

def scores(y_test, y_hat):
  # mean square error
  MSE = mean_squared_error(y_test, y_hat)
  # root mean square error
  RMSE = np.sqrt(MSE)
  # mean absolute error
  MAE = mean_absolute_error(y_test, y_hat)
  # mean absolute percentage error
  MAPE = mean_absolute_percentage_error(y_test, y_hat)
  # r2 score: the proportion of the variance in the dependent variable that is predictable from the independent variable(s).
  R2 = r2_score(y_test, y_hat)
  return MSE, RMSE, MAE, MAPE, R2

def print_scores(method, scores_record):
  mean_scores = scores_record.mean(axis=0)  # average over all repetitions
  print(method, 'metrics:')
  print('\tMSE:', mean_scores[0].round(3))
  print('\tRMSE:', mean_scores[1].round(3))
  print('\tMAE:', mean_scores[2].round(3))
  print('\tMAPE:', mean_scores[3].round(3))
  print('\tR2:', mean_scores[4].round(3))
  # print('\tR2:', mean_scores[4], "\tR2 std:", np.std(scores_record[:, 4], ddof=1))
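
A quick way to check the two helpers is to feed them a tiny hand-made example (the numbers are made up purely for illustration):

# Illustrative only: one row of scores from a small fabricated prediction.
y_true = np.array([3.0, 5.0, 2.5, 7.0])
y_pred = np.array([2.5, 5.0, 3.0, 8.0])
demo = np.zeros((1, 5))
demo[0, :] = scores(y_true, y_pred)
print_scores('demo', demo)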

Support Vector Regression (SVR)

def svr_predict(train_data, train_label, test_data):
  svr = SVR(kernel='rbf', C=100, gamma=0.001, epsilon=0.1)
  svr.fit(train_data, train_label)
  predict = svr.predict(test_data)
  return predict

start = time.time()

scores_record1 = np.zeros((n, 5))
for i in range(0, n):
  x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = tr)
  y_hat = svr_predict(x_train, y_train, x_test)
  ss = scores(y_test, y_hat)
  # print(y_test, '\n', y_hat)
  scores_record1[i,:] = ss[0], ss[1], ss[2], ss[3], ss[4]

end = time.time()
R2scores[:,0] = scores_record1[:,4]
print_scores('SVR', scores_record1)
print("\tRunning time:", end - start)

Multiple Linear Regression (MLR)

from sklearn.linear_model import LinearRegression

def mul_lr_predict(x_train, y_train, x_test):
  # ordinary least squares fit, then predict on the test set
  linreg = LinearRegression()
  linreg.fit(x_train, y_train)
  y_hat = linreg.predict(x_test)
  return y_hat

start = time.time()

scores_record2 = np.zeros((n, 5))
for i in range(0, n):
  x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = tr)
  y_hat = mul_lr_predict(x_train, y_train, x_test)
  ss = scores(y_test, y_hat)
  scores_record2[i,:] = ss[0], ss[1], ss[2], ss[3], ss[4]

end = time.time()
R2scores[:,1] = scores_record2[:,4]

print_scores('MLR', scores_record2)
print("\tRunning time:", end - start)

Gaussian Process Regression (GPR)

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C

def gpr_predict(x_train, y_train, x_test):
  # kernel = C(1.0, (1e-3, 1e3)) * RBF(10, (0.5, 2))
  # gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=9)
  gp = GaussianProcessRegressor(n_restarts_optimizer=9)
  gp.fit(x_train, y_train)
  means, sigmas = gp.predict(x_test, return_std=True)
  y_hat = means
  return y_hat

start = time.time()

scores_record3 = np.zeros((n, 5))
for i in range(0, n):
  x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = tr)
  y_hat = gpr_predict(x_train, y_train, x_test)
  ss = scores(y_test, y_hat)
  scores_record3[i,:] = ss[0], ss[1], ss[2], ss[3], ss[4]

end = time.time()
R2scores[:,2] = scores_record3[:,4]

print_scores('GPR', scores_record3)
print("\tRunning time:", end - start)

Random Forest Regression (RFR)

from sklearn.ensemble import RandomForestRegressor

def rfr_predict(x_train, y_train, x_test):
  regressor = RandomForestRegressor(n_estimators=60, random_state=0)
  regressor.fit(x_train, y_train)
  y_hat = regressor.predict(x_test)
  return y_hat

start = time.time()

scores_record4 = np.zeros((n, 5))
for i in range(0, n):
  x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = tr)
  y_hat = rfr_predict(x_train, y_train, x_test)
  ss = scores(y_test, y_hat)
  scores_record4[i,:] = ss[0], ss[1], ss[2], ss[3], ss[4]

end = time.time()
R2scores[:,3] = scores_record4[:,4]

print_scores('RFR', scores_record4)
print("\tRunning time:", end - start)

Comparison of the metrics across methods

Source code and the resulting plots

df = pd.DataFrame(R2scores, columns=['SVR', 'MLR', 'GPR', 'RFR'])
# print(np.std(R2scores[:, 3]))
# print(scores_record4[:,4])
print(df.describe().round(3))
plt.figure()
ax = plt.gca()
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
labels = 'SVR', 'MLR', 'GPR', 'RFR'
plt.boxplot(R2scores[:], labels = labels, showfliers=False)
plt.ylabel('R2 scores')
plt.show()
plt.figure()
ax = plt.gca()
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
methods = 'SVR', 'MLR', 'GPR', 'RFR'
# running times (in seconds) for each method, hard-coded from a previous run
run_times = [30.74, 14.74, 36.48, 753.44]
plt.bar(methods, run_times)
plt.ylabel('running time (s)')

for a, b in zip(methods, run_times):
  plt.text(a, b + 0.05, '%.2f' % b, ha='center', va='bottom', fontsize=12)
  
plt.show()