本文共 2786 字,大约阅读时间需要 9 分钟。
Sklearn提供了 VotingRegressor 与 VotingClassifier两个投票方法,两个模型的操作方式相同,采用相同参数,列表中每个模型采用Tuple的结构表示,第
一个元素代表名称,第二个元素代表模型,要保证每个模型必须拥有唯一的名称。 一些模型可能需要一些预处理,可以通过定义管道实现。
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Each estimator is a (unique_name, model) tuple; the SVC gets its inputs
# standardized via a pipeline before fitting.
models = [
    ('lr', LogisticRegression()),
    ('svm', make_pipeline(StandardScaler(), SVC())),
]
ensemble = VotingClassifier(estimators=models)
通过voting参数选择软投票或硬投票
# Soft voting averages predicted class probabilities instead of counting
# hard class-label votes (selected via the `voting` parameter).
models = [('lr', LogisticRegression()), ('svm', SVC())]
ensemble = VotingClassifier(estimators=models, voting='soft')
定义分类数据集
# test classification dataset
from sklearn.datasets import make_classification

def get_dataset(n_samples=1000, n_features=20, n_informative=15,
                n_redundant=5, random_state=2):
    """Build a synthetic binary-classification dataset.

    The generation parameters were previously hard-coded; they are now
    keyword arguments whose defaults reproduce the original dataset
    exactly, so existing ``get_dataset()`` calls are unaffected.

    Returns:
        (X, y): feature matrix and label vector from make_classification.
    """
    X, y = make_classification(n_samples=n_samples,
                               n_features=n_features,
                               n_informative=n_informative,
                               n_redundant=n_redundant,
                               random_state=random_state)
    return X, y
使用KNN模型作为基模型演示投票法,每个模型采用不同的邻居值K参数
# get a voting ensemble of models
def get_voting(k_values=(1, 3, 5, 7, 9)):
    """Build a hard-voting ensemble of KNN classifiers.

    The neighbor counts were previously hard-coded in five repetitive
    ``append`` calls; they are now a parameter whose default reproduces
    the original ensemble, so ``get_voting()`` behaves exactly as before.

    Args:
        k_values: iterable of n_neighbors settings, one base model each.

    Returns:
        VotingClassifier with voting='hard' over the KNN base models.
    """
    # Names follow the original 'knn<k>' convention and stay unique
    # because each k appears once.
    models = [('knn%d' % k, KNeighborsClassifier(n_neighbors=k))
              for k in k_values]
    return VotingClassifier(estimators=models, voting='hard')
创建一个 get_models() 函数,用于创建待评估的模型列表
# get a list of models to evaluate
def get_models():
    """Return the models to compare, keyed by display name.

    Five standalone KNN classifiers (k = 1, 3, 5, 7, 9) plus the
    hard-voting ensemble built from the same base models.
    """
    models = {'knn%d' % k: KNeighborsClassifier(n_neighbors=k)
              for k in (1, 3, 5, 7, 9)}
    models['hard_voting'] = get_voting()
    return models
evaluate_model() 函数接收一个模型实例,并以三次重复的分层 10 折交叉验证所得分数列表的形式返回评估结果。
# evaluate a given model using cross-validation
def evaluate_model(model, X, y, n_splits=10, n_repeats=3, random_state=1):
    """Score a model with repeated stratified k-fold cross-validation.

    The CV configuration was previously hard-coded; it is now exposed as
    keyword arguments whose defaults (10 splits, 3 repeats, seed 1)
    reproduce the original behavior, so existing calls are unaffected.

    Args:
        model: estimator implementing the sklearn fit/predict API.
        X, y: feature matrix and labels.
        n_splits: folds per repetition.
        n_repeats: number of CV repetitions.
        random_state: seed for reproducible fold assignment.

    Returns:
        Array of accuracy scores, one per fold per repeat.

    Raises:
        Whatever the estimator raises during fitting (error_score='raise'
        propagates failures instead of masking them as NaN scores).
    """
    cv = RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats,
                                 random_state=random_state)
    scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv,
                             n_jobs=-1, error_score='raise')
    return scores
报告每个算法的平均性能,还可以创建一个箱形图和须状图来比较每个算法的精度分数分布。
# define dataset
X, y = get_dataset()
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results = []
names = []
for name, model in models.items():
    scores = evaluate_model(model, X, y)
    results.append(scores)
    names.append(name)
    # report mean accuracy and its standard deviation per model
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))
# plot model performance for comparison
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()
>knn1 0.873 (0.030)>knn3 0.889 (0.038)>knn5 0.895 (0.031)>knn7 0.899 (0.035)>knn9 0.900 (0.033)>hard_voting 0.902 (0.034)
转载地址:http://qagwi.baihongyu.com/