Because I needed to compare a lot of traditional methods, I wrote a small harness that makes it easy to add models to the run. A dozen-plus classifiers are built in; to train one, just uncomment its line in the loop at the bottom.
For other tasks, the main changes are read_data and the column names: pick the correct X and Y columns (see the sketch after read_data below).
Apart from XGBoost, which is slower, every model returns its results in seconds.
import pandas as pd
from sklearn.metrics import roc_auc_score
def read_data(train_file):
    # Read the three splits: training, internal test, and external test
    TrainSelectFeature = pd.read_csv(train_file)
    InternalSelectFeature = pd.read_csv(train_file.replace("TrainSelectFeature", "InternalSelectFeature"))
    ExternalSelectFeature = pd.read_csv(train_file.replace("TrainSelectFeature", "ExternalSelectFeature"))
    # Drop rows whose StudyID is the literal string 'Index' (a metadata row in these CSVs)
    TrainSelectFeature = TrainSelectFeature[TrainSelectFeature['StudyID'] != 'Index']
    InternalSelectFeature = InternalSelectFeature[InternalSelectFeature['StudyID'] != 'Index']
    ExternalSelectFeature = ExternalSelectFeature[ExternalSelectFeature['StudyID'] != 'Index']
    return TrainSelectFeature, InternalSelectFeature, ExternalSelectFeature
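# For another task (as noted above), read_data and the column names are the only
# things to change. A minimal commented sketch, with hypothetical file names: keep
# the three-way split, and make sure the ID/label columns ('StudyID' and
# 'ClassifyValue' here) match whatever train_model drops and selects below.
#
# def read_data_other(train_file):
#     train = pd.read_csv(train_file)
#     internal = pd.read_csv(train_file.replace("Train", "Internal"))
#     external = pd.read_csv(train_file.replace("Train", "External"))
#     return train, internal, external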
def train_model(model_class, train_data, internal_data, external_data, **kwargs):
    # X = all feature columns; y = the binary label 'ClassifyValue'
    X_train = train_data.drop(columns=['StudyID', 'ClassifyValue'])
    y_train = train_data['ClassifyValue']
    X_internal = internal_data.drop(columns=['StudyID', 'ClassifyValue'])
    y_internal = internal_data['ClassifyValue']
    X_external = external_data.drop(columns=['StudyID', 'ClassifyValue'])
    y_external = external_data['ClassifyValue']
    model = model_class(**kwargs)
    model.fit(X_train, y_train)
    auc_scores = {}
    for dataset, X, y in [('train', X_train, y_train), ('internal', X_internal, y_internal),
                          ('external', X_external, y_external)]:
        # Some classifiers (e.g. RidgeClassifier) have no predict_proba;
        # roc_auc_score also accepts decision scores, so fall back to decision_function
        if hasattr(model, 'predict_proba'):
            y_score = model.predict_proba(X)[:, 1]
        else:
            y_score = model.decision_function(X)
        auc_scores[dataset] = roc_auc_score(y, y_score)
    # Print train / internal / external AUC on one line per model
    print(f"{auc_scores['train']} {auc_scores['internal']} {auc_scores['external']}")
    return model
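# Adding a classifier beyond the built-in list is one import plus one call, as long
# as it implements predict_proba or decision_function. A sketch with GaussianNB
# (not in the original list; the call belongs inside the loop below):
#
# from sklearn.naive_bayes import GaussianNB
# nb_model = train_model(GaussianNB, train_data, internal_data, external_data)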
from sklearn.svm import SVC
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier,
                              BaggingClassifier, ExtraTreesClassifier)
import xgboost as xgb
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
for series in ['A40', 'A70', 'AD', 'AZ', 'V40', 'V70', 'VD', 'VZ']:
    train_data, internal_data, external_data = read_data(
        f'~/PycharmProjects/swinUMamba/DLTextbook/datasets/raw/rxa_rad/TrainSelectFeature_{series}.csv')
# svm_model = train_model(SVC, train_data, internal_data, external_data, probability=True, random_state=42)
rf_model = train_model(RandomForestClassifier, train_data, internal_data, external_data,
n_estimators=100, # Set the number of trees
random_state=42 # Set random state for reproducibility
)
# xgb_model = train_model(xgb.XGBClassifier, train_data, internal_data, external_data, eval_metric="logloss",
# random_state=42)
# lr_model = train_model(LogisticRegression, train_data, internal_data, external_data, random_state=42)
# knn_model = train_model(KNeighborsClassifier, train_data, internal_data, external_data, n_neighbors=5)
# dt_model = train_model(DecisionTreeClassifier, train_data, internal_data, external_data, random_state=42)
# ada_model = train_model(AdaBoostClassifier, train_data, internal_data, external_data, n_estimators=50,
# random_state=42)
# gb_model = train_model(GradientBoostingClassifier, train_data, internal_data, external_data, n_estimators=100,
# random_state=42)
# mlp_model = train_model(MLPClassifier, train_data, internal_data, external_data, random_state=42, max_iter=5000)
# bagging_model = train_model(BaggingClassifier, train_data, internal_data, external_data, random_state=42)
# et_model = train_model(ExtraTreesClassifier, train_data, internal_data, external_data, n_estimators=100,
# random_state=42)
# ridge_model = train_model(RidgeClassifier, train_data, internal_data, external_data, random_state=42)
# lda_model = train_model(LinearDiscriminantAnalysis, train_data, internal_data, external_data)
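# A hypothetical variant (a sketch, not part of the script above): if train_model is
# changed to `return model, auc_scores`, the per-series AUCs can be collected into a
# table instead of read off stdout:
#
# rows = []
# for series in ['A40', 'A70', 'AD', 'AZ', 'V40', 'V70', 'VD', 'VZ']:
#     train_data, internal_data, external_data = read_data(...)  # same path as above
#     _, scores = train_model(RandomForestClassifier, train_data, internal_data,
#                             external_data, n_estimators=100, random_state=42)
#     rows.append({'series': series, **scores})
# print(pd.DataFrame(rows))  # one row per series: train / internal / external AUC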