"""传统分类方法全家桶 (a grab-bag of classical classification methods).

因为要对比很多的传统方法,所以写了个小的结构,可以方便地增加训练模型;内置十多种分类方法,把需要跑的模型的注释解除即可。

对于别的任务,主要是修改 read_data 和 columns,选定正确的 X 和 Y 参数即可。

除了 xgboost 比较慢,别的都是秒出结果的。
"""

import pandas as pd
from sklearn.metrics import roc_auc_score


def read_data(train_file):
    """Load the train / internal-test / external-test feature tables.

    The three CSV files share a naming scheme: the internal and external
    paths are derived from *train_file* by swapping the file-name stem.
    Rows whose 'StudyID' value is the literal string 'Index' (a header
    artifact) are dropped from every table.

    Returns a (train, internal, external) tuple of DataFrames.
    """
    stems = ("TrainSelectFeature", "InternalSelectFeature", "ExternalSelectFeature")
    frames = []
    for stem in stems:
        df = pd.read_csv(train_file.replace("TrainSelectFeature", stem))
        frames.append(df[df['StudyID'] != 'Index'])
    return tuple(frames)


def _split_xy(data):
    # Features are every column except the sample ID and the binary label.
    return data.drop(columns=['StudyID', 'ClassifyValue']), data['ClassifyValue']


def train_model(model_class, train_data, internal_data, external_data, **kwargs):
    """Instantiate ``model_class(**kwargs)``, fit it, and print AUC scores.

    Each DataFrame must contain a 'StudyID' column, a binary
    'ClassifyValue' label column, and the feature columns.  The fitted
    model's ``predict_proba`` is scored with ROC AUC on the train /
    internal / external splits and the three scores are printed
    space-separated in that order.

    The estimator must expose the sklearn-style ``fit`` /
    ``predict_proba`` API (so e.g. SVC needs ``probability=True``).

    Returns the fitted model.
    """
    X_train, y_train = _split_xy(train_data)

    model = model_class(**kwargs)
    model.fit(X_train, y_train)

    auc_scores = {}
    for name, data in (('train', train_data), ('internal', internal_data),
                       ('external', external_data)):
        X, y = _split_xy(data)
        # Probability of the positive class (column 1) feeds the AUC.
        y_pred_proba = model.predict_proba(X)[:, 1]
        auc_scores[name] = roc_auc_score(y, y_pred_proba)

    # Print AUC scores
    print(
        f"{auc_scores['train']} {auc_scores['internal']} {auc_scores['external']}")

    return model


from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier, \
    ExtraTreesClassifier
import xgboost as xgb
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Run the comparison for every feature series.  Each commented-out call
# below is a ready-made classifier: uncomment a line to add that model
# to the run (all are near-instant except xgboost).
for tag in ['A40', 'A70', 'AD', 'AZ', 'V40', 'V70', 'VD', 'VZ']:
    train_data, internal_data, external_data = read_data(f'~/PycharmProjects/swinUMamba/DLTextbook/datasets/raw/rxa_rad/TrainSelectFeature_{tag}.csv')

    # svm_model = train_model(SVC, train_data, internal_data, external_data, probability=True, random_state=42)
    rf_model = train_model(
        RandomForestClassifier, train_data, internal_data, external_data,
        n_estimators=100,  # number of trees in the forest
        random_state=42,   # fixed seed for reproducibility
    )
    # xgb_model = train_model(xgb.XGBClassifier, train_data, internal_data, external_data, eval_metric="logloss",
    #                         random_state=42)
    # lr_model = train_model(LogisticRegression, train_data, internal_data, external_data, random_state=42)
    # knn_model = train_model(KNeighborsClassifier, train_data, internal_data, external_data, n_neighbors=5)
    # dt_model = train_model(DecisionTreeClassifier, train_data, internal_data, external_data, random_state=42)
    # ada_model = train_model(AdaBoostClassifier, train_data, internal_data, external_data, n_estimators=50,
    #                         random_state=42)
    # gb_model = train_model(GradientBoostingClassifier, train_data, internal_data, external_data, n_estimators=100,
    #                        random_state=42)
    # mlp_model = train_model(MLPClassifier, train_data, internal_data, external_data, random_state=42, max_iter=5000)
    # bagging_model = train_model(BaggingClassifier, train_data, internal_data, external_data, random_state=42)
    # et_model = train_model(ExtraTreesClassifier, train_data, internal_data, external_data, n_estimators=100,
    #                        random_state=42)
    # ada_boost_model = train_model(AdaBoostClassifier, train_data, internal_data, external_data, n_estimators=50,
    #                               random_state=42)
    # ridge_model = train_model(RidgeClassifier, train_data, internal_data, external_data, random_state=42)
    # lda_model = train_model(LinearDiscriminantAnalysis, train_data, internal_data, external_data)

# 发表评论 ("post a comment" — leftover footer from the web page this script was copied from)