Question

我需要评估不同的分类模型的预测效果。为此,我需要绘制 ROC 曲线,但我一直没能找到可行的办法。

我附上了完整的 Python 代码以及所用数据集的链接。代码看起来很多,但实际上很简单。我遇到的主要问题是:我得到的是一个 3x3 的混淆矩阵,不知道如何据此绘制 ROC 曲线。

非常感谢任何帮助。

数据集:

https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
import seaborn as sns
import numpy as np

# Load the wine-quality CSV. The usecols filter keeps only columns whose name
# does not contain 'Unnamed' (presumably a saved index column - verify against
# the CSV). Swap the commented line in to run on the red-wine file instead.
#data = pd.read_csv('wineQualityReds.csv', usecols=lambda x: 'Unnamed' not in x)
data = pd.read_csv('wineQualityWhites.csv', usecols=lambda x: 'Unnamed' not in x)

# roc curve and auc score
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

def plot_roc_curve(fpr, tpr):
    """Plot a single ROC curve together with the chance diagonal.

    Args:
        fpr: False positive rates (x-axis values).
        tpr: True positive rates aligned with ``fpr`` (y-axis values).

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    plt.plot(fpr, tpr, color='orange', label='ROC')
    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend()
    plt.show()

# Bucket the raw quality score into three ordinal classes. With
# include_lowest=True the intervals are: 0 -> [1, 4], 1 -> (4, 6], 2 -> (6, 10].
bins = [1, 4, 6, 10]

quality_labels = [0, 1, 2]

data['quality_categorial'] = pd.cut(data['quality'], bins=bins, labels=quality_labels, include_lowest=True)

# NOTE: display() is not imported in this file; it is assumed to come from the
# notebook environment the script is run in.
display(data.head(n=2))

# Target is the binned class; features are every remaining column.
quality_raw = data['quality_categorial']
features_raw = data.drop(['quality', 'quality_categorial'], axis=1)

from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(features_raw, quality_raw, test_size=0.2, random_state=0)

from sklearn.metrics import fbeta_score
from sklearn.metrics import accuracy_score

def train_predict_evaluate(learner, sample_size, X_train, y_train, X_test, y_test):
    """Fit a classifier on a training subset, score it and plot its confusion matrix.

    Args:
        learner: Estimator exposing ``fit`` and ``predict``.
        sample_size: Number of leading training rows used for fitting.
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.

    Returns:
        dict with keys ``'acc_train'``, ``'acc_test'``, ``'f_train'`` and
        ``'f_test'``. The training metrics are computed on the first 300
        training rows only; the test metrics use the full test set.
    """
    results = {}

    #start = time()
    learner = learner.fit(X_train[:sample_size], y_train[:sample_size])
    #end = time()

    #results['train_time'] = end - start

    #start = time()
    predictions_train = learner.predict(X_train[:300])
    predictions_test = learner.predict(X_test)

    #end = time()

    #results['pred_time'] = end - start

    results['acc_train'] = accuracy_score(y_train[:300], predictions_train)

    results['acc_test'] = accuracy_score(y_test, predictions_test)

    results['f_train'] = fbeta_score(y_train[:300], predictions_train, beta=0.5, average='micro')

    results['f_test'] = fbeta_score(y_test, predictions_test, beta=0.5, average='micro')

    #####################
    # Use the union of true and predicted labels for both the matrix and its
    # axis names, so the DataFrame shape always matches the matrix even when a
    # predicted class never occurs in y_test.
    class_labels = unique_labels(y_test, predictions_test)
    cm = confusion_matrix(y_test, predictions_test, labels=class_labels)
    print(cm)

    df_cm = pd.DataFrame(cm, columns=class_labels, index=class_labels)
    df_cm.index.name = 'Actual'
    df_cm.columns.name = 'Predicted'

    plt.figure(figsize=(10, 7))
    sns.set(font_scale=1.4)  # for label size
    sns.heatmap(df_cm, cmap="Blues", annot=True, fmt='g', annot_kws={"size": 16})  # font size

    #######################

    print(predictions_test)
    # NOTE: the ROC/AUC attempt below is disabled; `probs` is never computed in
    # this function (hard predictions from predict() are not scores).
    #auc = roc_auc_score(y_test, probs)
    #print('AUC: %.2f' % auc)

    #fpr, tpr, thresholds = roc_curve(y_test, probs)
    #plot_roc_curve(fpr, tpr)

    print("{} trained on {} samples." .format(learner.__class__.__name__, sample_size))

    return results

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# The three classifiers to compare.
clf_A = GaussianNB()
clf_B = DecisionTreeClassifier(max_depth=None, random_state=None)
clf_C = RandomForestClassifier(max_depth=None, random_state=None)

# Training-set sizes: 100%, 10% and 1% of the available training rows.
samples_100 = len(y_train)
samples_10 = int(len(y_train)*10/100)
samples_1 = int(len(y_train)*1/100)

# results[classifier name][size index] -> metrics dict from train_predict_evaluate.
results = {}
for clf in [clf_A, clf_B, clf_C]:
    clf_name = clf.__class__.__name__
    results[clf_name] = {}
    for i, samples in enumerate([samples_1, samples_10, samples_100]):
        results[clf_name][i] = train_predict_evaluate(clf, samples, X_train, y_train, X_test, y_test)

# Extra run of the random forest on the full training set; return value is discarded.
train_predict_evaluate(clf_C, samples_100, X_train, y_train, X_test, y_test)

Answer 1

我们不能直接从混淆矩阵计算 ROC 曲线,因为 AUC-ROC 曲线是衡量分类问题在不同阈值设置下性能的指标。

下面的代码对我有效:

def plot_roc(model, X_test, y_test):
    """Plot ROC curve(s) for a fitted probabilistic classifier.

    For a two-class model a single curve is drawn from the scores of the second
    class. For a model with more than two classes one one-vs-rest curve is
    drawn per class, since ``roc_curve`` itself only accepts binary targets.

    Args:
        model: Fitted classifier exposing ``predict_proba`` and ``classes_``.
        X_test: Test features.
        y_test: True test labels.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    # Imported locally so the function does not depend on a module-level
    # `metrics` name being in scope.
    from sklearn.metrics import auc, roc_curve
    from sklearn.preprocessing import label_binarize

    # calculate the fpr and tpr for all thresholds of the classification
    probabilities = model.predict_proba(np.array(X_test))
    classes = model.classes_

    plt.title('Receiver Operating Characteristic')
    if len(classes) == 2:
        predictions = probabilities[:, 1]
        fpr, tpr, threshold = roc_curve(y_test, predictions)
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
    else:
        # One-vs-rest: column i of the indicator matrix marks membership of
        # classes[i], matching column i of predict_proba.
        y_indicator = label_binarize(y_test, classes=classes)
        for i, cls in enumerate(classes):
            fpr, tpr, threshold = roc_curve(y_indicator[:, i], probabilities[:, i])
            roc_auc = auc(fpr, tpr)
            plt.plot(fpr, tpr, label='class %s (AUC = %0.2f)' % (cls, roc_auc))
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

友情链接