# Boilerplate setup: imports, a fixed RNG seed for reproducibility, and
# default plot styling.
from __future__ import division, print_function, unicode_literals
import numpy as np
import os
np.random.seed(42)
# NOTE: IPython magic — this line only runs inside a Jupyter/IPython session.
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
# Proxy setup: route all new socket connections through a local SOCKS5 proxy
# (needed here so the MNIST download below can reach the outside network).
import socket
import socks
SOCKS5_PROXY_HOST = '127.0.0.1'
SOCKS5_PROXY_PORT = 1086
# Keep a reference to the original socket class so it could be restored later.
default_socket = socket.socket
socks.set_default_proxy(socks.SOCKS5, SOCKS5_PROXY_HOST, SOCKS5_PROXY_PORT)
# Monkey-patch the global socket class: every connection now uses the proxy.
socket.socket = socks.socksocket
# Fetch MNIST: try mldata.org first; if that fails (the service is often
# unreachable), fall back to a mirrored .mat copy of the same dataset.
# FIX: the notebook export had stripped all indentation from the
# try/except, `with`, and dict bodies, which is a SyntaxError in Python.
from six.moves import urllib
from sklearn.datasets import fetch_mldata
try:
    mnist = fetch_mldata('MNIST original')
except urllib.error.HTTPError as ex:
    print("Could not download MNIST data from mldata.org, trying alternative...")
    from scipy.io import loadmat
    mnist_alternative_url = "https://github.com/amplab/datascience-sp14/raw/master/lab7/mldata/mnist-original.mat"
    mnist_path = "./mnist-original.mat"
    response = urllib.request.urlopen(mnist_alternative_url)
    with open(mnist_path, "wb") as f:
        content = response.read()
        f.write(content)
    mnist_raw = loadmat(mnist_path)
    # Mimic the dict layout that fetch_mldata would have returned, so the
    # rest of the script can index mnist["data"] / mnist["target"] either way.
    mnist = {
        "data": mnist_raw["data"].T,
        "target": mnist_raw["label"][0],
        "COL_NAMES": ["label", "data"],
        "DESCR": "mldata.org dataset: mnist-original",
    }
    print("Success!")
K 折交叉验证：把训练集分成 K 折，每轮用其中一折做验证（预测），用其余 K-1 折训练模型。
X, y = mnist["data"], mnist["target"]
# MNIST comes pre-split: the first 60000 samples are the training set.
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]
# Shuffle the training set so cross-validation folds are not order-biased
# (the raw data is sorted by digit).
shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]
# Data preprocessing done.
# Binary target for a "5-detector": True where the digit is a 5.
y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(max_iter=5, random_state=42)
sgd_clf.fit(X_train, y_train_5)
from sklearn.model_selection import cross_val_score
# 3-fold cross-validated accuracy of the 5-detector.
cross_val_score(sgd_clf, X_train, y_train_5, cv = 3, scoring = 'accuracy')
$$\begin{bmatrix} TN &FP\\ FN &TP \end{bmatrix}$$
引入一个新的概念，叫做精确率（precision，也译作查准率；注意它不同于准确率 accuracy）：
$$precision = \frac{TP}{TP+FP}$$
同时还有一个叫做召回率（recall），或者真正例率（true positive rate）：
$$recall = \frac{TP}{TP+FN}$$
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
# FIX: `confusion_matrix` was imported twice in the original cell; the
# duplicate import has been removed and the metric imports grouped here.
# Predict on the training set via cross-validation, so every prediction
# comes from a model that never saw that sample during its own training.
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv = 3)
# Passing the true labels and the predictions yields a confusion matrix.
confusion_matrix(y_train_5, y_train_pred)
# Precision: of the samples predicted positive, the fraction that truly are.
print(precision_score(y_train_5, y_train_pred))
# Recall: of the actual positives, the fraction that were detected.
print(recall_score(y_train_5, y_train_pred))
通常把精确率和召回率结合成一个指标会更方便，这个指标就是 F1 值（两者的调和平均数）：
$$F_1=\frac{2}{\frac{1}{precision}+\frac{1}{recall}}=2\cdot\frac{precision \cdot recall}{precision+recall}=\frac{TP}{TP+\frac{FN+FP}{2}}$$
# Predictions can be tuned via a decision threshold — so how do we pick one?
# Use cross_val_predict again, but this time request the raw decision score
# of every sample instead of the hard class prediction.
y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv = 3, method = "decision_function")
print(y_scores)
# With these scores, precision_recall_curve computes precision and recall
# for every possible threshold.
from sklearn.metrics import precision_recall_curve
precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)
# Now plot the precision-vs-recall curve.
# FIX: the function body had lost its indentation in the notebook export.
def plot_precision_vs_recall(precisions, recalls):
    """Plot precision (y-axis) against recall (x-axis) on the current figure."""
    plt.plot(recalls, precisions, 'r:', linewidth = 2)
    plt.xlabel('Recall', fontsize = 16)
    plt.ylabel('Precision', fontsize = 16)
    plt.axis([0, 1, 0, 1])
plt.figure(figsize = (8, 6))
plot_precision_vs_recall(precisions, recalls)
可以发现精确率在 recall ≈ 0.8 处急剧下降，因此应在下降之前选择一个折衷点，比如 recall ≈ 0.6 的位置。
# Finally, plot precision and recall as functions of the decision threshold.
# FIX: the function body had lost its indentation in the notebook export.
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall curves against the threshold values.

    precision_recall_curve returns len(thresholds) + 1 precision/recall
    values, hence the [:-1] slices to align the arrays with `thresholds`.
    """
    plt.plot(thresholds, precisions[:-1], 'b--', label = 'Precision')
    plt.plot(thresholds, recalls[:-1], 'g:', label = 'Recall')
    plt.xlabel('Threshold')
    plt.legend(loc = 'upper left')
    plt.ylim([0, 1])
plt.figure(figsize = (8, 6))
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
# The threshold to use depends on the project's requirements.
# NOTE(review): the `_90` name suggests a ~90%-precision operating point, but
# -1000 is an arbitrary cut-off on the decision scores — confirm it actually
# yields the intended precision (e.g. by reading it off the curves above).
y_train_pred_90 = (y_scores > -1000)
a = precision_score(y_train_5, y_train_pred_90)
b = recall_score(y_train_5, y_train_pred_90)
print(a)
print(b)
ROC 曲线非常类似于 精确率/召回率 曲线，但它画的不是精确率对召回率，而是真正例率（TPR）对假正例率（FPR）的曲线。
再次观察混淆矩阵:
$$\begin{bmatrix} TN &FP\\ FN &TP \end{bmatrix}$$
ROC 曲线以 $FPR$ 为横轴、$TPR$（即 recall）为纵轴，其中 $FPR = 1 - TNR$：
$$TPR=recall=\frac{TP}{TP+FN}$$
$$TNR=\frac{TN}{FP+TN}$$
$$FPR= \frac{FP}{FP+TN}$$
不难理解了。
# To plot the ROC curve, first compute fpr and tpr with roc_curve.
# FIX: the function body had lost its indentation in the notebook export.
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)
def plot_roc_curve(fpr, tpr, label=None):
    """Plot an ROC curve plus the diagonal of a purely random classifier."""
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')  # diagonal: random-guess baseline
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
plot_roc_curve(fpr, tpr)
plt.show()
# The trade-off reappears: as recall (tpr) rises, fpr rises too; a good ROC
# curve stays far from the diagonal.
# One way to compare ROC curves is the area under the curve (AUC): a perfect
# classifier approaches 1, a purely random one scores 0.5.
from sklearn.metrics import roc_auc_score
roc_auc_score(y_train_5, y_scores)
创建一个可以将图片分成 10 类的系统的方法是：
sklearn 会自动探测出你想使用一个二分类器去完成多分类任务，并自动执行 OvA（one-vs-all）策略；只有 SVM 分类器例外，它将使用 OvO（one-vs-one）。
# Multiclass: fit the SGD classifier on the full 10-class labels; sklearn
# automatically wraps the binary classifier in an OvA scheme.
# FIX: the original called sgd_clf.predict([some_digit]) BEFORE defining
# some_digit, which raises a NameError — define the sample digit first.
some_digit = X[36000]
sgd_clf.fit(X_train, y_train)
sgd_clf.predict([some_digit])
# decision_function now returns one score per class (highest wins).
some_digit_scores = sgd_clf.decision_function([some_digit])
print(some_digit_scores)
# The OvO strategy can also be forced explicitly.
from sklearn.multiclass import OneVsOneClassifier
ovo_clf = OneVsOneClassifier(SGDClassifier(random_state = 42, max_iter=5, tol=None))
ovo_clf.fit(X_train, y_train)
print(ovo_clf.predict([some_digit]))
len(ovo_clf.estimators_)  # 45 = 10*9/2 pairwise binary classifiers