{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "**Chapter 3 – Classification**" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# To support both python 2 and python 3\n", "from __future__ import division, print_function, unicode_literals\n", "\n", "# Common imports\n", "import numpy as np\n", "import os\n", "\n", "# to make this notebook's output stable across runs\n", "np.random.seed(42)\n", "\n", "# To plot pretty figures\n", "%matplotlib inline\n", "import matplotlib as mpl\n", "import matplotlib.pyplot as plt\n", "mpl.rc('axes', labelsize=14)\n", "mpl.rc('xtick', labelsize=12)\n", "mpl.rc('ytick', labelsize=12)\n", "\n", "# Where to save the figures\n", "PROJECT_ROOT_DIR = \".\"\n", "CHAPTER_ID = \"classification\"\n", "\n", "def save_fig(fig_id, tight_layout=True):\n", " path = os.path.join(PROJECT_ROOT_DIR, \"images\", CHAPTER_ID, fig_id + \".png\")\n", " print(\"Saving figure\", fig_id)\n", " if tight_layout:\n", " plt.tight_layout()\n", " plt.savefig(path, format='png', dpi=300)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# MNIST" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Download and Read the data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#To download the data set\n", "from sklearn.datasets import fetch_openml\n", "import sklearn\n", "import numpy as np\n", "def sort_by_target(mnist):\n", " reorder_train = np.array(sorted([(target, i) for i, target in enumerate(mnist.target[:60000])]))[:, 1]\n", " reorder_test = np.array(sorted([(target, i) for i, target in enumerate(mnist.target[60000:])]))[:, 1]\n", " mnist.data[:60000] = mnist.data[reorder_train]\n", " mnist.target[:60000] = mnist.target[reorder_train]\n", " mnist.data[60000:] = mnist.data[reorder_test + 60000]\n", " mnist.target[60000:] = mnist.target[reorder_test + 60000]\n", "print('The scikit-learn version is {}.'.format(sklearn.__version__))\n", "#If scikit-learn version is <0.20, please use conda update scikit-learn to update " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Reading Data\n", "mnist = fetch_openml('mnist_784',version = 1, cache = True)\n", "mnist.target = mnist.target.astype(np.int8)\n", "sort_by_target(mnist)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "mnist.data.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X, y = mnist[\"data\"], mnist[\"target\"]\n", "X.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "some_digit = X[36000]\n", "some_digit_image = some_digit.reshape(28, 28)\n", "plt.imshow(some_digit_image, cmap = mpl.cm.binary,\n", " interpolation=\"nearest\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "shuffle_index = np.random.permutation(60000)\n", "X_train, y_train = mnist[\"data\"][:60000], mnist[\"target\"][:60000]\n", "X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Train a binary classifier:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "y_train_5 = (y_train == 5)\n", "y_test_5 = (y_test == 5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import SGDClassifier\n", "sgd_clf = SGDClassifier(max_iter=5, tol=-np.infty, random_state=42)\n", "#https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html\n", "sgd_clf.fit(X_train, y_train_5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sgd_clf.predict([some_digit])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Performance Measures:" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Measuring Accuracy Using Cross-Validation:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import cross_val_score\n", "cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring=\"accuracy\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Implementing corss-validation:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import StratifiedKFold\n", "from sklearn.base import clone\n", "\n", "skfolds = StratifiedKFold(n_splits=3, random_state=42)\n", "\n", "for train_index, test_index in skfolds.split(X_train, y_train_5):\n", " clone_clf = clone(sgd_clf)\n", " X_train_folds = X_train[train_index]\n", " y_train_folds = (y_train_5[train_index])\n", " X_test_fold = X_train[test_index]\n", " y_test_fold = (y_train_5[test_index])\n", "\n", " clone_clf.fit(X_train_folds, y_train_folds)\n", " y_pred = clone_clf.predict(X_test_fold)\n", " n_correct = sum(y_pred == y_test_fold)\n", " print(n_correct / len(y_pred))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Check another naive classifier:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.base import BaseEstimator\n", "class Never5Classifier(BaseEstimator):\n", " def fit(self, X, y=None):\n", " pass\n", " def predict(self, X):\n", " return np.zeros((len(X), 1), dtype=bool)\n", "never_5_clf = Never5Classifier()\n", "cross_val_score(never_5_clf, X_train, y_train_5, cv=3, scoring=\"accuracy\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Confusion Matrix: Count the number of times instances of class A are classfied as class B" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import cross_val_predict\n", "\n", "y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)\n", "#cross_val_predict perform K fold cross-validation and returns the predictions made on each test fold\n", "len(y_train_pred)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import confusion_matrix\n", "\n", "confusion_matrix(y_train_5, y_train_pred)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "y_train_perfect_predictions = y_train_5\n", "confusion_matrix(y_train_5, y_train_perfect_predictions)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Precision: TP/(TP+FP), Recall : TP/(TP+FN)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import precision_score, recall_score\n", "\n", "precision_score(y_train_5, y_train_pred)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "recall_score(y_train_5, y_train_pred)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "F_1 score: harmonic mean of precision and recall: 2/(1/pre+1/rec)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import f1_score\n", "f1_score(y_train_5, y_train_pred)" ] }, { "attachments": { "Screenshot%20from%202019-01-27%2017-27-36.png": { "image/png": "" } }, "cell_type": "markdown", "metadata": {}, "source": [ "Precision/Recall Tradeoff:![Screenshot%20from%202019-01-27%2017-27-36.png](attachment:Screenshot%20from%202019-01-27%2017-27-36.png)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "y_scores = sgd_clf.decision_function([some_digit])\n", "y_scores" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "threshold = 200000\n", "y_some_digit_pred = (y_scores > threshold)\n", "y_some_digit_pred" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Decide which Threshold to use: " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3,\n", " method=\"decision_function\")\n", "#by setting method = \"decision_function\", cross_val_predict returns the decision scores" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#For all possible threshods:\n", "from sklearn.metrics import precision_recall_curve\n", "#https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_curve.html\n", "precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):\n", " plt.plot(thresholds, precisions[:-1], \"b--\", label=\"Precision\", linewidth=2)\n", " plt.plot(thresholds, recalls[:-1], \"g-\", label=\"Recall\", linewidth=2)\n", " plt.xlabel(\"Threshold\", fontsize=16)\n", " plt.legend(loc=\"upper left\", fontsize=16)\n", " plt.ylim([0, 1])\n", "\n", "plt.figure(figsize=(8, 4))\n", "plot_precision_recall_vs_threshold(precisions, recalls, thresholds)\n", "plt.xlim([-700000, 700000])\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "95% precision classfier:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "y_train_pred_90 = (y_scores > 70000)\n", "precision_score(y_train_5, y_train_pred_90)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.plot(recalls, precisions)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Another measurement - receiver operating characteristic (ROC): recall/1-specificity , specificity: true negative rate, 1-specificity: false negative rate" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import roc_curve\n", "\n", "fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def plot_roc_curve(fpr, tpr, label=None):\n", " plt.plot(fpr, tpr, linewidth=2, label=label)\n", " plt.plot([0, 1], [0, 1], 'k--')\n", " plt.axis([0, 1, 0, 1])\n", " plt.xlabel('False Positive Rate', fontsize=16)\n", " plt.ylabel('True Positive Rate', fontsize=16)\n", "\n", "plt.figure(figsize=(8, 6))\n", "plot_roc_curve(fpr, tpr)\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import roc_auc_score\n", "#area under the curve \n", "roc_auc_score(y_train_5, y_scores)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Another classfier: RandomForestClassifier()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.ensemble import RandomForestClassifier\n", "forest_clf = RandomForestClassifier(n_estimators=10, random_state=42)\n", "y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv=3,\n", " method=\"predict_proba\")\n", "#predict_proba method returns an array containing a row per instance and a column per class, \n", "#each containing the probability that the given instance belongs to the given class" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "y_probas_forest" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "y_scores_forest = y_probas_forest[:, 1] # score = proba of positive class\n", "fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5,y_scores_forest)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(8, 6))\n", "plt.plot(fpr, tpr, \"b:\", linewidth=2, label=\"SGD\")\n", "plot_roc_curve(fpr_forest, tpr_forest, \"Random Forest\")\n", "plt.legend(loc=\"lower right\", fontsize=16)\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "roc_auc_score(y_train_5, y_scores_forest)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Multiclass Classification: " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "one to all strategy : train 10 binary classifiers. " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sgd_clf.fit(X_train, y_train)\n", "sgd_clf.predict([some_digit])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "some_digit_scores = sgd_clf.decision_function([some_digit]) #10 scores, one per class\n", "some_digit_scores" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "np.argmax(some_digit_scores)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "one to one strategy: for each pair of digits, one classifier" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.multiclass import OneVsOneClassifier\n", "ovo_clf = OneVsOneClassifier(SGDClassifier(max_iter=5, tol=-np.infty, random_state=42))\n", "ovo_clf.fit(X_train, y_train)\n", "ovo_clf.predict([some_digit])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "len(ovo_clf.estimators_)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "some_digit_scores = ovo_clf.decision_function([some_digit])\n", "some_digit_scores" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#RandomForestClassifier:\n", "forest_clf.fit(X_train, y_train)\n", "forest_clf.predict([some_digit])\n", "forest_clf.predict_proba([some_digit])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "cross-validation" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring=\"accuracy\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Better\n", "from sklearn.preprocessing import StandardScaler\n", "scaler = StandardScaler()\n", "X_train_scaled = scaler.fit_transform(X_train.astype(np.float64)) #scaling the inputs\n", "cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring=\"accuracy\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Error Analysis" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "y_train_pred = cross_val_predict(sgd_clf, X_train_scaled, y_train, cv=3)\n", "conf_mx = confusion_matrix(y_train, y_train_pred)\n", "conf_mx" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.matshow(conf_mx, cmap=plt.cm.gray)\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "row_sums = conf_mx.sum(axis=1, keepdims=True)\n", "norm_conf_mx = conf_mx / row_sums\n", "np.fill_diagonal(norm_conf_mx, 0)\n", "plt.matshow(norm_conf_mx, cmap=plt.cm.gray)\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Analyse individual errors:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cl_a, cl_b = 3, 5\n", "# GN: rows with True value are selected\n", "X_aa = X_train[(y_train == cl_a) & (y_train_pred == cl_a)] #GN: '3' classified as 3\n", "X_ab = X_train[(y_train == cl_a) & (y_train_pred == cl_b)] #GN: '3' classified as 5\n", "X_ba = X_train[(y_train == cl_b) & (y_train_pred == cl_a)] #GN: '5' classified as 3\n", "X_bb = X_train[(y_train == cl_b) & (y_train_pred == cl_b)] #GN: '5' classified as 5\n", "\n", "def plot_digits(instances, images_per_row=10, **options):\n", " size = 28\n", " images_per_row = min(len(instances), images_per_row)\n", " images = [instance.reshape(size,size) for instance in instances]\n", " n_rows = (len(instances) - 1) // images_per_row + 1\n", " row_images = []\n", " n_empty = n_rows * images_per_row - len(instances)\n", " images.append(np.zeros((size, size * n_empty)))\n", " for row in range(n_rows):\n", " rimages = images[row * images_per_row : (row + 1) * images_per_row]\n", " row_images.append(np.concatenate(rimages, axis=1))\n", " image = np.concatenate(row_images, axis=0)\n", " plt.imshow(image, cmap = mpl.cm.binary, **options)\n", " plt.axis(\"off\")\n", " \n", "plt.figure(figsize=(8,8))\n", "plt.subplot(221); plot_digits(X_aa[:25], images_per_row=5)\n", "plt.subplot(222); plot_digits(X_ab[:25], images_per_row=5)\n", "plt.subplot(223); plot_digits(X_ba[:25], images_per_row=5)\n", "plt.subplot(224); plot_digits(X_bb[:25], images_per_row=5)\n", "plt.show()\n", "#SGDclassifier is quite sensitive to image shifting and rotation." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Multilabel Classification" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.neighbors import KNeighborsClassifier\n", "\n", "y_train_large = (y_train >= 7)\n", "y_train_odd = (y_train % 2 == 1)\n", "#https://docs.scipy.org/doc/numpy-1.15.0/reference/generated/numpy.c_.html\n", "y_multilabel = np.c_[y_train_large, y_train_odd] #concatenation\n", "\n", "knn_clf = KNeighborsClassifier()\n", "knn_clf.fit(X_train, y_multilabel)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "knn_clf.predict([some_digit])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "y_train_knn_pred = cross_val_predict(knn_clf, X_train, y_multilabel, cv=3, n_jobs=-1)\n", "f1_score(y_multilabel, y_train_knn_pred, average=\"macro\")\n", "#This running may take time\n", "# To give more weight to one of the label, set average = \"weighted\"\n", "#https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Multioutput Classification" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Multioutput = multilabel + multiclass [multiouput system is not limited to classification problems]\n", "#Build a system that removes noise from images (noisy image -> clean digit image)\n", "#multilabel: one label per pixel\n", "#multiclass: each label can have multiple values" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "noise = np.random.randint(0, 100, (len(X_train), 784))\n", "X_train_mod = X_train + noise\n", "noise = np.random.randint(0, 100, (len(X_test), 784))\n", "X_test_mod = X_test + noise\n", "y_train_mod = X_train\n", "y_test_mod = X_test" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "some_index = 5500\n", "def plot_digit(data):\n", " image = data.reshape(28, 28)\n", " plt.imshow(image, cmap = mpl.cm.binary,\n", " interpolation=\"nearest\")\n", " plt.axis(\"off\")\n", "plt.subplot(121); plot_digit(X_test_mod[some_index])\n", "plt.subplot(122); plot_digit(y_test_mod[some_index])\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "knn_clf.fit(X_train_mod, y_train_mod)\n", "clean_digit = knn_clf.predict([X_test_mod[some_index]])\n", "plot_digit(clean_digit)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Extra Material: Random Classifier: " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.dummy import DummyClassifier\n", "dmy_clf = DummyClassifier()\n", "y_probas_dmy = cross_val_predict(dmy_clf, X_train, y_train_5, cv=3, method=\"predict_proba\")\n", "y_scores_dmy = y_probas_dmy[:, 1]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fprr, tprr, thresholdsr = roc_curve(y_train_5, y_scores_dmy)\n", "plot_roc_curve(fprr, tprr)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Questions/Exercises" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1.Try to build a classifier for the MNIST dataset that achieves over 97% accuracy on the test set.\n", "Hint: the KNeighborsClassifier works quite well for this task; you just need to find good hyperparameter values (try a grid search on the weights and n_neighbors hyperparameters)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.datasets import fetch_openml\n", "import sklearn\n", "import numpy as np\n", "def sort_by_target(mnist):\n", " reorder_train = np.array(sorted([(target, i) for i, target in enumerate(mnist.target[:60000])]))[:, 1]\n", " reorder_test = np.array(sorted([(target, i) for i, target in enumerate(mnist.target[60000:])]))[:, 1]\n", " mnist.data[:60000] = mnist.data[reorder_train]\n", " mnist.target[:60000] = mnist.target[reorder_train]\n", " mnist.data[60000:] = mnist.data[reorder_test + 60000]\n", " mnist.target[60000:] = mnist.target[reorder_test + 60000]" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "scrolled": true }, "outputs": [], "source": [ "print('The scikit-learn version is {}.'.format(sklearn.__version__)) #If scikit-learn version is <0.20, please use conda update scikit-learn to update " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Reading Data\n", "mnist = fetch_openml('mnist_784',version = 1, cache = True)\n", "mnist.target = mnist.target.astype(np.int8)\n", "sort_by_target(mnist)\n", "shuffle_index = np.random.permutation(60000)\n", "X_train, y_train = mnist[\"data\"][:60000], mnist[\"target\"][:60000]\n", "X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Classifier: KNeighborsClassifier()\n", "# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Grid Search: GridSearchCV()\n", "#https://scikit-learn.org/stable/modules/grid_search.html" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2.Data Augmentation\n", "Write a function that can shift an MNIST image in any direction (left, right, up or down) by one pixel. Then for each image in the training set create four shifted copies (one per direction) and add them to the training set. Finally, train your best model on this expanded training set and measure its accuracy on the test set. \n", "\n", "Hint: you can use shift() function from the scipy.ndimage.interpolation module (https://docs.scipy.org/doc/scipy-0.16.1/reference/generated/scipy.ndimage.interpolation.shift.html): shift(image, [2,1], cval=0, model=\"constant\") shifts the image 2 pixels down and 1 pixel to the right. Moreover, the points outside the boundaries are filled with \"cval=0\".\n", "\n", "Obersevation: Your model should perform even better! This techinique of artifically growing the training set is called \"data augmentation\" or \"trainig set expansion\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Give a function which takes the image and shift it $dx$ along the x axis and $dy$ along the y axis\n", "def shift_image(image, dx, dy):\n", " image = image.reshape([28,28])\n", " shifted_image = #To fill \n", " return shifted_image.reshape([-1])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Test the function, taking one digit as example and plot it\n", "image = X_train[1000]\n", "shifted_left = shift_image(image,-5,0)\n", "plt.imshow(shifted_left.reshape(28,28), interpolation = \"nearest\", cmap=\"Greys\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#For each training image, create four shifted images (one pixel to each direction) and put them into training set.\n", "X_train_augmented = [image for image in X_train]\n", "y_train_augmented = [label for label in y_train]\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Shuffle the new training set\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Train the new training set on the previous best classfier (KNeighborsClassifier) and show the accuracy." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3. Tackle the Titanic dataset: \n", "The goal is to predict whether a passenger survived based on attributes like age, sex, passenger class etc.\n", "Download the data (https://project.inria.fr/chuan/course_material_classification/) which includes one train.csv and one test.csv. " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Load the data into list\n", "import pandas as pd\n", "train_data = pd.read_csv() #To fill the path of train.csv\n", "test_data = pd.read_csv() #To fill the path of test.csv" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#To see the details of the data\n", "train_data.head()\n", "test_data.head()\n", "#The meanings of the attributes: https://www.kaggle.com/c/titanic/data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#To see how much data is missing\n", "train_data.info()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Dispaly numerical attributes information \n", "train_data.describe()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Reuse the DataframeSelector from last course to select specific attributes from the DataFrame\n", "from sklearn.base import BaseEstimator, TransformerMixin\n", "class DataFrameSelector(BaseEstimator, TransformerMixin):\n", " def __init__(self, attribute_names):\n", " self.attribute_names = attribute_names\n", " def fit(self, X, y=None):\n", " return self\n", " def transform(self, X):\n", " return X[self.attribute_names]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#To fill: Build the pipeline for the numerical attributes \"Age\", \"SibSp\", \"Parch\", \"Fare\"\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.impute import SimpleImputer\n", "\n", "num_pipeline = Pipeline()\n", "num_pipeline.fit_transform(train_data)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Build the pipeline for the string categorical attributes \"Pclass\", \"Sex\", \"Embarked\"\n", "from sklearn.preprocessing import OneHotEncoder\n", "class MostFrequentImputer(BaseEstimator, TransformerMixin):\n", " def fit(self, X, y=None):\n", " self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],\n", " index=X.columns)\n", " return self\n", " def transform(self, X, y=None):\n", " return X.fillna(self.most_frequent_)\n", "cat_pipeline = Pipeline([\n", " (\"select_cat\", DataFrameSelector([\"Pclass\", \"Sex\", \"Embarked\"])),\n", " (\"imputer\", MostFrequentImputer()),\n", " (\"cat_encoder\", OneHotEncoder(sparse=False)),\n", " ])\n", "cat_pipeline.fit_transform(train_data)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Join the numerical and categorical pipelines\n", "from sklearn.pipeline import FeatureUnion\n", "preprocess_pipeline = FeatureUnion(transformer_list=[\n", " (\"num_pipeline\", num_pipeline),\n", " (\"cat_pipeline\", cat_pipeline),\n", " ])\n", "X_train = preprocess_pipeline.fit_transform(train_data)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#To fill: get the labels for the training data\n", "y_train = " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#To fill: Choose a binary classifier for training and have a prediction on the test data\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#To fill: Use cross-validation to have an idea of how good our model is" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Option: choose another classifier to see if it improves the result" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Note: To improve the result further, you could:\n", "\n", " 1)Compare many more models and tune hyperparameters using cross validation and grid search,\n", " 2)Do more feature engineering, for example:\n", " replace SibSp and Parch with their sum,\n", " try to identify parts of names that correlate well with the Survived attribute \n", " (e.g. if the name contains \"Countess\", then survival seems more likely),\n", " 3)try to convert numerical attributes to categorical attributes: for example, different age groups had very different survival rates (see below), so it may help to create an age bucket category and use it instead of the age. Similarly, it may be useful to have a special category for people traveling alone since only 30% of them survived.\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4. Build a SPAM Classifier" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Download the examples of spam and ham from Apache SpamAssasin's public datasets. \n", "import os\n", "import tarfile\n", "from six.moves import urllib\n", "\n", "DOWNLOAD_ROOT = \"http://spamassassin.apache.org/old/publiccorpus/\"\n", "HAM_URL = DOWNLOAD_ROOT + \"20030228_easy_ham.tar.bz2\"\n", "SPAM_URL = DOWNLOAD_ROOT + \"20030228_spam.tar.bz2\"\n", "SPAM_PATH = os.path.join(\"datasets\", \"spam\")\n", "\n", "def fetch_spam_data(spam_url=SPAM_URL, spam_path=SPAM_PATH):\n", " if not os.path.isdir(spam_path):\n", " os.makedirs(spam_path)\n", " for filename, url in ((\"ham.tar.bz2\", HAM_URL), (\"spam.tar.bz2\", SPAM_URL)):\n", " path = os.path.join(spam_path, filename)\n", " if not os.path.isfile(path):\n", " urllib.request.urlretrieve(url, path)\n", " tar_bz2_file = tarfile.open(path) #Unzip the datasets \n", " tar_bz2_file.extractall(path=SPAM_PATH)\n", " tar_bz2_file.close()\n", "fetch_spam_data()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "HAM_DIR = os.path.join(SPAM_PATH, \"easy_ham\")\n", "SPAM_DIR = os.path.join(SPAM_PATH, \"spam\")\n", "ham_filenames = [name for name in sorted(os.listdir(HAM_DIR)) if len(name) > 20]\n", "spam_filenames = [name for name in sorted(os.listdir(SPAM_DIR)) if len(name) > 20]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "len(ham_filenames)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#We can use Python's email module to parse these emails (this handles headers, encoding, and so on):\n", "import email\n", "import email.policy\n", "\n", "def load_email(is_spam, filename, spam_path=SPAM_PATH):\n", " directory = \"spam\" if is_spam else \"easy_ham\"\n", " with open(os.path.join(spam_path, directory, filename), \"rb\") as f:\n", " return email.parser.BytesParser(policy=email.policy.default).parse(f)\n", "ham_emails = [load_email(is_spam=False, filename=name) for name in ham_filenames]\n", "spam_emails = [load_email(is_spam=True, filename=name) for name in spam_filenames]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(ham_emails[1].get_content().strip())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#To fill: split the datasets into a training set and a test set\n", "#Hint: using train_test_split function from sklearn.model_selection \n", "\n", "X_train, X_test, y_train, y_test = " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Email prepocessing - convert HTML to plain text: The following function first drops the section, then converts all tags to the word HYPERLINK, then it gets rid of all HTML tags, leaving only the plain text. For readability, it also replaces multiple newlines with single newlines, and finally it unescapes html entities (such as > or  ):\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import re\n", "from html import unescape\n", "\n", "def html_to_plain_text(html):\n", " text = re.sub('.*?', '', html, flags=re.M | re.S | re.I)\n", " text = re.sub('', ' HYPERLINK ', text, flags=re.M | re.S | re.I)\n", " text = re.sub('<.*?>', '', text, flags=re.M | re.S)\n", " text = re.sub(r'(\\s*\\n)+', '\\n', text, flags=re.M | re.S)\n", " return unescape(text)\n", "\n", "def email_to_text(email):\n", " html = None\n", " for part in email.walk():\n", " ctype = part.get_content_type()\n", " if not ctype in (\"text/plain\", \"text/html\"):\n", " continue\n", " try:\n", " content = part.get_content()\n", " except: # in case of encoding issues\n", " content = str(part.get_payload())\n", " if ctype == \"text/plain\":\n", " return content\n", " else:\n", " html = content\n", " if html:\n", " return html_to_plain_text(html)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(spam_emails[7].get_content().strip())" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(html_to_plain_text(spam_emails[7].get_content())[:100], \"...\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(email_to_text(spam_emails[7])[:100], \"...\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**In the following: a data preparation pipeline is done to convert each mail into a feature vector indicating the presence or absence of each possible word. \n", "For example, if all emails only ever contain four words, \"Hello\", \"how\", \"are\", \"you\", then the email \"Hello you Hello Hello you\" would be converted into a vector [1,0,0,1] (existence) or [3,0,0,2] (counting)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.base import BaseEstimator, TransformerMixin\n", "import urlextract #pip3 install urlextract\n", "from collections import Counter\n", "import nltk #pip3 install nltk \n", "import numpy as np\n", "\n", "class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):\n", " def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True,\n", " replace_urls=True, replace_numbers=True, stemming=True):\n", " self.strip_headers = strip_headers\n", " self.lower_case = lower_case\n", " self.remove_punctuation = remove_punctuation\n", " self.replace_urls = replace_urls\n", " self.replace_numbers = replace_numbers\n", " self.stemming = stemming\n", " def fit(self, X, y=None):\n", " return self\n", " def transform(self, X, y=None):\n", " X_transformed = []\n", " for email in X:\n", " text = email_to_text(email) or \"\"\n", " if self.lower_case:\n", " text = text.lower()\n", " if self.replace_urls and urlextract.URLExtract() is not None:\n", " #replace URLs with the word \"URL\"\n", " urls = list(set(urlextract.URLExtract().find_urls(text)))\n", " urls.sort(key=lambda url: len(url), reverse=True)\n", " for url in urls:\n", " text = text.replace(url, \" URL \")\n", " if self.replace_numbers:\n", " text = re.sub(r'\\d+(?:\\.\\d*(?:[eE]\\d+))?', 'NUMBER', text)\n", " if self.remove_punctuation:\n", " text = re.sub(r'\\W+', ' ', text, flags=re.M)\n", " word_counts = Counter(text.split()) #split sentences into words using Python's split() method, which uses whitespaces for word boundaries\n", " if self.stemming and nltk.PorterStemmer() is not None:\n", " # stemming\n", " stemmed_word_counts = Counter()\n", " for word, count in word_counts.items():\n", " stemmed_word = nltk.PorterStemmer().stem(word)\n", " stemmed_word_counts[stemmed_word] += count\n", " word_counts = stemmed_word_counts\n", " X_transformed.append(word_counts)\n", " return np.array(X_transformed)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_few = spam_emails[:2]\n", "X_few_wordcounts = EmailToWordCounterTransformer().fit_transform(X_few)\n", "X_few_wordcounts" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Now we have the word counts, and we need to convert them to vectors. For this, we will build another transformer whose fit() method will build the vocabulary (an ordered list of the most common words) and whose transform() method will use the vocabulary to convert word counts to vectors. The output is a sparse matrix.\n", "from scipy.sparse import csr_matrix\n", "\n", "class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):\n", " def __init__(self, vocabulary_size=1000):\n", " self.vocabulary_size = vocabulary_size\n", " def fit(self, X, y=None):\n", " total_count = Counter()\n", " for word_count in X:\n", " for word, count in word_count.items():\n", " total_count[word] += min(count, 10)\n", " most_common = total_count.most_common()[:self.vocabulary_size]\n", " self.most_common_ = most_common\n", " self.vocabulary_ = {word: index + 1 for index, (word, count) in enumerate(most_common)}\n", " return self\n", " def transform(self, X, y=None):\n", " rows = []\n", " cols = []\n", " data = []\n", " for row, word_count in enumerate(X):\n", " for word, count in word_count.items():\n", " rows.append(row)\n", " cols.append(self.vocabulary_.get(word, 0))\n", " data.append(count)\n", " return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size + 1))\n", "vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)\n", "X_few_vectors = vocab_transformer.fit_transform(X_few_wordcounts)\n", "X_few_vectors.toarray()\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "vocab_transformer.vocabulary_" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To fill: Try out several classifiers and see if you can build a great spam classifier, with both high recall and high precision" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.pipeline import Pipeline\n", "\n", "preprocess_pipeline = Pipeline([\n", " (\"email_to_wordcount\", EmailToWordCounterTransformer()),\n", " (\"wordcount_to_vector\", WordCounterToVectorTransformer()),\n", "])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.1" } }, "nbformat": 4, "nbformat_minor": 2 }