**Chapter 3 – Classification**

In [None]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "classification"

def save_fig(fig_id, tight_layout=True):
    """Save the current matplotlib figure as a 300-dpi PNG.

    The file is written to
    ``<PROJECT_ROOT_DIR>/images/<CHAPTER_ID>/<fig_id>.png``.

    Parameters
    ----------
    fig_id : str
        Base name of the output file (without the ``.png`` extension).
    tight_layout : bool, default True
        Whether to call ``plt.tight_layout()`` before saving.
    """
    images_dir = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
    # plt.savefig does not create missing directories, so make sure the
    # target folder exists before writing.
    if not os.path.isdir(images_dir):
        os.makedirs(images_dir)
    path = os.path.join(images_dir, fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

# MNIST

## Download and Read the data

In [None]:
#To download the data set
from sklearn.datasets import fetch_openml
import sklearn
import numpy as np
def sort_by_target(mnist):
    """Reorder an MNIST bunch in place so each split is sorted by label.

    The first 60000 rows (training split) and the remaining rows (test split)
    are sorted independently by ``mnist.target``; rows with equal labels keep
    their original relative order.

    Parameters
    ----------
    mnist : object with NumPy array attributes ``data`` and ``target``
        Both arrays are modified in place.
    """
    n_train = 60000
    # A stable (merge) argsort orders by label and breaks ties by original
    # position, which is the ordering the (target, index) tuple sort produced,
    # without building tens of thousands of Python tuples.
    reorder_train = np.argsort(mnist.target[:n_train], kind="mergesort")
    reorder_test = np.argsort(mnist.target[n_train:], kind="mergesort") + n_train
    # Fancy indexing on the right-hand side copies, so the in-place slice
    # assignment never reads rows it has already overwritten.
    mnist.data[:n_train] = mnist.data[reorder_train]
    mnist.target[:n_train] = mnist.target[reorder_train]
    mnist.data[n_train:] = mnist.data[reorder_test]
    mnist.target[n_train:] = mnist.target[reorder_test]
print('The scikit-learn version is {}.'.format(sklearn.__version__))
#If scikit-learn version is <0.20, please use conda update scikit-learn to update 

In [None]:
#Reading Data
mnist = fetch_openml('mnist_784', version=1, cache=True)
# Recent scikit-learn releases return a pandas DataFrame/Series here, while the
# rest of the notebook indexes rows positionally (e.g. X[36000]); np.array also
# guarantees writable copies for the in-place reordering in sort_by_target.
mnist.data = np.array(mnist.data)
mnist.target = np.array(mnist.target).astype(np.int8)
sort_by_target(mnist)

In [None]:
mnist.data.shape

In [None]:
X, y = mnist["data"], mnist["target"]
X.shape

In [None]:
some_digit = X[36000]
some_digit_image = some_digit.reshape(28, 28)
plt.imshow(some_digit_image, cmap = mpl.cm.binary,
 interpolation="nearest")

In [None]:
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

In [None]:
shuffle_index = np.random.permutation(60000)
X_train, y_train = mnist["data"][:60000], mnist["target"][:60000]
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

## Train a binary classifier:

In [None]:
y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)

In [None]:
from sklearn.linear_model import SGDClassifier
# tol=None disables the stopping criterion, so exactly max_iter epochs are run.
# (np.infty no longer exists in NumPy 2 and a negative tol is rejected by recent
# scikit-learn versions.)
sgd_clf = SGDClassifier(max_iter=5, tol=None, random_state=42)
#https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html
sgd_clf.fit(X_train, y_train_5)

In [None]:
sgd_clf.predict([some_digit])

## Performance Measures:

Measuring Accuracy Using Cross-Validation:

In [None]:
from sklearn.model_selection import cross_val_score
cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring="accuracy")

Implementing cross-validation:

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

# The folds are not shuffled, so a seed would have no effect; recent
# scikit-learn versions raise an error if random_state is set without shuffle=True.
skfolds = StratifiedKFold(n_splits=3)

for train_index, test_index in skfolds.split(X_train, y_train_5):
    # Fresh, unfitted copy of the classifier for every fold
    clone_clf = clone(sgd_clf)
    X_train_folds = X_train[train_index]
    y_train_folds = (y_train_5[train_index])
    X_test_fold = X_train[test_index]
    y_test_fold = (y_train_5[test_index])

    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)
    n_correct = sum(y_pred == y_test_fold)
    # Fold accuracy: fraction of correct predictions
    print(n_correct / len(y_pred))

Check another naive classifier:

In [None]:
from sklearn.base import BaseEstimator
class Never5Classifier(BaseEstimator):
    """Baseline classifier that predicts ``False`` ("not 5") for every instance."""

    def fit(self, X, y=None):
        # Nothing to learn; return self to follow the scikit-learn estimator
        # convention (fit returns the estimator).
        return self

    def predict(self, X):
        # One all-False prediction per input row, shaped (n_samples, 1)
        return np.zeros((len(X), 1), dtype=bool)
never_5_clf = Never5Classifier()
cross_val_score(never_5_clf, X_train, y_train_5, cv=3, scoring="accuracy")

Confusion Matrix: Count the number of times instances of class A are classified as class B

In [None]:
from sklearn.model_selection import cross_val_predict

y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)
#cross_val_predict perform K fold cross-validation and returns the predictions made on each test fold
len(y_train_pred)

In [None]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_train_5, y_train_pred)

In [None]:
y_train_perfect_predictions = y_train_5
confusion_matrix(y_train_5, y_train_perfect_predictions)

Precision: TP/(TP+FP), Recall : TP/(TP+FN)

In [None]:
from sklearn.metrics import precision_score, recall_score

precision_score(y_train_5, y_train_pred)


In [None]:
recall_score(y_train_5, y_train_pred)

F_1 score: harmonic mean of precision and recall: 2/(1/pre+1/rec)

In [None]:
from sklearn.metrics import f1_score
f1_score(y_train_5, y_train_pred)

Precision/Recall Tradeoff:![Screenshot%20from%202019-01-27%2017-27-36.png](attachment:Screenshot%20from%202019-01-27%2017-27-36.png)

In [None]:
y_scores = sgd_clf.decision_function([some_digit])
y_scores

In [None]:
threshold = 200000
y_some_digit_pred = (y_scores > threshold)
y_some_digit_pred

Decide which Threshold to use: 

In [None]:
y_scores = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3,
 method="decision_function")
#by setting method = "decision_function", cross_val_predict returns the decision scores

In [None]:
#For all possible threshods:
from sklearn.metrics import precision_recall_curve
#https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_curve.html
precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)

In [None]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall as functions of the decision threshold.

    Draws on the current matplotlib axes. The last element of `precisions`
    and `recalls` is dropped so both curves line up with `thresholds`.
    """
    curves = (
        (precisions, "b--", "Precision"),
        (recalls, "g-", "Recall"),
    )
    for values, line_style, curve_label in curves:
        plt.plot(thresholds, values[:-1], line_style, label=curve_label, linewidth=2)
    plt.xlabel("Threshold", fontsize=16)
    plt.legend(loc="upper left", fontsize=16)
    plt.ylim([0, 1])

plt.figure(figsize=(8, 4))
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.xlim([-700000, 700000])
plt.show()

Aiming for about 90% precision by raising the threshold (read it off the plot above):

In [None]:
y_train_pred_90 = (y_scores > 70000)
precision_score(y_train_5, y_train_pred_90)

In [None]:
plt.plot(recalls, precisions)

Another measurement - receiver operating characteristic (ROC): it plots recall (the true positive rate) against 1 − specificity. Specificity is the true negative rate, so 1 − specificity is the false positive rate.

In [None]:
from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)

In [None]:
def plot_roc_curve(fpr, tpr, label=None):
    """Draw a ROC curve on the current axes, plus the dashed diagonal.

    Parameters
    ----------
    fpr, tpr : array-like
        False and true positive rates, as returned by ``roc_curve``.
    label : str, optional
        Legend label for the curve.
    """
    axis_label_style = {"fontsize": 16}
    plt.plot(fpr, tpr, linewidth=2, label=label)
    # Dashed diagonal from (0, 0) to (1, 1) for reference
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate', **axis_label_style)
    plt.ylabel('True Positive Rate', **axis_label_style)

plt.figure(figsize=(8, 6))
plot_roc_curve(fpr, tpr)
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score
#area under the curve 
roc_auc_score(y_train_5, y_scores)

Another classifier: RandomForestClassifier()

In [None]:
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(n_estimators=10, random_state=42)
y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv=3,
 method="predict_proba")
#predict_proba method returns an array containing a row per instance and a column per class, 
#each containing the probability that the given instance belongs to the given class

In [None]:
y_probas_forest

In [None]:
y_scores_forest = y_probas_forest[:, 1] # score = proba of positive class
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5,y_scores_forest)

In [None]:
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, "b:", linewidth=2, label="SGD")
plot_roc_curve(fpr_forest, tpr_forest, "Random Forest")
plt.legend(loc="lower right", fontsize=16)
plt.show()

In [None]:
roc_auc_score(y_train_5, y_scores_forest)

## Multiclass Classification: 

One-versus-all (OvA) strategy: train 10 binary classifiers, one per digit.

In [None]:
sgd_clf.fit(X_train, y_train)
sgd_clf.predict([some_digit])

In [None]:
some_digit_scores = sgd_clf.decision_function([some_digit]) #10 scores, one per class
some_digit_scores

In [None]:
np.argmax(some_digit_scores)

One-versus-one (OvO) strategy: train one binary classifier for each pair of digits.

In [None]:
from sklearn.multiclass import OneVsOneClassifier
# tol=None disables the stopping criterion so exactly max_iter epochs are run
# (np.infty no longer exists in NumPy 2; negative tol is rejected by recent scikit-learn).
ovo_clf = OneVsOneClassifier(SGDClassifier(max_iter=5, tol=None, random_state=42))
ovo_clf.fit(X_train, y_train)
ovo_clf.predict([some_digit])

In [None]:
len(ovo_clf.estimators_)

In [None]:
some_digit_scores = ovo_clf.decision_function([some_digit])
some_digit_scores

In [None]:
#RandomForestClassifier:
forest_clf.fit(X_train, y_train)
forest_clf.predict([some_digit])
forest_clf.predict_proba([some_digit])

cross-validation

In [None]:
cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy")

In [None]:
#Better
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64)) #scaling the inputs
cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring="accuracy")

## Error Analysis

In [None]:
y_train_pred = cross_val_predict(sgd_clf, X_train_scaled, y_train, cv=3)
conf_mx = confusion_matrix(y_train, y_train_pred)
conf_mx

In [None]:
plt.matshow(conf_mx, cmap=plt.cm.gray)
plt.show()

In [None]:
row_sums = conf_mx.sum(axis=1, keepdims=True)
norm_conf_mx = conf_mx / row_sums
np.fill_diagonal(norm_conf_mx, 0)
plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
plt.show()

Analyse individual errors:

In [None]:
cl_a, cl_b = 3, 5
# GN: rows with True value are selected
X_aa = X_train[(y_train == cl_a) & (y_train_pred == cl_a)] #GN: '3' classified as 3
X_ab = X_train[(y_train == cl_a) & (y_train_pred == cl_b)] #GN: '3' classified as 5
X_ba = X_train[(y_train == cl_b) & (y_train_pred == cl_a)] #GN: '5' classified as 3
X_bb = X_train[(y_train == cl_b) & (y_train_pred == cl_b)] #GN: '5' classified as 5

def plot_digits(instances, images_per_row=10, **options):
    """Show flattened 28x28 images as one tiled grid on the current axes.

    Parameters
    ----------
    instances : sequence of arrays
        Each item is reshaped to 28x28.
    images_per_row : int, default 10
        Maximum number of images per grid row.
    **options
        Extra keyword arguments forwarded to ``plt.imshow``.
    """
    side = 28
    n_images = len(instances)
    per_row = min(n_images, images_per_row)
    n_rows = (n_images - 1) // per_row + 1
    tiles = [digit.reshape(side, side) for digit in instances]
    # One blank block wide enough to fill the unused slots of the last row
    n_blank = n_rows * per_row - n_images
    tiles.append(np.zeros((side, side * n_blank)))
    grid = np.concatenate(
        [np.concatenate(tiles[start:start + per_row], axis=1)
         for start in range(0, n_rows * per_row, per_row)],
        axis=0,
    )
    plt.imshow(grid, cmap=mpl.cm.binary, **options)
    plt.axis("off")
 
plt.figure(figsize=(8,8))
plt.subplot(221); plot_digits(X_aa[:25], images_per_row=5)
plt.subplot(222); plot_digits(X_ab[:25], images_per_row=5)
plt.subplot(223); plot_digits(X_ba[:25], images_per_row=5)
plt.subplot(224); plot_digits(X_bb[:25], images_per_row=5)
plt.show()
#SGDclassifier is quite sensitive to image shifting and rotation.

## Multilabel Classification

In [None]:
from sklearn.neighbors import KNeighborsClassifier

y_train_large = (y_train >= 7)
y_train_odd = (y_train % 2 == 1)
#https://docs.scipy.org/doc/numpy-1.15.0/reference/generated/numpy.c_.html
y_multilabel = np.c_[y_train_large, y_train_odd] #concatenation

knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_multilabel)

In [None]:
knn_clf.predict([some_digit])

In [None]:
y_train_knn_pred = cross_val_predict(knn_clf, X_train, y_multilabel, cv=3, n_jobs=-1)
f1_score(y_multilabel, y_train_knn_pred, average="macro")
#This running may take time
# To give more weight to one of the label, set average = "weighted"
#https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html

## Multioutput Classification

In [None]:
#Multioutput = multilabel + multiclass [multiouput system is not limited to classification problems]
#Build a system that removes noise from images (noisy image -> clean digit image)
#multilabel: one label per pixel
#multiclass: each label can have multiple values

In [None]:
noise = np.random.randint(0, 100, (len(X_train), 784))
X_train_mod = X_train + noise
noise = np.random.randint(0, 100, (len(X_test), 784))
X_test_mod = X_test + noise
y_train_mod = X_train
y_test_mod = X_test

In [None]:
some_index = 5500
def plot_digit(data):
    """Render one flattened 28x28 image on the current axes with the axis hidden."""
    plt.imshow(data.reshape(28, 28), cmap=mpl.cm.binary, interpolation="nearest")
    plt.axis("off")
plt.subplot(121); plot_digit(X_test_mod[some_index])
plt.subplot(122); plot_digit(y_test_mod[some_index])
plt.show()

In [None]:
knn_clf.fit(X_train_mod, y_train_mod)
clean_digit = knn_clf.predict([X_test_mod[some_index]])
plot_digit(clean_digit)

## Extra Material: Random Classifier: 

In [None]:
from sklearn.dummy import DummyClassifier
# Request random, class-frequency-based predictions explicitly: the default
# strategy is no longer "stratified" in recent scikit-learn releases.
dmy_clf = DummyClassifier(strategy="stratified")
y_probas_dmy = cross_val_predict(dmy_clf, X_train, y_train_5, cv=3, method="predict_proba")
y_scores_dmy = y_probas_dmy[:, 1]  # score = proba of the positive class

In [None]:
fprr, tprr, thresholdsr = roc_curve(y_train_5, y_scores_dmy)
plot_roc_curve(fprr, tprr)

# Questions/Exercises

## 1.Try to build a classifier for the MNIST dataset that achieves over 97% accuracy on the test set.
Hint: the KNeighborsClassifier works quite well for this task; you just need to find good hyperparameter values (try a grid search on the weights and n_neighbors hyperparameters)

In [None]:
from sklearn.datasets import fetch_openml
import sklearn
import numpy as np
def sort_by_target(mnist):
    """Reorder an MNIST bunch in place so each split is sorted by label.

    The first 60000 rows (training split) and the remaining rows (test split)
    are sorted independently by ``mnist.target``; rows with equal labels keep
    their original relative order.

    Parameters
    ----------
    mnist : object with NumPy array attributes ``data`` and ``target``
        Both arrays are modified in place.
    """
    n_train = 60000
    # A stable (merge) argsort orders by label and breaks ties by original
    # position, which is the ordering the (target, index) tuple sort produced,
    # without building tens of thousands of Python tuples.
    reorder_train = np.argsort(mnist.target[:n_train], kind="mergesort")
    reorder_test = np.argsort(mnist.target[n_train:], kind="mergesort") + n_train
    # Fancy indexing on the right-hand side copies, so the in-place slice
    # assignment never reads rows it has already overwritten.
    mnist.data[:n_train] = mnist.data[reorder_train]
    mnist.target[:n_train] = mnist.target[reorder_train]
    mnist.data[n_train:] = mnist.data[reorder_test]
    mnist.target[n_train:] = mnist.target[reorder_test]

In [None]:
print('The scikit-learn version is {}.'.format(sklearn.__version__)) #If scikit-learn version is <0.20, please use conda update scikit-learn to update 

In [None]:
#Reading Data
mnist = fetch_openml('mnist_784', version=1, cache=True)
# Recent scikit-learn releases return a pandas DataFrame/Series here, while the
# code below indexes rows positionally; np.array also guarantees writable
# copies for the in-place reordering in sort_by_target.
mnist.data = np.array(mnist.data)
mnist.target = np.array(mnist.target).astype(np.int8)
sort_by_target(mnist)
# Shuffle the training split so cross-validation folds do not see label-sorted data
shuffle_index = np.random.permutation(60000)
X_train, y_train = mnist["data"][:60000], mnist["target"][:60000]
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

In [None]:
# Classifier: KNeighborsClassifier()
# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html


In [None]:
# Grid Search: GridSearchCV()
#https://scikit-learn.org/stable/modules/grid_search.html

## 2.Data Augmentation
Write a function that can shift an MNIST image in any direction (left, right, up or down) by one pixel. Then for each image in the training set create four shifted copies (one per direction) and add them to the training set. Finally, train your best model on this expanded training set and measure its accuracy on the test set. 

Hint: you can use the shift() function from the scipy.ndimage.interpolation module (https://docs.scipy.org/doc/scipy-0.16.1/reference/generated/scipy.ndimage.interpolation.shift.html; in recent SciPy versions import it directly from scipy.ndimage): shift(image, [2,1], cval=0, mode="constant") shifts the image 2 pixels down and 1 pixel to the right. Moreover, the points outside the boundaries are filled with "cval=0".

Observation: Your model should perform even better! This technique of artificially growing the training set is called "data augmentation" or "training set expansion".

In [None]:
#Give a function which takes the image and shift it $dx$ along the x axis and $dy$ along the y axis
def shift_image(image, dx, dy):
    """Exercise stub: return `image` shifted by `dx` along x and `dy` along y.

    `image` is reshaped to 28x28 before shifting and the result is flattened
    again before it is returned. The shifting step itself is intentionally
    left for the reader to fill in.
    """
    image = image.reshape([28,28])
    shifted_image = #To fill 
    return shifted_image.reshape([-1])

In [None]:
#Test the function, taking one digit as example and plot it
image = X_train[1000]
shifted_left = shift_image(image,-5,0)
plt.imshow(shifted_left.reshape(28,28), interpolation = "nearest", cmap="Greys")

In [None]:
#For each training image, create four shifted images (one pixel to each direction) and put them into training set.
X_train_augmented = [image for image in X_train]
y_train_augmented = [label for label in y_train]



In [None]:
#Shuffle the new training set


In [None]:
#Train the new training set on the previous best classfier (KNeighborsClassifier) and show the accuracy.

## 3. Tackle the Titanic dataset: 
The goal is to predict whether a passenger survived based on attributes like age, sex, passenger class etc.
Download the data (https://project.inria.fr/chuan/course_material_classification/) which includes one train.csv and one test.csv. 

In [None]:
#Load the data into list
import pandas as pd
train_data = pd.read_csv() #To fill the path of train.csv
test_data = pd.read_csv() #To fill the path of test.csv

In [None]:
#To see the details of the data
train_data.head()
test_data.head()
#The meanings of the attributes: https://www.kaggle.com/c/titanic/data

In [None]:
#To see how much data is missing
train_data.info()

In [None]:
#Dispaly numerical attributes information 
train_data.describe()

In [None]:
#Reuse the DataframeSelector from last course to select specific attributes from the DataFrame
from sklearn.base import BaseEstimator, TransformerMixin
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that selects the given attributes from its input via ``X[attribute_names]``."""
    def __init__(self, attribute_names):
        # Stored unchanged under the same name as the constructor argument
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        # Stateless: nothing is learned from the data
        return self
    def transform(self, X):
        # Indexing with the stored names returns only those attributes
        return X[self.attribute_names]

In [None]:
#To fill: Build the pipeline for the numerical attributes "Age", "SibSp", "Parch", "Fare"
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline()
num_pipeline.fit_transform(train_data)


In [None]:
#Build the pipeline for the string categorical attributes "Pclass", "Sex", "Embarked"
from sklearn.preprocessing import OneHotEncoder
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in each column with that column's most frequent value."""

    def fit(self, X, y=None):
        """Record, for every column of `X`, the first entry of its ``value_counts()``."""
        modes = [X[column].value_counts().index[0] for column in X.columns]
        self.most_frequent_ = pd.Series(modes, index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return `X` with missing entries replaced by the values learned in ``fit``."""
        return X.fillna(self.most_frequent_)
cat_pipeline = Pipeline([
 ("select_cat", DataFrameSelector(["Pclass", "Sex", "Embarked"])),
 ("imputer", MostFrequentImputer()),
 ("cat_encoder", OneHotEncoder(sparse=False)),
 ])
cat_pipeline.fit_transform(train_data)

In [None]:
#Join the numerical and categorical pipelines
from sklearn.pipeline import FeatureUnion
preprocess_pipeline = FeatureUnion(transformer_list=[
 ("num_pipeline", num_pipeline),
 ("cat_pipeline", cat_pipeline),
 ])
X_train = preprocess_pipeline.fit_transform(train_data)

In [None]:
#To fill: get the labels for the training data
y_train = 

In [None]:
#To fill: Choose a binary classifier for training and have a prediction on the test data


In [None]:
#To fill: Use cross-validation to have an idea of how good our model is

In [None]:
#Option: choose another classifier to see if it improves the result

Note: To improve the result further, you could:

 1)Compare many more models and tune hyperparameters using cross validation and grid search,
 2)Do more feature engineering, for example:
 replace SibSp and Parch with their sum,
 try to identify parts of names that correlate well with the Survived attribute 
 (e.g. if the name contains "Countess", then survival seems more likely),
 3)try to convert numerical attributes to categorical attributes: for example, different age groups had very different survival rates (see below), so it may help to create an age bucket category and use it instead of the age. Similarly, it may be useful to have a special category for people traveling alone since only 30% of them survived.



## 4. Build a SPAM Classifier

In [None]:
#Download the examples of spam and ham from Apache SpamAssasin's public datasets. 
import os
import tarfile
from six.moves import urllib

DOWNLOAD_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = DOWNLOAD_ROOT + "20030228_easy_ham.tar.bz2"
SPAM_URL = DOWNLOAD_ROOT + "20030228_spam.tar.bz2"
SPAM_PATH = os.path.join("datasets", "spam")

def fetch_spam_data(spam_url=SPAM_URL, spam_path=SPAM_PATH, ham_url=HAM_URL):
    """Download the ham and spam archives if needed and extract them.

    Parameters
    ----------
    spam_url : str
        URL of the spam ``.tar.bz2`` archive.
    spam_path : str
        Directory where the archives are stored and extracted; created if missing.
    ham_url : str
        URL of the ham ``.tar.bz2`` archive.
    """
    if not os.path.isdir(spam_path):
        os.makedirs(spam_path)
    for filename, url in (("ham.tar.bz2", ham_url), ("spam.tar.bz2", spam_url)):
        path = os.path.join(spam_path, filename)
        # Skip the download when the archive is already on disk
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url, path)
        # The context manager closes the archive even if extraction fails.
        # NOTE(review): extractall trusts the member paths stored in the
        # archive; only use this with the known SpamAssassin corpus.
        with tarfile.open(path) as tar_bz2_file:
            tar_bz2_file.extractall(path=spam_path)  # extract next to the archive
fetch_spam_data()

In [None]:
HAM_DIR = os.path.join(SPAM_PATH, "easy_ham")
SPAM_DIR = os.path.join(SPAM_PATH, "spam")
ham_filenames = [name for name in sorted(os.listdir(HAM_DIR)) if len(name) > 20]
spam_filenames = [name for name in sorted(os.listdir(SPAM_DIR)) if len(name) > 20]

In [None]:
len(ham_filenames)

In [None]:
#We can use Python's email module to parse these emails (this handles headers, encoding, and so on):
import email
import email.policy

def load_email(is_spam, filename, spam_path=SPAM_PATH):
    """Parse one email file from the corpus and return the parsed message.

    Parameters
    ----------
    is_spam : bool
        Read from the ``spam`` sub-directory when true, else ``easy_ham``.
    filename : str
        Name of the file inside that sub-directory.
    spam_path : str
        Root directory that contains both sub-directories.
    """
    # Import the submodules explicitly: `import email` alone does not load
    # email.parser, so relying on it being loaded by another module is fragile.
    import email.parser
    import email.policy

    directory = "spam" if is_spam else "easy_ham"
    with open(os.path.join(spam_path, directory, filename), "rb") as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)
ham_emails = [load_email(is_spam=False, filename=name) for name in ham_filenames]
spam_emails = [load_email(is_spam=True, filename=name) for name in spam_filenames]

In [None]:
print(ham_emails[1].get_content().strip())

In [None]:
#To fill: split the datasets into a training set and a test set
#Hint: using train_test_split function from sklearn.model_selection 

X_train, X_test, y_train, y_test = 

**Email preprocessing – convert HTML to plain text:** The following function first drops the `<head>` section, then converts all `<a>` tags to the word HYPERLINK, then it gets rid of all HTML tags, leaving only the plain text. For readability, it also replaces multiple newlines with single newlines, and finally it unescapes HTML entities (such as `&gt;` or `&nbsp;`):


In [None]:
import re
from html import unescape

def html_to_plain_text(html):
    """Convert an HTML document to readable plain text.

    Steps, in order: drop the ``<head>`` section, replace every opening
    ``<a ...>`` tag with the word HYPERLINK, strip all remaining tags,
    collapse runs of blank lines into a single newline, and unescape HTML
    entities.

    Parameters
    ----------
    html : str
        HTML source.

    Returns
    -------
    str
        The extracted plain text.
    """
    # Remove the whole <head>...</head> section (case-insensitive, may span lines)
    text = re.sub(r'<head.*?>.*?</head>', '', html, flags=re.M | re.S | re.I)
    # Replace opening anchor tags with a marker word so links remain visible as a feature
    text = re.sub(r'<a\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I)
    # Strip every remaining tag
    text = re.sub(r'<.*?>', '', text, flags=re.M | re.S)
    # Collapse consecutive (possibly whitespace-only) lines into one newline
    text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)
    return unescape(text)

def email_to_text(email):
    """Return the text content of a parsed email, or None if it has none.

    Walks all parts of the message. The first ``text/plain`` part is returned
    as-is. If there is no plain-text part but at least one ``text/html`` part,
    the last HTML part seen is converted with ``html_to_plain_text``.

    Parameters
    ----------
    email : email.message.EmailMessage
        Parsed message (anything providing ``walk()`` whose parts provide
        ``get_content_type()``, ``get_content()`` and ``get_payload()``).

    Returns
    -------
    str or None
    """
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if ctype not in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except Exception:  # in case of encoding issues
            # Fall back to the raw payload; Exception (rather than a bare
            # except) lets KeyboardInterrupt/SystemExit propagate.
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content
        html = content
    if html:
        return html_to_plain_text(html)
    return None


In [None]:
print(spam_emails[7].get_content().strip())

In [None]:
print(html_to_plain_text(spam_emails[7].get_content())[:100], "...")

In [None]:
print(email_to_text(spam_emails[7])[:100], "...")

**In the following:** a data preparation pipeline is built to convert each email into a feature vector indicating the presence or absence of each possible word.
For example, if all emails only ever contain four words, "Hello", "how", "are", "you", then the email "Hello you Hello Hello you" would be converted into the vector [1,0,0,1] (existence) or [3,0,0,2] (counting).

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
import urlextract #pip3 install urlextract
from collections import Counter
import nltk #pip3 install nltk 
import numpy as np

class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    """Turn parsed emails into per-email word counts (``collections.Counter``).

    Parameters
    ----------
    strip_headers : bool
        Stored on the instance; not used by ``transform``.
    lower_case : bool
        Lower-case the text before any other processing.
    remove_punctuation : bool
        Replace every run of non-word characters with a single space.
    replace_urls : bool
        Replace each URL found by ``urlextract`` with the word ``URL``.
    replace_numbers : bool
        Replace each number with the word ``NUMBER``.
    stemming : bool
        Merge the counts of words that share the same Porter stem.
    """

    def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True,
                 replace_urls=True, replace_numbers=True, stemming=True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming

    def fit(self, X, y=None):
        """Nothing to learn; return self."""
        return self

    def transform(self, X, y=None):
        """Return an array holding one ``Counter`` of words per email in `X`."""
        # Build the helpers once per call instead of once per email / per word.
        # They are kept local (not set in __init__) so the estimator's
        # attributes stay limited to its constructor parameters.
        url_extractor = urlextract.URLExtract() if self.replace_urls else None
        stemmer = nltk.PorterStemmer() if self.stemming else None

        X_transformed = []
        for email in X:
            text = email_to_text(email) or ""
            if self.lower_case:
                text = text.lower()
            if url_extractor is not None:
                #replace URLs with the word "URL"
                urls = list(set(url_extractor.find_urls(text)))
                # Longest first, so a URL that is a prefix of another one does
                # not break the longer replacement
                urls.sort(key=len, reverse=True)
                for url in urls:
                    text = text.replace(url, " URL ")
            if self.replace_numbers:
                # Integer part, optional fraction, optional exponent; the
                # fraction no longer requires an exponent, so "3.14" becomes a
                # single NUMBER token.
                text = re.sub(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', 'NUMBER', text)
            if self.remove_punctuation:
                text = re.sub(r'\W+', ' ', text, flags=re.M)
            # str.split() splits on whitespace, which is used as the word boundary
            word_counts = Counter(text.split())
            if stemmer is not None:
                # stemming: merge the counts of words sharing a stem
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    stemmed_word_counts[stemmer.stem(word)] += count
                word_counts = stemmed_word_counts
            X_transformed.append(word_counts)
        return np.array(X_transformed)

In [None]:
X_few = spam_emails[:2]
X_few_wordcounts = EmailToWordCounterTransformer().fit_transform(X_few)
X_few_wordcounts

In [None]:
#Now we have the word counts, and we need to convert them to vectors. For this, we will build another transformer whose fit() method will build the vocabulary (an ordered list of the most common words) and whose transform() method will use the vocabulary to convert word counts to vectors. The output is a sparse matrix.
from scipy.sparse import csr_matrix

class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    """Convert per-email word counts into a sparse count matrix.

    ``fit`` builds a vocabulary from the `vocabulary_size` most common words
    and ``transform`` maps each word count to a row of the matrix. Column 0
    collects every word that is not in the vocabulary; vocabulary words use
    columns 1..vocabulary_size.
    """

    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size

    def fit(self, X, y=None):
        """Learn the vocabulary from an iterable of word-count mappings."""
        totals = Counter()
        for counts in X:
            for token, occurrences in counts.items():
                # A single email contributes at most 10 per word
                totals[token] += min(occurrences, 10)
        top_words = totals.most_common()[:self.vocabulary_size]
        self.most_common_ = top_words
        # Ranks start at 1 because column 0 is kept for unknown words
        self.vocabulary_ = {
            token: rank for rank, (token, _) in enumerate(top_words, start=1)
        }
        return self

    def transform(self, X, y=None):
        """Return a CSR matrix of shape (len(X), vocabulary_size + 1)."""
        entries = [
            (row, self.vocabulary_.get(token, 0), occurrences)
            for row, counts in enumerate(X)
            for token, occurrences in counts.items()
        ]
        rows = [entry[0] for entry in entries]
        cols = [entry[1] for entry in entries]
        data = [entry[2] for entry in entries]
        # csr_matrix sums duplicate (row, col) pairs, so all unknown words of
        # one email add up in column 0
        return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size + 1))
vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)
X_few_vectors = vocab_transformer.fit_transform(X_few_wordcounts)
X_few_vectors.toarray()


In [None]:
vocab_transformer.vocabulary_

To fill: Try out several classifiers and see if you can build a great spam classifier, with both high recall and high precision

In [None]:
from sklearn.pipeline import Pipeline

preprocess_pipeline = Pipeline([
 ("email_to_wordcount", EmailToWordCounterTransformer()),
 ("wordcount_to_vector", WordCounterToVectorTransformer()),
])