In [1]:
import pandas as pd

from helper_functions import prepare_data, replace_strings

from pprint import pprint
from IPython.display import Image

Data Preparation

In [2]:
# load data
df_train = pd.read_csv("../../data/train.csv", index_col="PassengerId")
df_test = pd.read_csv("../../data/test.csv", index_col="PassengerId")
test_labels = pd.read_csv("../../data/test_labels.csv", index_col="PassengerId").squeeze("columns")

# prepare data
df_train = prepare_data(df_train)
df_test = prepare_data(df_test, train_set=False)

# handle missing values in training data
embarked_mode = df_train.Embarked.mode()[0]
df_train["Embarked"].fillna(embarked_mode, inplace=True)

df_train.head()
Out[2]:
                Sex  Pclass Age_Group Embarked  SibSp  ParCh  Survived
PassengerId
1              male       3     Adult        S      1      0         0
2            female       1     Adult        C      1      0         1
3            female       3     Adult        S      0      0         1
4            female       1     Adult        S      1      0         1
5              male       3     Adult        S      0      0         0

Naive Bayes from Scratch

In [3]:
Image(filename='../../images/Naive Bayes algorithm.png', width=1000)
Out[3]:
[Image: flowchart of the Naive Bayes algorithm]
Step 1 of the Algorithm: Create the Lookup Table

In [4]:
# hand-made example of a lookup table: for each feature value, the
# conditional probabilities P(value | class), ordered like "class_names"
example_table = {
    "Sex": {"female": [0.15, 0.68],
            "male": [0.85, 0.32]},

    "Pclass": {1: [0.15, 0.40],
               2: [0.18, 0.25],
               3: [0.68, 0.35]},

    "class_names": [0, 1],
    "class_counts": [549, 342]
}
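
To see how such a table is used, here is a quick hand-calculation (a minimal sketch, not part of the algorithm below) for a hypothetical female passenger in first class: multiply the class counts by the matching conditional probabilities and pick the class with the larger score.

In [ ]:
import numpy as np

# score both classes for a hypothetical female passenger in 1st class:
# start with the class counts, then multiply in P(value | class) per feature
scores = np.array(example_table["class_counts"], dtype=float)
scores = scores * np.array(example_table["Sex"]["female"])  # P(female | class)
scores = scores * np.array(example_table["Pclass"][1])      # P(Pclass=1 | class)

print(scores)                                          # [12.3525 93.024 ]
print(example_table["class_names"][scores.argmax()])   # 1 -> survived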
In [5]:
def create_table(df, label_column):
    table = {}

    # determine values for the label
    value_counts = df[label_column].value_counts().sort_index()
    table["class_names"] = value_counts.index.to_numpy()
    table["class_counts"] = value_counts.values

    # determine probabilities for the features
    for feature in df.drop(label_column, axis=1).columns:
        table[feature] = {}

        # determine counts
        counts = df.groupby(label_column)[feature].value_counts()
        df_counts = counts.unstack(label_column)

        # Laplace smoothing: add one to every count so that a value unseen
        # for one class does not zero out the product ("problem of rare values")
        if df_counts.isna().any(axis=None):
            df_counts.fillna(value=0, inplace=True)
            df_counts += 1

        # calculate probabilities
        df_probabilities = df_counts / df_counts.sum()
        for value in df_probabilities.index:
            probabilities = df_probabilities.loc[value].to_numpy()
            table[feature][value] = probabilities
            
    return table
In [6]:
lookup_table = create_table(df_train, label_column="Survived")
pprint(lookup_table)
{'Age_Group': {'Adult': array([0.61748634, 0.61695906]),
               'Child': array([0.05282332, 0.11695906]),
               'Teenager': array([0.10200364, 0.11403509]),
               'Unknown': array([0.2276867 , 0.15204678])},
 'Embarked': {'C': array([0.13661202, 0.27192982]),
              'Q': array([0.0856102, 0.0877193]),
              'S': array([0.77777778, 0.64035088])},
 'ParCh': {0: array([0.80215827, 0.67048711]),
           1: array([0.0971223 , 0.18911175]),
           2: array([0.07374101, 0.11747851]),
           3: array([0.00539568, 0.01146132]),
           4: array([0.00899281, 0.00286533]),
           5: array([0.00899281, 0.00573066]),
           6: array([0.00359712, 0.00286533])},
 'Pclass': {1: array([0.14571949, 0.39766082]),
            2: array([0.17668488, 0.25438596]),
            3: array([0.67759563, 0.34795322])},
 'Sex': {'female': array([0.14754098, 0.68128655]),
         'male': array([0.85245902, 0.31871345])},
 'SibSp': {0: array([0.7176259 , 0.60458453]),
           1: array([0.17625899, 0.32378223]),
           2: array([0.02877698, 0.04011461]),
           3: array([0.02338129, 0.01432665]),
           4: array([0.02877698, 0.01146132]),
           5: array([0.01079137, 0.00286533]),
           8: array([0.01438849, 0.00286533])},
 'class_counts': array([549, 342], dtype=int64),
 'class_names': array([0, 1], dtype=int64)}
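
Each array stores the conditional probabilities in the order of class_names, so the first entry belongs to class 0 (died) and the second to class 1 (survived). For example:

In [ ]:
# P(Sex=female | died) and P(Sex=female | survived)
lookup_table["Sex"]["female"]  # array([0.14754098, 0.68128655])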

Step 2 of the Algorithm: Make Predictions

In [7]:
def predict_example(row, lookup_table):
    
    # start from the class counts (proportional to the class priors)
    class_estimates = lookup_table["class_counts"]
    for feature in row.index:

        try:
            value = row[feature]
            probabilities = lookup_table[feature][value]
            class_estimates = class_estimates * probabilities

        # skip in case "value" only occurs in test set but not in train set
        # (i.e. "value" is not in "lookup_table")
        except KeyError:
            continue

    index_max_class = class_estimates.argmax()
    prediction = lookup_table["class_names"][index_max_class]
    
    return prediction
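
With this handful of features the raw product of probabilities works fine, but with many features it can underflow to zero. A common remedy (shown here only as a sketch, not used for the results below) is to sum log-probabilities instead:

In [ ]:
import numpy as np

def predict_example_log(row, lookup_table):
    # sum log-probabilities instead of multiplying raw probabilities
    # to avoid numerical underflow when there are many features
    log_estimates = np.log(lookup_table["class_counts"].astype(float))
    for feature in row.index:
        try:
            log_estimates = log_estimates + np.log(lookup_table[feature][row[feature]])
        except KeyError:
            continue  # value occurs only in the test set

    index_max_class = log_estimates.argmax()
    return lookup_table["class_names"][index_max_class]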
In [8]:
predictions = df_test.apply(predict_example, axis=1, args=(lookup_table,))
predictions.head()
Out[8]:
PassengerId
892    0
893    1
894    0
895    0
896    1
dtype: int64

Check Accuracy

In [9]:
predictions_correct = predictions == test_labels
accuracy = predictions_correct.mean()
print(f"Accuracy: {accuracy:.3f}")
Accuracy: 0.766
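
As a sanity check (not part of the original comparison), the accuracy should beat the majority-class baseline, i.e. always predicting the most frequent training label:

In [ ]:
# accuracy of always predicting the most frequent label from the training set
majority_class = df_train.Survived.value_counts().idxmax()
baseline_accuracy = (test_labels == majority_class).mean()
print(f"Baseline: {baseline_accuracy:.3f}")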

Comparison to Sklearn

In [10]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB
In [11]:
# data preparation
df_train = replace_strings(df_train)
X_train = df_train.drop("Survived", axis=1)
y_train = df_train.Survived

X_test = replace_strings(df_test)
y_test = test_labels
In [12]:
# use different sklearn Naive Bayes models
clfs = [GaussianNB(), MultinomialNB(), ComplementNB(), BernoulliNB()]
clfs_names = ["GaussianNB", "MultinomialNB", "ComplementNB", "BernoulliNB"]

print("NB Model\tAccuracy")
print("--------\t--------")
for clf, clf_name in zip(clfs, clfs_names):
    clf.fit(X_train, y_train)
    acc = clf.score(X_test, y_test)
    
    print(f"{clf_name}\t{acc:.3f}")
NB Model	Accuracy
--------	--------
GaussianNB	0.763
MultinomialNB	0.768
ComplementNB	0.761
BernoulliNB	0.766
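
Since the from-scratch model is a categorical Naive Bayes, sklearn's CategoricalNB is its closest counterpart. The sketch below assumes replace_strings encodes every feature as non-negative integer codes (which CategoricalNB requires); also note that, unlike the KeyError handling above, CategoricalNB may fail on category codes never seen during training.

In [ ]:
from sklearn.naive_bayes import CategoricalNB

# closest sklearn counterpart to the from-scratch model; its default
# alpha=1.0 Laplace smoothing is similar in spirit to the "+1" trick above
clf = CategoricalNB()
clf.fit(X_train, y_train)
print(f"CategoricalNB\t{clf.score(X_test, y_test):.3f}")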
In [ ]: