import pandas as pd
from helper_functions import prepare_data, replace_strings
from pprint import pprint
from IPython.display import Image
# load data
df_train = pd.read_csv("../../data/train.csv", index_col="PassengerId")
df_test = pd.read_csv("../../data/test.csv", index_col="PassengerId")
# NOTE: the `squeeze` read_csv keyword was deprecated in pandas 1.4 and
# removed in 2.0; `.squeeze("columns")` is the supported way to collapse a
# one-column frame into a Series.
test_labels = pd.read_csv("../../data/test_labels.csv", index_col="PassengerId").squeeze("columns")

# prepare data (project helper: presumably feature selection/engineering — see helper_functions)
df_train = prepare_data(df_train)
df_test = prepare_data(df_test, train_set=False)

# handle missing values in training data: impute "Embarked" with its mode.
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection — that is chained assignment, which warns (and is a no-op under
# copy-on-write) in pandas 2.x.
embarked_mode = df_train.Embarked.mode()[0]
df_train["Embarked"] = df_train["Embarked"].fillna(embarked_mode)
df_train.head()

# notebook display only: illustration of the Naive Bayes formula
Image(filename='../../images/Naive Bayes algorithm.png', width=1000)
# Hand-built illustration of the lookup table create_table() produces:
# each feature maps every observed value to [P(value | class 0), P(value | class 1)];
# "class_counts" holds the number of training rows per class.
example_table = {
    "Sex": {
        "female": [0.15, 0.68],
        "male": [0.85, 0.32],
    },
    "Pclass": {
        1: [0.15, 0.40],
        2: [0.18, 0.25],
        3: [0.68, 0.35],
    },
    "class_names": [0, 1],
    "class_counts": [549, 342],
}
def create_table(df, label_column):
    """Build a Naive Bayes lookup table from a training DataFrame.

    Returns a dict with:
      - "class_names":  sorted unique labels (numpy array)
      - "class_counts": rows per label, aligned with "class_names"
      - one key per feature column, mapping each observed value to an
        array of P(value | class), aligned with "class_names"
    """
    # label statistics, sorted so class order is deterministic
    label_counts = df[label_column].value_counts().sort_index()
    table = {
        "class_names": label_counts.index.to_numpy(),
        "class_counts": label_counts.values,
    }

    for feature in df.drop(label_column, axis=1).columns:
        # rows = feature values, columns = classes, cells = co-occurrence counts
        counts_by_class = (
            df.groupby(label_column)[feature].value_counts().unstack(label_column)
        )
        # Laplace smoothing against the "problem of rare values": if some
        # value never co-occurs with some class, add one to every cell so no
        # conditional probability collapses to zero.
        if counts_by_class.isna().any(axis=None):
            counts_by_class = counts_by_class.fillna(0) + 1
        # normalize each class column into conditional probabilities
        conditional_probs = counts_by_class / counts_by_class.sum()
        table[feature] = {
            value: conditional_probs.loc[value].to_numpy()
            for value in conditional_probs.index
        }
    return table
# build the lookup table from the prepared training data and inspect it
lookup_table = create_table(df_train, label_column="Survived")
pprint(lookup_table)
def predict_example(row, lookup_table):
    """Classify one row (a pandas Series) with the Naive Bayes lookup table.

    Starts from the per-class counts (proportional to the class priors) and
    multiplies in P(value | class) for every feature observed in the row,
    then returns the class name with the largest score.
    """
    scores = lookup_table["class_counts"]
    for feature, value in row.items():
        try:
            scores = scores * lookup_table[feature][value]
        # skip values that occur only in the test set but never in the
        # training set — no conditional probability exists for them
        except KeyError:
            continue
    best = scores.argmax()
    return lookup_table["class_names"][best]
# classify every test row and measure accuracy against the true labels
predictions = df_test.apply(lambda row: predict_example(row, lookup_table), axis=1)
predictions.head()

# boolean Series; its mean is the fraction of correct predictions
predictions_correct = predictions == test_labels
accuracy = predictions_correct.mean()
print(f"Accuracy: {accuracy:.3f}")
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB

# data preparation: encode string columns numerically so sklearn can fit
# (replace_strings is a project helper — see helper_functions)
df_train = replace_strings(df_train)
X_train = df_train.drop("Survived", axis=1)
y_train = df_train.Survived

X_test = replace_strings(df_test)
y_test = test_labels

# fit and score each sklearn Naive Bayes variant on the same split
clfs = [GaussianNB(), MultinomialNB(), ComplementNB(), BernoulliNB()]
clfs_names = ["GaussianNB", "MultinomialNB", "ComplementNB", "BernoulliNB"]

print("NB Model\tAccuracy")
print("--------\t--------")
for clf_name, clf in zip(clfs_names, clfs):
    clf.fit(X_train, y_train)
    acc = clf.score(X_test, y_test)
    print(f"{clf_name}\t{acc:.3f}")