In [19]:
# IMPORTS
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.impute import KNNImputer
from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report,
                             precision_recall_fscore_support, roc_curve, auc)

# suppress pandas FutureWarning messages
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# loading the dataset
file_path = "heart_disease_uci.csv"

# copies of the dataset for the different model variants trained below

# tuned binary classification
df = pd.read_csv(file_path)
# multiclass classification
df2 = df.copy()

# untuned binary baseline
dforiginal = df.copy()

# binary classification using KNN imputation for missing values
df_KNN = df.copy()
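Before filling anything in, it can help to see how many values are actually missing per column. A small optional check (not part of the original notebook) on the freshly loaded df:

In [ ]:
# optional: count missing values per column
missing_counts = df.isna().sum().sort_values(ascending=False)
print(missing_counts[missing_counts > 0])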
In [20]:
# Feature Distribution Visualization
# Written by Klayton DalPra

# plot histograms for continuous numerical variables
continuous_vars = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak']
better_labels = [
    'Age',
    'Resting Blood Pressure',
    'Cholesterol',
    'Max Heart Rate',
    'ST Depression (oldpeak, ischemia indicator)'
]

fig, axes = plt.subplots(3, 2, figsize=(12, 12))
axes = axes.flatten()

# plot each continuous variable
for i, var in enumerate(continuous_vars):
    df[var].hist(ax=axes[i], bins=20, edgecolor='black')
    axes[i].set_title(better_labels[i])
    axes[i].set_xlabel(var)
    axes[i].set_ylabel("Frequency")

# plot the distribution of heart disease stages (0 to 4)
class_counts = df['num'].value_counts().sort_index()
class_labels = ['No Disease', 'Stage 1', 'Stage 2', 'Stage 3', 'Stage 4']
axes[5].bar(class_counts.index, class_counts.values, edgecolor='black')
axes[5].set_xticks(ticks=range(5))
axes[5].set_xticklabels(class_labels, rotation=45)
axes[5].set_title("Heart Disease Stage Distribution")
axes[5].set_xlabel("Disease Stage")
axes[5].set_ylabel("Count")

plt.suptitle("Feature Distributions and Disease Stage", fontsize=16)
plt.tight_layout()
plt.show()


# compute correlation matrix
correlation_matrix = df.corr(numeric_only=True)

# plot heatmap of correlations
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title("Feature Correlation Heatmap")
plt.show()
[Figure: Feature Distributions and Disease Stage]
[Figure: Feature Correlation Heatmap]
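Beyond the heatmap, it can be handy to list how strongly each numeric feature correlates with the num target on its original 0-4 scale. A small optional sketch reusing correlation_matrix from above:

In [ ]:
# optional: numeric features ranked by absolute correlation with the target
target_corr = correlation_matrix['num'].drop('num').sort_values(key=abs, ascending=False)
print(target_corr)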
In [21]:
# Untuned Logistic Regression Binary Classification Model

# mappings used to convert categorical string values to integer codes
cp_mapping = {'typical angina': 0, 'atypical angina': 1, 'non-anginal': 2, 'asymptomatic': 3}
restecg_mapping = {'normal': 0, 'st-t abnormality': 1, 'lv hypertrophy': 2}
slope_mapping = {'downsloping': 0, 'flat': 1, 'upsloping': 2}
thal_mapping = {'normal': 0, 'fixed defect': 1, 'reversable defect': 2}


# Handling Missing Values
# Written by Klayton DalPra

# fill numerical missing values with the column mean
dforiginal.fillna(dforiginal.mean(numeric_only=True), inplace=True)

# fill categorical missing values with the most common value (mode)
categorical_columns_all = ['cp', 'slope', 'thal', 'sex', 'exang', 'restecg', 'fbs']
for col in categorical_columns_all:
    most_common = dforiginal[col].mode()[0]  # most common entry in the column
    dforiginal[col] = dforiginal[col].fillna(most_common).infer_objects(copy=False)


# PREDICT YES/NO (leave in)   OR   PREDICT STAGE (comment out)
dforiginal['num'] = dforiginal['num'].apply(lambda x: 1 if x > 0 else 0)  # converts stages 1-4 to 1


# convert gender to 0 and 1
dforiginal['sex'] = dforiginal['sex'].map({'Male': 0, 'Female': 1}).astype(int)

# convert chest pain type to 0, 1, 2, and 3
dforiginal['cp'] = dforiginal['cp'].map(cp_mapping).astype(int)

# convert fasting blood sugar > 120mg to 0 and 1
dforiginal['fbs'] = dforiginal['fbs'].map({False: 0, True: 1}).astype(int)

#convert restecg
dforiginal['restecg'] = dforiginal['restecg'].map(restecg_mapping).astype(int)

# convert exang
dforiginal['exang'] = dforiginal['exang'].map({False: 0, True: 1}).astype(int)

# convert slope
dforiginal['slope'] = dforiginal['slope'].map(slope_mapping).astype(int)

# convert thal
dforiginal['thal'] = dforiginal['thal'].map(thal_mapping).astype(int)




# define features and target variable
Xoriginal = dforiginal.drop(columns=['num','dataset', 'id'])
yoriginal = dforiginal['num']  # target variable

# split data into training (80%) and testing (20%) sets
X_trainoriginal, X_testoriginal, y_trainoriginal, y_testoriginal = train_test_split(Xoriginal, yoriginal, test_size=0.2, random_state=42)



# initialize and train logistic regression model
log_regoriginal = LogisticRegression(max_iter=1000, solver='liblinear')
log_regoriginal.fit(X_trainoriginal, y_trainoriginal)

# make predictions on test set
y_predoriginal = log_regoriginal.predict(X_testoriginal)

# evaluate model performance
accuracyoriginal = accuracy_score(y_testoriginal, y_predoriginal)
conf_matrixoriginal = confusion_matrix(y_testoriginal, y_predoriginal)
class_reportoriginal = classification_report(y_testoriginal, y_predoriginal, zero_division=1)

# print results
print("\nUntuned Logistic Regression Model Evaluation:")
print(f"Accuracy: {accuracyoriginal:.2f}")

print("Untuned Classification Report:")
print(class_reportoriginal)

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrixoriginal, annot=True, cmap="Blues", fmt="d", xticklabels=np.unique(y_testoriginal), yticklabels=np.unique(y_testoriginal))
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Untuned Logistic Regression Confusion Matrix Heatmap")
plt.show()
Untuned Logistic Regression Model Evaluation:
Accuracy: 0.80
Untuned Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.83      0.78        75
           1       0.87      0.79      0.83       109

    accuracy                           0.80       184
   macro avg       0.80      0.81      0.80       184
weighted avg       0.81      0.80      0.81       184

[Figure: Untuned Logistic Regression Confusion Matrix Heatmap]
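For a rough sense of which inputs drive this baseline model, the fitted coefficients can be paired with the feature names. An optional sketch (not in the original notebook) using log_regoriginal and Xoriginal:

In [ ]:
# optional: logistic regression weights of the untuned model, sorted
coef_series = pd.Series(log_regoriginal.coef_[0], index=Xoriginal.columns).sort_values()
print(coef_series)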
In [22]:
# Visualize Decision Boundary
# Written by Klayton DalPra
# reduce training and test data to 2D
pca_bin = PCA(n_components=2)
X_train_2D = pca_bin.fit_transform(X_trainoriginal)
X_test_2D = pca_bin.transform(X_testoriginal)

# train a new logistic regression model on the 2D projected data
log_reg_2D = LogisticRegression()
log_reg_2D.fit(X_train_2D, y_trainoriginal)

# create a mesh grid over the PCA 2D space
x_min, x_max = X_train_2D[:, 0].min() - 1, X_train_2D[:, 0].max() + 1
y_min, y_max = X_train_2D[:, 1].min() - 1, X_train_2D[:, 1].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 300), np.linspace(y_min, y_max, 300))
grid_points = np.c_[xx.ravel(), yy.ravel()]

# predict on the grid to get decision regions
Z = log_reg_2D.predict(grid_points).reshape(xx.shape)

# plot decision boundary and test points
plt.figure(figsize=(10, 6))
plt.contourf(xx, yy, Z, alpha=0.3, cmap="bwr")  # background decision regions
plt.scatter(X_test_2D[:, 0], X_test_2D[:, 1], c=y_testoriginal, edgecolor='k', cmap="bwr", alpha=0.8)
plt.title("Untuned Logistic Regression Decision Boundary (PCA Projection)")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.colorbar(label="True Class (0 or 1)")
plt.grid(True)
plt.show()
[Figure: Untuned Logistic Regression Decision Boundary (PCA Projection)]
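How faithful this 2D view is depends on how much variance the first two principal components retain; a quick optional check on the fitted pca_bin:

In [ ]:
# optional: variance explained by the two PCA components
print(pca_bin.explained_variance_ratio_, pca_bin.explained_variance_ratio_.sum())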
In [23]:
# Logistic Regression Model Binary Tuned
# Written by Klayton DalPra



# Handling Missing Values

# fill numerical missing values with the median of the column
df.fillna(df.median(numeric_only=True), inplace=True)

# fill categorical missing values with the most common value

categorical_columns = ['cp', 'slope', 'thal', 'sex', 'exang']
for col in categorical_columns:
    most_common = df[col].mode()[0]  # most common entry in the column
    df[col] = df[col].fillna(most_common).infer_objects(copy=False)


# PREDICT YES/NO (leave in)   OR   PREDICT STAGE (comment out)
df['num'] = df['num'].apply(lambda x: 1 if x > 0 else 0)  # converts stages 1-4 to 1



# convert gender to 0 and 1
df['sex'] = df['sex'].map({'Male': 0, 'Female': 1}).astype(int)

# convert chest pain type to 0, 1, 2, and 3
df['cp'] = df['cp'].map(cp_mapping).astype(int)


# convert exang
df['exang'] = df['exang'].map({False: 0, True: 1}).astype(int)

# convert slope
df['slope'] = df['slope'].map(slope_mapping).astype(int)

# convert thal
df['thal'] = df['thal'].map(thal_mapping).astype(int)




# define features and target variable
X = df.drop(columns=['num','dataset','fbs', 'restecg', 'id']) #fbs and restecg dropped due to low mutual information from Jake Shinohara's Random Forest section of the project
y = df['num']  # target variable

# split data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)



# Standardize features (zero mean, unit variance)
scaler = StandardScaler()

# fit and transform the training set
X_train = scaler.fit_transform(X_train)

# transform the test set
X_test = scaler.transform(X_test)

# initialize and train logistic regression model
log_reg = LogisticRegression(max_iter=5000, solver='liblinear', class_weight='balanced')
log_reg.fit(X_train, y_train)

# make predictions on test set
y_pred = log_reg.predict(X_test)

# evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred, zero_division=1)

# print results
print("\nTuned Binary Logistic Regression Model Evaluation:")
print(f"Accuracy: {accuracy:.2f}")

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, cmap="Blues", fmt="d", xticklabels=np.unique(y_test), yticklabels=np.unique(y_test))
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Tuned Logistic Regression Binary Confusion Matrix Heatmap")
plt.show()
Tuned Binary Logistic Regression Model Evaluation:
Accuracy: 0.82
[Figure: Tuned Logistic Regression Binary Confusion Matrix Heatmap]
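The choice to drop fbs and restecg is based on the mutual-information results from the Random Forest section; as an optional cross-check (an addition, not the notebook's method), scikit-learn's mutual_info_classif can score the encoded features from the untuned cell, which still include both columns:

In [ ]:
# optional: mutual information between each encoded feature and the binary target
from sklearn.feature_selection import mutual_info_classif

mi = mutual_info_classif(Xoriginal, yoriginal, discrete_features='auto', random_state=42)
print(pd.Series(mi, index=Xoriginal.columns).sort_values())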
In [24]:
# show classification report and bar graph visualization
print("Tuned Binary Classification Report:")
print(class_report)

# get precision, recall, and f1-score for each class
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, zero_division=1)

# dataframe for metrics
metrics_df = pd.DataFrame({"Precision": precision, "Recall": recall, "F1-Score": f1}, index=np.unique(y_test))

# plot the metrics
metrics_df.plot(kind="bar", figsize=(10, 6))
plt.title("Precision, Recall, and F1-Score per Class (Tuned)")
plt.xlabel("Class Label")
plt.ylabel("Score")
plt.xticks(rotation=0)
plt.legend(loc="best")
plt.grid(axis="y")
plt.show()
Tuned Binary Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.82      0.80        82
           1       0.85      0.82      0.84       102

    accuracy                           0.82       184
   macro avg       0.82      0.82      0.82       184
weighted avg       0.82      0.82      0.82       184

[Figure: Precision, Recall, and F1-Score per Class (Tuned)]
In [25]:
# ROC Curve
y_probs = log_reg.predict_proba(X_test)[:, 1]

# Calculate ROC curve
fpr, tpr, _ = roc_curve(y_test, y_probs)
roc_auc = auc(fpr, tpr)

# Plot ROC curve
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve - Logistic Regression')
ax.legend(loc='lower right')
Out[25]:
<matplotlib.legend.Legend at 0x2a838d40290>
[Figure: ROC Curve - Logistic Regression]
In [26]:
# PCA projection
# Written by Klayton DalPra
# reduce training and test data to 2D
pca_bin = PCA(n_components=2)
X_train_2D = pca_bin.fit_transform(X_train)
X_test_2D = pca_bin.transform(X_test)

# train a new logistic regression model on the 2D projected data
log_reg_2D = LogisticRegression()
log_reg_2D.fit(X_train_2D, y_train)

# create a mesh grid over the PCA 2D space
x_min, x_max = X_train_2D[:, 0].min() - 1, X_train_2D[:, 0].max() + 1
y_min, y_max = X_train_2D[:, 1].min() - 1, X_train_2D[:, 1].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 300), np.linspace(y_min, y_max, 300))
grid_points = np.c_[xx.ravel(), yy.ravel()]

# predict on the grid to get decision regions
Z = log_reg_2D.predict(grid_points).reshape(xx.shape)

# plot decision boundary and test points
plt.figure(figsize=(10, 6))
plt.contourf(xx, yy, Z, alpha=0.3, cmap="bwr")  # background decision regions
plt.scatter(X_test_2D[:, 0], X_test_2D[:, 1], c=y_test, edgecolor='k', cmap="bwr", alpha=0.8)
plt.title("Logistic Regression Decision Boundary (PCA Projection)")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.colorbar(label="True Class (0 or 1)")
plt.grid(True)
plt.show()
[Figure: Logistic Regression Decision Boundary (PCA Projection)]
In [27]:
# regularization impact
# Written by Klayton DalPra
# test multiple values of C (regularization strength)
C_vals = [0.01, 0.1, 1, 10, 100]
accs = []

for C in C_vals:
    model = LogisticRegression(C=C, max_iter=5000, solver='liblinear', class_weight='balanced')
    model.fit(X_train, y_train)
    accs.append(accuracy_score(y_test, model.predict(X_test)))

# Plot the different accuracy scores for different C values
plt.plot(C_vals, accs, marker='o')
plt.xscale('log')
plt.xticks(C_vals, labels=[str(c) for c in C_vals])  # Force decimal labels
plt.title("Effect of Regularization (C) on Accuracy")
plt.xlabel("C (Inverse of Regularization Strength)")
plt.ylabel("Accuracy")
plt.grid(True)
plt.show()
[Figure: Effect of Regularization (C) on Accuracy]
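The sweep above picks C by scoring on the held-out test set; the same search can instead be run with cross-validation on the training data only, leaving the test set untouched. A minimal optional sketch (an addition, not the notebook's method) using GridSearchCV:

In [ ]:
# optional: choose C by 5-fold cross-validation on the training set
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(
    LogisticRegression(max_iter=5000, solver='liblinear', class_weight='balanced'),
    param_grid, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)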
In [28]:
# attempt to optimize the model further by using K-nearest-neighbors imputation for missing values
# Written by Klayton DalPra
# source: https://scikit-learn.org/stable/modules/generated/sklearn.impute.KNNImputer.html

# column conversions to numbers as done before
df_KNN['sex'] = df_KNN['sex'].map({'Male': 0, 'Female': 1})
df_KNN['exang'] = df_KNN['exang'].map({False: 0, True: 1})


df_KNN['cp'] = df_KNN['cp'].map(cp_mapping)
df_KNN['slope'] = df_KNN['slope'].map(slope_mapping)
df_KNN['thal'] = df_KNN['thal'].map(thal_mapping)
df_KNN['restecg'] = df_KNN['restecg'].map(restecg_mapping)
df_KNN['fbs'] = df_KNN['fbs'].map({False: 0, True: 1})


# columns to impute; all feature columns are included to give the KNN imputer more signal
impute_cols = ['cp', 'slope', 'thal', 'sex', 'exang', 'age', 'chol',
               'oldpeak', 'thalch', 'trestbps', 'fbs', 'restecg', 'ca']

# find the best K number of neighbors
best_k = None
best_acc = 0

for k in range(1, 11):
    imputer = KNNImputer(n_neighbors=k)
    knn_imputed = pd.DataFrame(imputer.fit_transform(df_KNN[impute_cols]), columns=impute_cols)

    for col in ['cp', 'slope', 'thal', 'sex', 'exang', 'fbs', 'restecg']:
        knn_imputed[col] = knn_imputed[col].round().astype(int)

    # apply the imputed values and convert the target to binary
    df_temp = df_KNN.copy()
    df_temp['num'] = df_temp['num'].apply(lambda x: 1 if x > 0 else 0)  # stages 1-4 -> 1
    df_temp[impute_cols] = knn_imputed

    # drop restecg and fbs from the feature set
    X_temp = df_temp.drop(columns=['num', 'dataset', 'restecg', 'fbs', 'id'])
    y_temp = df_temp['num']

    # train-test split
    X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(
        X_temp, y_temp, test_size=0.2, random_state=42, stratify=y_temp)
    
    # normalize
    scaler_temp = StandardScaler()
    X_train_temp = scaler_temp.fit_transform(X_train_temp)
    X_test_temp = scaler_temp.transform(X_test_temp)

    # train logistic regression
    model = LogisticRegression(C = 0.01, max_iter=5000, solver='liblinear', class_weight='balanced')
    model.fit(X_train_temp, y_train_temp)
    acc = accuracy_score(y_test_temp, model.predict(X_test_temp))

    print(f"K={k}: Accuracy={acc:.4f}")
    if acc > best_acc:
        best_acc = acc
        best_k = k

print(f"\nBest k for KNNImputer: {best_k} with accuracy {best_acc:.4f}")
K=1: Accuracy=0.8261
K=2: Accuracy=0.8261
K=3: Accuracy=0.8207
K=4: Accuracy=0.8315
K=5: Accuracy=0.8370
K=6: Accuracy=0.8261
K=7: Accuracy=0.8261
K=8: Accuracy=0.8261
K=9: Accuracy=0.8261
K=10: Accuracy=0.8261

Best k for KNNImputer: 5 with accuracy 0.8370
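In the loop above the imputer is fit on the full dataset before the split; the same idea can be expressed as a scikit-learn Pipeline so that imputation and scaling are fit only on the training portion. A rough optional sketch under that assumption (variable names here are illustrative, not from the notebook):

In [ ]:
# optional: k search with imputation and scaling fit only on the training data
from sklearn.pipeline import Pipeline

X_all = df_KNN.drop(columns=['num', 'dataset', 'restecg', 'fbs', 'id'])
y_all = (df_KNN['num'] > 0).astype(int)
X_tr, X_te, y_tr, y_te = train_test_split(X_all, y_all, test_size=0.2,
                                          random_state=42, stratify=y_all)

for k in range(1, 11):
    pipe = Pipeline([
        ('impute', KNNImputer(n_neighbors=k)),
        ('scale', StandardScaler()),
        ('clf', LogisticRegression(C=0.01, max_iter=5000, solver='liblinear',
                                   class_weight='balanced')),
    ])
    pipe.fit(X_tr, y_tr)
    print(f"k={k}: accuracy={pipe.score(X_te, y_te):.4f}")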
In [29]:
# gather metrics for the best KNN imputer
# Written by Klayton DalPra
# re-impute the data using best_k
knn_imputer = KNNImputer(n_neighbors=best_k)
knn_imputed = pd.DataFrame(knn_imputer.fit_transform(df_KNN[impute_cols]), columns=impute_cols)

for col in ['cp', 'slope', 'thal', 'sex', 'exang', 'fbs', 'restecg']:
    knn_imputed[col] = knn_imputed[col].round().astype(int)

# apply imputed data back to full DataFrame
df_KNN[impute_cols] = knn_imputed
# convert stages to 0 and 1
df_KNN['num'] = df_KNN['num'].apply(lambda x: 1 if x > 0 else 0)

# drop fbs and restecg
X_KNN = df_KNN.drop(columns=['num', 'dataset', 'fbs', 'restecg', 'id'])
y_KNN = df_KNN['num']

# split dataset into train and test sets
X_train_KNN, X_test_KNN, y_train_KNN, y_test_KNN = train_test_split(
    X_KNN, y_KNN, test_size=0.2, random_state=42, stratify=y_KNN
)

# normalize
scaler = StandardScaler()
X_train_KNN = scaler.fit_transform(X_train_KNN)
X_test_KNN = scaler.transform(X_test_KNN)

# train logistic regression
log_reg_KNN = LogisticRegression(max_iter=5000, solver='liblinear', class_weight='balanced')
log_reg_KNN.fit(X_train_KNN, y_train_KNN)

# predict
y_pred_KNN = log_reg_KNN.predict(X_test_KNN)

# accuracy and classification report
accuracy = accuracy_score(y_test_KNN, y_pred_KNN)
print(f"\nFinal Accuracy with KNN k=5: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_test_KNN, y_pred_KNN, zero_division=1))

# confusion matrix
conf_matrix_KNN = confusion_matrix(y_test_KNN, y_pred_KNN)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_KNN, annot=True, cmap="Blues", fmt="d",
            xticklabels=np.unique(y_test_KNN), yticklabels=np.unique(y_test_KNN))
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Logistic Regression Confusion Matrix Heatmap (Tuned with KNN Imputation)")
plt.show()
Final Accuracy with KNN k=5: 0.84
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.83      0.82        82
           1       0.86      0.85      0.86       102

    accuracy                           0.84       184
   macro avg       0.84      0.84      0.84       184
weighted avg       0.84      0.84      0.84       184

[Figure: Logistic Regression Confusion Matrix Heatmap (Tuned with KNN Imputation)]
In [30]:
# PCA projection for KNN imputation model
# Written by Klayton DalPra
# reduce training and test data to 2D
pca_bin = PCA(n_components=2)
X_train_2D = pca_bin.fit_transform(X_train_KNN)
X_test_2D = pca_bin.transform(X_test_KNN)


log_reg_2D = LogisticRegression()
log_reg_2D.fit(X_train_2D, y_train_KNN)

x_min, x_max = X_train_2D[:, 0].min() - 1, X_train_2D[:, 0].max() + 1
y_min, y_max = X_train_2D[:, 1].min() - 1, X_train_2D[:, 1].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 300), np.linspace(y_min, y_max, 300))
grid_points = np.c_[xx.ravel(), yy.ravel()]


Z = log_reg_2D.predict(grid_points).reshape(xx.shape)


plt.figure(figsize=(10, 6))
plt.contourf(xx, yy, Z, alpha=0.3, cmap="bwr")
plt.scatter(X_test_2D[:, 0], X_test_2D[:, 1], c=y_test_KNN, edgecolor='k', cmap="bwr", alpha=0.8)
plt.title("Logistic Regression Decision Boundary (PCA Projection, KNN Imputation)")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.colorbar(label="True Class (0 or 1)")
plt.grid(True)
plt.show()
[Figure: Logistic Regression Decision Boundary (PCA Projection, KNN Imputation)]
In [31]:
for class_label in np.unique(y_test_KNN):
    plt.figure(figsize=(10, 6))

    # Plot decision regions in background
    plt.contourf(xx, yy, Z, alpha=0.3, cmap="bwr")

    # Show only one class of test points
    mask = (y_test_KNN == class_label)
    plt.scatter(X_test_2D[mask, 0], X_test_2D[mask, 1], 
                c=['blue' if class_label == 0 else 'red'], 
                edgecolor='k', s=60, label=f"Class {class_label}", alpha=0.8)

    plt.title(f"Binary PCA Projection - Class {class_label} Only")
    plt.xlabel("PCA Component 1")
    plt.ylabel("PCA Component 2")
    plt.legend()
    plt.grid(True)
    plt.show()
[Figures: Binary PCA Projection, one plot per class (Class 0 and Class 1)]
In [32]:
# ROC Curve
y_probs = log_reg_KNN.predict_proba(X_test_KNN)[:, 1]

# Calculate ROC curve
fpr, tpr, _ = roc_curve(y_test_KNN, y_probs)
roc_auc = auc(fpr, tpr)

# Plot ROC curve
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve - Logistic Regression (Tuned with KNN Imputation)')
ax.legend(loc='lower right')
Out[32]:
<matplotlib.legend.Legend at 0x2a8359ac9b0>
[Figure: ROC Curve - Logistic Regression (Tuned with KNN Imputation)]
In [33]:
# Logistic Regression Model Multiclass
# Written by Klayton DalPra


# Handling Missing Values

# fill numerical missing values with the median of the column
df2.fillna(df2.median(numeric_only=True), inplace=True)

# fill categorical missing values with the most common value
for col in categorical_columns:
    most_common = df2[col].mode()[0]
    df2[col] = df2[col].fillna(most_common).infer_objects(copy=False) 


# convert gender to 0 and 1
df2['sex'] = df2['sex'].map({'Male': 0, 'Female': 1}).astype(int)

# convert chest pain type to 0, 1, 2, and 3
df2['cp'] = df2['cp'].map(cp_mapping).astype(int)

# convert exang
df2['exang'] = df2['exang'].map({False: 0, True: 1}).astype(int)

# convert slope
df2['slope'] = df2['slope'].map(slope_mapping).astype(int)

# convert thal
df2['thal'] = df2['thal'].map(thal_mapping).astype(int)




# define features and target variable
X = df2.drop(columns=['num','dataset','fbs', 'restecg', 'id'])
y = df2['num']  # target variable

# split data into training (80%) and testing (20%) sets
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)



# Standardize features (zero mean, unit variance)
# this helped the lbfgs solver converge

scaler2 = StandardScaler()

# fit and transform the training set
X_train2 = scaler2.fit_transform(X_train2)

# Transform the test set
X_test2 = scaler2.transform(X_test2)

# initialize and train logistic regression model
log_reg2 = LogisticRegression(C=1, max_iter=5000, solver='lbfgs', class_weight='balanced', multi_class='multinomial')
log_reg2.fit(X_train2, y_train2)

# make predictions on test set
y_pred2 = log_reg2.predict(X_test2)

# evaluate model performance
accuracy2 = accuracy_score(y_test2, y_pred2)
conf_matrix2 = confusion_matrix(y_test2, y_pred2)
class_report2 = classification_report(y_test2, y_pred2, zero_division=1)

# print results
print("\nMulticlass Logistic Regression Model Evaluation:")
print(f"Accuracy: {accuracy2:.2f}")

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix2, annot=True, cmap="Blues", fmt="d", xticklabels=np.unique(y_test2), yticklabels=np.unique(y_test2))
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Multiclass Confusion Matrix Heatmap")
plt.show()

print("Multiclass Classification Report:")
print(class_report2)
Multiclass Logistic Regression Model Evaluation:
Accuracy: 0.57
[Figure: Multiclass Confusion Matrix Heatmap]
Multiclass Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.79      0.82        82
           1       0.49      0.45      0.47        53
           2       0.38      0.27      0.32        22
           3       0.29      0.24      0.26        21
           4       0.19      0.83      0.31         6

    accuracy                           0.57       184
   macro avg       0.44      0.52      0.44       184
weighted avg       0.61      0.57      0.58       184

In [34]:
# get precision, recall, and f1-score for each class
precision2, recall2, f12, _ = precision_recall_fscore_support(y_test2, y_pred2, zero_division=1)

# dataframe for metrics
metrics_df2 = pd.DataFrame({"Precision": precision2, "Recall": recall2, "F1-Score": f12}, index=np.unique(y_test2))

# plot the metrics
metrics_df2.plot(kind="bar", figsize=(10, 6))
plt.title("Precision, Recall, and F1-Score per Class (Multiclass)")
plt.xlabel("Class Label")
plt.ylabel("Score")
plt.xticks(rotation=0)
plt.legend(loc="best")
plt.grid(axis="y")
plt.show()
[Figure: Precision, Recall, and F1-Score per Class (Multiclass)]
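The weak precision and recall for stages 2-4 line up with how few training examples those classes have; a quick optional check of the class counts in the multiclass training split:

In [ ]:
# optional: class counts in the multiclass training split
print(y_train2.value_counts().sort_index())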
In [35]:
# Effect of Regularization on Accuracy
# Written by Klayton DalPra
C_vals = [0.01, 0.1, 1, 10, 100]
accs = []

for C in C_vals:
    model = LogisticRegression(C=C, max_iter=5000, multi_class='multinomial', solver='lbfgs')
    model.fit(X_train2, y_train2)
    accs.append(accuracy_score(y_test2, model.predict(X_test2)))


    
plt.plot(C_vals, accs, marker='o')
plt.xscale('log')
plt.xticks(C_vals, labels=[str(c) for c in C_vals])
plt.title("Effect of Regularization (C) on Accuracy (Multiclass)")
plt.xlabel("C (Inverse of Regularization Strength)")
plt.ylabel("Accuracy")
plt.grid(True)
plt.show()
[Figure: Effect of Regularization (C) on Accuracy (Multiclass)]
In [36]:
# PCA for multiple classes

# Written by Klayton DalPra


# reduce data to 2 dimensions using PCA
pca_multi = PCA(n_components=2)
X_train_2D = pca_multi.fit_transform(X_train2)
X_test_2D = pca_multi.transform(X_test2)

# train a logistic regression model on the PCA-reduced data
log_reg_pca = LogisticRegression(C=0.1, multi_class='multinomial', solver='lbfgs', max_iter=5000)
log_reg_pca.fit(X_train_2D, y_train2)

# create a mesh grid for background classification
x_min, x_max = X_train_2D[:, 0].min() - 1, X_train_2D[:, 0].max() + 1
y_min, y_max = X_train_2D[:, 1].min() - 1, X_train_2D[:, 1].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 300),
                     np.linspace(y_min, y_max, 300))
grid_points = np.c_[xx.ravel(), yy.ravel()]
Z = log_reg_pca.predict(grid_points).reshape(xx.shape)


# plot the decision regions
plt.figure(figsize=(10, 6))
num_classes = 5
cmap_light = ListedColormap(sns.color_palette("pastel", num_classes).as_hex())
cmap_bold = ListedColormap(sns.color_palette("deep", num_classes).as_hex())

plt.contourf(xx, yy, Z, alpha=0.3, cmap=cmap_light)

# plot actual test points
scatter = plt.scatter(X_test_2D[:, 0], X_test_2D[:, 1], c=y_test2, cmap=cmap_bold, edgecolor='k', s=60)
plt.legend(*scatter.legend_elements(), title="True Class")
plt.title("Multiclass PCA Projection with Multiclass Decision Regions")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.grid(True)
plt.show()
[Figure: Multiclass PCA Projection with Multiclass Decision Regions]
In [37]:
for class_label in np.unique(y_test2):
    plt.figure(figsize=(10, 6))
    
    # Plot decision regions in background
    plt.contourf(xx, yy, Z, alpha=0.3, cmap=cmap_light)
    
    # Filter to show only one class at a time
    mask = (y_test2 == class_label)
    plt.scatter(X_test_2D[mask, 0], X_test_2D[mask, 1], 
                c=[cmap_bold(class_label)], edgecolor='k', s=60, label=f"Class {class_label}")
    
    plt.title(f"Multiclass PCA Projection - Class {class_label} Only")
    plt.xlabel("PCA Component 1")
    plt.ylabel("PCA Component 2")
    plt.legend()
    plt.grid(True)
    plt.show()
[Figures: Multiclass PCA Projection, one plot per class (Classes 0-4)]
In [ ]: