In [19]:
# IMPORTS
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, confusion_matrix, classification_report,
                             precision_recall_fscore_support, roc_curve, auc)
from sklearn.decomposition import PCA
from sklearn.impute import KNNImputer
# suppress FutureWarnings raised by pandas about upcoming behavior changes
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# loading the dataset
file_path = "heart_disease_uci.csv"
# separate copies of the data for each modeling variant below
# binary
df = pd.read_csv(file_path)
# multiclass
df2 = df.copy()
# untuned
dforiginal = df.copy()
# binary using KNN for missing values
df_KNN = df.copy()
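# hedged sanity check (not part of the original run): summarize missing values per column
# before any imputation, to motivate the mean/mode and KNN strategies used below
print(df.isna().sum().sort_values(ascending=False))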
In [20]:
# Feature Distribution Visualization
# Written by Klayton DalPra
# plot histograms for continuous numerical variables
continuous_vars = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak']
better_labels = [
'Age',
'Resting Blood Pressure',
'Cholesterol',
'Max Heart Rate',
'ST Depression / Lack of O2 to Heart (oldpeak; higher values suggest ischemia)'
]
fig, axes = plt.subplots(3, 2, figsize=(12, 12))
axes = axes.flatten()
# plot each continuous variable
for i, var in enumerate(continuous_vars):
df[var].hist(ax=axes[i], bins=20, edgecolor='black')
axes[i].set_title(better_labels[i])
axes[i].set_xlabel(var)
axes[i].set_ylabel("Frequency")
# plot the distribution of heart disease stages (0 to 4)
class_counts = df['num'].value_counts().sort_index()
class_labels = ['No Disease', 'Stage 1', 'Stage 2', 'Stage 3', 'Stage 4']
axes[5].bar(class_counts.index, class_counts.values, edgecolor='black')
axes[5].set_xticks(ticks=range(5))
axes[5].set_xticklabels(class_labels, rotation=45)
axes[5].set_title("Heart Disease Stage Distribution")
axes[5].set_xlabel("Disease Stage")
axes[5].set_ylabel("Count")
plt.suptitle("Feature Distributions and Disease Stage", fontsize=16)
plt.tight_layout()
plt.show()
# compute correlation matrix
correlation_matrix = df.corr(numeric_only=True)
# plot heatmap of correlations
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title("Feature Correlation Heatmap")
plt.show()
In [21]:
# Untuned Logistic Regression Binary Classification Model
# used to convert categorical variables to numerical values
cp_mapping = {'typical angina': 0, 'atypical angina': 1, 'non-anginal': 2, 'asymptomatic': 3}
restecg_mapping = {'normal': 0, 'st-t abnormality': 1, 'lv hypertrophy': 2}
slope_mapping = {'downsloping': 0, 'flat': 1, 'upsloping': 2}
thal_mapping = {'normal': 0, 'fixed defect': 1, 'reversable defect': 2}
# Handling Missing Values
# Written by Klayton DalPra
# fill numerical missing values with the avg value of the column
dforiginal.fillna(dforiginal.mean(numeric_only=True), inplace=True)
# fill categorical missing values with the most common value
categorical_columns_all = [ 'cp', 'slope', 'thal', 'sex', 'exang', 'restecg', 'fbs']
for col in categorical_columns_all:
most_common = dforiginal[col].mode()[0] # most common entry in the column
dforiginal[col] = dforiginal[col].fillna(most_common).infer_objects(copy=False)
# PREDICT STAGE (comment out) OR PREDICT YES/NO (leave in)
dforiginal['num'] = dforiginal['num'].apply(lambda x: 1 if x > 0 else 0) # converts stages 1-4 to 1 (any disease present)
# convert gender to 0 and 1
dforiginal['sex'] = dforiginal['sex'].map({'Male': 0, 'Female': 1}).astype(int)
# convert chest pain type to 0, 1, 2, and 3
dforiginal['cp'] = dforiginal['cp'].map(cp_mapping).astype(int)
# convert fasting blood sugar > 120mg to 0 and 1
dforiginal['fbs'] = dforiginal['fbs'].map({False: 0, True: 1}).astype(int)
#convert restecg
dforiginal['restecg'] = dforiginal['restecg'].map(restecg_mapping).astype(int)
# convert exang
dforiginal['exang'] = dforiginal['exang'].map({False: 0, True: 1}).astype(int)
# convert slope
dforiginal['slope'] = dforiginal['slope'].map(slope_mapping).astype(int)
# convert thal
dforiginal['thal'] = dforiginal['thal'].map(thal_mapping).astype(int)
# define features and target variable
Xoriginal = dforiginal.drop(columns=['num','dataset', 'id'])
yoriginal = dforiginal['num'] # target variable
# split data into training (80%) and testing (20%) sets
X_trainoriginal, X_testoriginal, y_trainoriginal, y_testoriginal = train_test_split(Xoriginal, yoriginal, test_size=0.2, random_state=42)
# initialize and train logistic regression model
log_regoriginal = LogisticRegression(max_iter=1000, solver='liblinear')
log_regoriginal.fit(X_trainoriginal, y_trainoriginal)
# make predictions on test set
y_predoriginal = log_regoriginal.predict(X_testoriginal)
# evaluate model performance
accuracyoriginal = accuracy_score(y_testoriginal, y_predoriginal)
conf_matrixoriginal = confusion_matrix(y_testoriginal, y_predoriginal)
class_reportoriginal = classification_report(y_testoriginal, y_predoriginal, zero_division=1)
# print results
print("\nUntuned Logistic Regression Model Evaluation:")
print(f"Accuracy: {accuracyoriginal:.2f}")
print("Untuned Classification Report:")
print(class_reportoriginal)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrixoriginal, annot=True, cmap="Blues", fmt="d", xticklabels=np.unique(y_testoriginal), yticklabels=np.unique(y_testoriginal))
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Untuned Logistic Regression Confusion Matrix Heatmap")
plt.show()
Untuned Logistic Regression Model Evaluation:
Accuracy: 0.80
Untuned Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.83      0.78        75
           1       0.87      0.79      0.83       109

    accuracy                           0.80       184
   macro avg       0.80      0.81      0.80       184
weighted avg       0.81      0.80      0.81       184
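# hedged sketch (not part of the original run): majority-class baseline accuracy,
# so the untuned model's 0.80 above can be judged against always predicting the majority class
print(f"Majority-class baseline accuracy: {y_testoriginal.value_counts(normalize=True).max():.2f}")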
In [22]:
# Visualize Decision Boundary
# Written by Klayton DalPra
# reduce training and test data to 2D
pca_bin = PCA(n_components=2)
X_train_2D = pca_bin.fit_transform(X_trainoriginal)
X_test_2D = pca_bin.transform(X_testoriginal)
# train a new logistic regression model on the 2D projected data
log_reg_2D = LogisticRegression()
log_reg_2D.fit(X_train_2D, y_trainoriginal)
# create a mesh grid over the PCA 2D space
x_min, x_max = X_train_2D[:, 0].min() - 1, X_train_2D[:, 0].max() + 1
y_min, y_max = X_train_2D[:, 1].min() - 1, X_train_2D[:, 1].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 300), np.linspace(y_min, y_max, 300))
grid_points = np.c_[xx.ravel(), yy.ravel()]
# predict on the grid to get decision regions
Z = log_reg_2D.predict(grid_points).reshape(xx.shape)
# plot decision boundary and test points
plt.figure(figsize=(10, 6))
plt.contourf(xx, yy, Z, alpha=0.3, cmap="bwr") # background decision regions
plt.scatter(X_test_2D[:, 0], X_test_2D[:, 1], c=y_testoriginal, edgecolor='k', cmap="bwr", alpha=0.8)
plt.title("Untuned Logistic Regression Decision Boundary (PCA Projection)")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.colorbar(label="True Class (0 or 1)")
plt.grid(True)
plt.show()
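# hedged sketch (not part of the original run): report how much variance the 2-component
# projection retains; the decision boundary above is only an approximation in this reduced space
print(f"Variance explained by 2 PCA components: {pca_bin.explained_variance_ratio_.sum():.2%}")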
In [23]:
# Logistic Regression Model Binary Tuned
# Written by Klayton DalPra
# Handling Missing Values
# fill numerical missing values with the median of the column
df.fillna(df.median(numeric_only=True), inplace=True)
# fill categorical missing values with the most common value
categorical_columns = [ 'cp', 'slope', 'thal', 'sex', 'exang']
for col in categorical_columns:
most_common = df[col].mode()[0] # most common entry in the column
df[col] = df[col].fillna(most_common).infer_objects(copy=False)
# PREDICT STAGE (comment out) OR PREDICT YES/NO (leave in)
df['num'] = df['num'].apply(lambda x: 1 if x > 0 else 0)
# convert gender to 0 and 1
df['sex'] = df['sex'].map({'Male': 0, 'Female': 1}).astype(int)
# convert chest pain type to 0, 1, 2, and 3
df['cp'] = df['cp'].map(cp_mapping).astype(int)
# convert exang
df['exang'] = df['exang'].map({False: 0, True: 1}).astype(int)
# convert slope
df['slope'] = df['slope'].map(slope_mapping).astype(int)
# convert thal
df['thal'] = df['thal'].map(thal_mapping).astype(int)
# define features and target variable
X = df.drop(columns=['num','dataset','fbs', 'restecg', 'id'])  # fbs and restecg dropped due to low mutual information found in Jake Shinohara's Random Forest section of the project (see the sketch below)
y = df['num'] # target variable
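# hedged sketch (assumption: reusing the fully encoded dforiginal frame from the untuned
# model above) illustrating the kind of mutual-information ranking that motivated
# dropping 'fbs' and 'restecg'
from sklearn.feature_selection import mutual_info_classif
X_mi = dforiginal.drop(columns=['num', 'dataset', 'id'])
mi_scores = pd.Series(mutual_info_classif(X_mi, dforiginal['num'], random_state=42),
                      index=X_mi.columns).sort_values(ascending=False)
print("Mutual information with the binary target:")
print(mi_scores)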
# split data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Normalize Continuous Variables
scaler = StandardScaler()
# fit and transform the training set
X_train = scaler.fit_transform(X_train)
# transform the test set
X_test = scaler.transform(X_test)
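# StandardScaler standardizes each feature as z = (x - mean) / std, with mean and std
# estimated on the training set only so no test-set information leaks into the model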
# initialize and train logistic regression model
log_reg = LogisticRegression(max_iter=5000, solver='liblinear', class_weight='balanced')
log_reg.fit(X_train, y_train)
# make predictions on test set
y_pred = log_reg.predict(X_test)
# evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred, zero_division=1)
# print results
print("\nTuned Binary Logistic Regression Model Evaluation:")
print(f"Accuracy: {accuracy:.2f}")
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, cmap="Blues", fmt="d", xticklabels=np.unique(y_test), yticklabels=np.unique(y_test))
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Tuned Logistic Regression Binary Confusion Matrix Heatmap")
plt.show()
Tuned Binary Logistic Regression Model Evaluation:
Accuracy: 0.82
In [24]:
# show classification report and bar graph visualization
print("Tuned Binary Classification Report:")
print(class_report)
# get precision, recall, and f1-score for each class
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, zero_division=1)
# dataframe for metrics
metrics_df = pd.DataFrame({"Precision": precision, "Recall": recall, "F1-Score": f1}, index=np.unique(y_test))
# plot the metrics
metrics_df.plot(kind="bar", figsize=(10, 6))
plt.title("Precision, Recall, and F1-Score per Class (Tuned)")
plt.xlabel("Class Label")
plt.ylabel("Score")
plt.xticks(rotation=0)
plt.legend(loc="best")
plt.grid(axis="y")
plt.show()
Tuned Binary Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.82      0.80        82
           1       0.85      0.82      0.84       102

    accuracy                           0.82       184
   macro avg       0.82      0.82      0.82       184
weighted avg       0.82      0.82      0.82       184
In [25]:
# ROC Curve
y_probs = log_reg.predict_proba(X_test)[:, 1]
# Calculate ROC curve
fpr, tpr, _ = roc_curve(y_test, y_probs)
roc_auc = auc(fpr, tpr)
# Plot ROC curve
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve - Logistic Regression')
ax.legend(loc='lower right')
Out[25]:
<matplotlib.legend.Legend at 0x2a838d40290>
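# hedged sketch (not part of the original run): choose an operating threshold from the same
# ROC curve by maximizing Youden's J statistic (TPR - FPR)
fpr, tpr, thresholds = roc_curve(y_test, y_probs)
best_idx = np.argmax(tpr - fpr)
print(f"Threshold maximizing TPR - FPR: {thresholds[best_idx]:.3f} "
      f"(TPR={tpr[best_idx]:.2f}, FPR={fpr[best_idx]:.2f})")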
In [26]:
# PCA projection
# Written by Klayton DalPra
# reduce training and test data to 2D
pca_bin = PCA(n_components=2)
X_train_2D = pca_bin.fit_transform(X_train)
X_test_2D = pca_bin.transform(X_test)
# train a new logistic regression model on the 2D projected data
log_reg_2D = LogisticRegression()
log_reg_2D.fit(X_train_2D, y_train)
# create a mesh grid over the PCA 2D space
x_min, x_max = X_train_2D[:, 0].min() - 1, X_train_2D[:, 0].max() + 1
y_min, y_max = X_train_2D[:, 1].min() - 1, X_train_2D[:, 1].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 300), np.linspace(y_min, y_max, 300))
grid_points = np.c_[xx.ravel(), yy.ravel()]
# predict on the grid to get decision regions
Z = log_reg_2D.predict(grid_points).reshape(xx.shape)
# plot decision boundary and test points
plt.figure(figsize=(10, 6))
plt.contourf(xx, yy, Z, alpha=0.3, cmap="bwr") # background decision regions
plt.scatter(X_test_2D[:, 0], X_test_2D[:, 1], c=y_test, edgecolor='k', cmap="bwr", alpha=0.8)
plt.title("Logistic Regression Decision Boundary (PCA Projection)")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.colorbar(label="True Class (0 or 1)")
plt.grid(True)
plt.show()
In [27]:
# regularization impact
# Written by Klayton DalPra
# test multiple values of C (regularization strength)
C_vals = [0.01, 0.1, 1, 10, 100]
accs = []
for C in C_vals:
model = LogisticRegression(C=C, max_iter=5000, solver='liblinear', class_weight='balanced')
model.fit(X_train, y_train)
accs.append(accuracy_score(y_test, model.predict(X_test)))
# Plot the different accuracy scores for different C values
plt.plot(C_vals, accs, marker='o')
plt.xscale('log')
plt.xticks(C_vals, labels=[str(c) for c in C_vals]) # Force decimal labels
plt.title("Effect of Regularization (C) on Accuracy")
plt.xlabel("C (Inverse of Regularization Strength)")
plt.ylabel("Accuracy")
plt.grid(True)
plt.show()
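# hedged sketch (not part of the original run): choosing C on the held-out test set risks
# overfitting to that split; 5-fold cross-validation on the training data is a safer alternative
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(
    LogisticRegression(max_iter=5000, solver='liblinear', class_weight='balanced'),
    param_grid={'C': C_vals}, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)
print(f"Best C by 5-fold CV: {grid.best_params_['C']} (mean CV accuracy {grid.best_score_:.3f})")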
In [28]:
# attempt to optimize the model further using K nearest neighbors for missing values
# Written by Klayton DalPra
# source: https://scikit-learn.org/stable/modules/generated/sklearn.impute.KNNImputer.html
# column conversions to numbers as done before
df_KNN['sex'] = df_KNN['sex'].map({'Male': 0, 'Female': 1})
df_KNN['exang'] = df_KNN['exang'].map({False: 0, True: 1})
df_KNN['cp'] = df_KNN['cp'].map(cp_mapping)
df_KNN['slope'] = df_KNN['slope'].map(slope_mapping)
df_KNN['thal'] = df_KNN['thal'].map(thal_mapping)
df_KNN['restecg'] = df_KNN['restecg'].map(restecg_mapping)
df_KNN['fbs'] = df_KNN['fbs'].map({False: 0, True: 1})
# columns to impute. Using all columns to help the KNN algorithm
impute_cols = ['cp', 'slope', 'thal', 'sex', 'exang', 'age', 'chol',
'oldpeak', 'thalch', 'trestbps', 'fbs', 'restecg', 'ca']
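# KNNImputer (default behavior): each missing entry is replaced by the uniform mean of that
# feature over the k nearest rows, using a nan-aware Euclidean distance computed from the
# features both rows have observed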
# find the best K number of neighbors
best_k = None
best_acc = 0
for k in range(1, 11):
imputer = KNNImputer(n_neighbors=k)
knn_imputed = pd.DataFrame(imputer.fit_transform(df_KNN[impute_cols]), columns=impute_cols)
for col in ['cp', 'slope', 'thal', 'sex', 'exang', 'fbs', 'restecg']:
knn_imputed[col] = knn_imputed[col].round().astype(int)
# use imputed values in the dataframe
df_temp = df_KNN.copy()
df_temp['num'] = df_temp['num'].apply(lambda x: 1 if x > 0 else 0)
df_temp[impute_cols] = knn_imputed
# drop restecg and fbs from the feature set
X_temp = df_temp.drop(columns=['num', 'dataset', 'restecg', 'fbs', 'id'])
y_temp = df_temp['num']
# train-test split
X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(
X_temp, y_temp, test_size=0.2, random_state=42, stratify=y_temp)
# normalize
scaler_temp = StandardScaler()
X_train_temp = scaler_temp.fit_transform(X_train_temp)
X_test_temp = scaler_temp.transform(X_test_temp)
# train logistic regression
model = LogisticRegression(C = 0.01, max_iter=5000, solver='liblinear', class_weight='balanced')
model.fit(X_train_temp, y_train_temp)
acc = accuracy_score(y_test_temp, model.predict(X_test_temp))
print(f"K={k}: Accuracy={acc:.4f}")
if acc > best_acc:
best_acc = acc
best_k = k
print(f"\nBest k for KNNImputer: {best_k} with accuracy {best_acc:.4f}")
K=1: Accuracy=0.8261
K=2: Accuracy=0.8261
K=3: Accuracy=0.8207
K=4: Accuracy=0.8315
K=5: Accuracy=0.8370
K=6: Accuracy=0.8261
K=7: Accuracy=0.8261
K=8: Accuracy=0.8261
K=9: Accuracy=0.8261
K=10: Accuracy=0.8261

Best k for KNNImputer: 5 with accuracy 0.8370
In [29]:
# gather metrics for the best KNN imputer
# Written by Klayton DalPra
# re-impute the data using best_k
knn_imputer = KNNImputer(n_neighbors=best_k)
knn_imputed = pd.DataFrame(knn_imputer.fit_transform(df_KNN[impute_cols]), columns=impute_cols)
for col in ['cp', 'slope', 'thal', 'sex', 'exang', 'fbs', 'restecg']:
knn_imputed[col] = knn_imputed[col].round().astype(int)
# apply imputed data back to full DataFrame
df_KNN[impute_cols] = knn_imputed
# convert stages to 0 and 1
df_KNN['num'] = df_KNN['num'].apply(lambda x: 1 if x > 0 else 0)
# drop fbs and restecg
X_KNN = df_KNN.drop(columns=['num', 'dataset', 'fbs', 'restecg', 'id'])
y_KNN = df_KNN['num']
# split dataset into train and test sets
X_train_KNN, X_test_KNN, y_train_KNN, y_test_KNN = train_test_split(
X_KNN, y_KNN, test_size=0.2, random_state=42, stratify=y_KNN
)
# normalize
scaler = StandardScaler()
X_train_KNN = scaler.fit_transform(X_train_KNN)
X_test_KNN = scaler.transform(X_test_KNN)
# train logistic regression
log_reg_KNN = LogisticRegression(max_iter=5000, solver='liblinear', class_weight='balanced')
log_reg_KNN.fit(X_train_KNN, y_train_KNN)
# predict
y_pred_KNN = log_reg_KNN.predict(X_test_KNN)
# accuracy and classification report
accuracy = accuracy_score(y_test_KNN, y_pred_KNN)
print(f"\nFinal Accuracy with KNN k=5: {accuracy:.2f}")
print("Classification Report:")
print(classification_report(y_test_KNN, y_pred_KNN, zero_division=1))
# confusion matrix
conf_matrix_KNN = confusion_matrix(y_test_KNN, y_pred_KNN)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_KNN, annot=True, cmap="Blues", fmt="d",
xticklabels=np.unique(y_test_KNN), yticklabels=np.unique(y_test_KNN))
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Logistic Regression Confusion Matrix Heatmap (Tuned with KNN Imputation)")
plt.show()
Final Accuracy with KNN k=5: 0.84
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.83      0.82        82
           1       0.86      0.85      0.86       102

    accuracy                           0.84       184
   macro avg       0.84      0.84      0.84       184
weighted avg       0.84      0.84      0.84       184
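# hedged sketch (not part of the original run): 5-fold cross-validated accuracy of the same
# scaler + model on the full KNN-imputed data, as a check that the single-split gain holds up
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
cv_scores = cross_val_score(
    make_pipeline(StandardScaler(),
                  LogisticRegression(max_iter=5000, solver='liblinear', class_weight='balanced')),
    X_KNN, y_KNN, cv=5, scoring='accuracy')
print(f"5-fold CV accuracy: {cv_scores.mean():.3f} +/- {cv_scores.std():.3f}")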
In [30]:
# PCA projection for KNN imputation model
# Written by Klayton DalPra
# reduce training and test data to 2D
pca_bin = PCA(n_components=2)
X_train_2D = pca_bin.fit_transform(X_train_KNN)
X_test_2D = pca_bin.transform(X_test_KNN)
log_reg_2D = LogisticRegression()
log_reg_2D.fit(X_train_2D, y_train_KNN)
x_min, x_max = X_train_2D[:, 0].min() - 1, X_train_2D[:, 0].max() + 1
y_min, y_max = X_train_2D[:, 1].min() - 1, X_train_2D[:, 1].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 300), np.linspace(y_min, y_max, 300))
grid_points = np.c_[xx.ravel(), yy.ravel()]
Z = log_reg_2D.predict(grid_points).reshape(xx.shape)
plt.figure(figsize=(10, 6))
plt.contourf(xx, yy, Z, alpha=0.3, cmap="bwr")
plt.scatter(X_test_2D[:, 0], X_test_2D[:, 1], c=y_test_KNN, edgecolor='k', cmap="bwr", alpha=0.8)
plt.title("Logistic Regression Decision Boundary, KNN Imputation (PCA Projection)")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.colorbar(label="True Class (0 or 1)")
plt.grid(True)
plt.show()
In [31]:
for class_label in np.unique(y_test_KNN):
plt.figure(figsize=(10, 6))
# Plot decision regions in background
plt.contourf(xx, yy, Z, alpha=0.3, cmap="bwr")
# Show only one class of test points
    mask = (y_test_KNN == class_label)
plt.scatter(X_test_2D[mask, 0], X_test_2D[mask, 1],
c=['blue' if class_label == 0 else 'red'],
edgecolor='k', s=60, label=f"Class {class_label}", alpha=0.8)
plt.title(f"Binary PCA Projection - Class {class_label} Only")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.legend()
plt.grid(True)
plt.show()
In [32]:
# ROC Curve
y_probs = log_reg_KNN.predict_proba(X_test_KNN)[:, 1]
# Calculate ROC curve
fpr, tpr, _ = roc_curve(y_test_KNN, y_probs)
roc_auc = auc(fpr, tpr)
# Plot ROC curve
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve - Logistic Regression (Tuned with KNN Imputation)')
ax.legend(loc='lower right')
Out[32]:
<matplotlib.legend.Legend at 0x2a8359ac9b0>
In [33]:
# Logistic Regression Model Multiclass
# Written by Klayton DalPra
# Handling Missing Values
# fill numerical missing values with the median of the column
df2.fillna(df2.median(numeric_only=True), inplace=True)
# fill categorical missing values with the most common value
for col in categorical_columns:
most_common = df2[col].mode()[0]
df2[col] = df2[col].fillna(most_common).infer_objects(copy=False)
# convert gender to 0 and 1
df2['sex'] = df2['sex'].map({'Male': 0, 'Female': 1}).astype(int)
# convert chest pain type to 0, 1, 2, and 3
df2['cp'] = df2['cp'].map(cp_mapping).astype(int)
# convert exang
df2['exang'] = df2['exang'].map({False: 0, True: 1}).astype(int)
# convert slope
df2['slope'] = df2['slope'].map(slope_mapping).astype(int)
# convert thal
df2['thal'] = df2['thal'].map(thal_mapping).astype(int)
# define features and target variable
X = df2.drop(columns=['num','dataset','fbs', 'restecg', 'id'])
y = df2['num'] # target variable
# split data into training (80%) and testing (20%) sets
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Normalize Continuous Variables (standardize each feature to zero mean and unit variance)
# this helped the model converge
scaler2 = StandardScaler()
# fit and transform the training set
X_train2 = scaler2.fit_transform(X_train2)
# Transform the test set
X_test2 = scaler2.transform(X_test2)
# initialize and train logistic regression model
log_reg2 = LogisticRegression(C=1,max_iter=5000, solver='lbfgs', class_weight='balanced',multi_class='multinomial')
log_reg2.fit(X_train2, y_train2)
# make predictions on test set
y_pred2 = log_reg2.predict(X_test2)
# evaluate model performance
accuracy2 = accuracy_score(y_test2, y_pred2)
conf_matrix2 = confusion_matrix(y_test2, y_pred2)
class_report2 = classification_report(y_test2, y_pred2, zero_division=1)
# print results
print("\nMulticlass Logistic Regression Model Evaluation:")
print(f"Accuracy: {accuracy2:.2f}")
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix2, annot=True, cmap="Blues", fmt="d", xticklabels=np.unique(y_test2), yticklabels=np.unique(y_test2))
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Multiclass Confusion Matrix Heatmap")
plt.show()
print("Multiclass Classification Report:")
print(class_report2)
Multiclass Logistic Regression Model Evaluation:
Accuracy: 0.57
Multiclass Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.79      0.82        82
           1       0.49      0.45      0.47        53
           2       0.38      0.27      0.32        22
           3       0.29      0.24      0.26        21
           4       0.19      0.83      0.31         6

    accuracy                           0.57       184
   macro avg       0.44      0.52      0.44       184
weighted avg       0.61      0.57      0.58       184
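# hedged sketch (not part of the original run): one summary number for the multiclass model,
# the macro-averaged one-vs-rest ROC AUC across the five disease stages
from sklearn.metrics import roc_auc_score
y_proba2 = log_reg2.predict_proba(X_test2)
print(f"Macro OvR ROC AUC: {roc_auc_score(y_test2, y_proba2, multi_class='ovr', average='macro'):.3f}")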
In [34]:
# get precision, recall, and f1-score for each class
precision2, recall2, f12, _ = precision_recall_fscore_support(y_test2, y_pred2, zero_division=1)
# dataframe for metrics
metrics_df2 = pd.DataFrame({"Precision": precision2, "Recall": recall2, "F1-Score": f12}, index=np.unique(y_test2))
# plot the metrics
metrics_df2.plot(kind="bar", figsize=(10, 6))
plt.title("Precision, Recall, and F1-Score per Class (Multiclass)")
plt.xlabel("Class Label")
plt.ylabel("Score")
plt.xticks(rotation=0)
plt.legend(loc="best")
plt.grid(axis="y")
plt.show()
In [35]:
# Effect of Regularization on Accuracy
# Written by Klayton DalPra
C_vals = [0.01, 0.1, 1, 10, 100]
accs = []
for C in C_vals:
model = LogisticRegression(C=C, max_iter=5000, multi_class='multinomial', solver='lbfgs')
model.fit(X_train2, y_train2)
accs.append(accuracy_score(y_test2, model.predict(X_test2)))
plt.plot(C_vals, accs, marker='o')
plt.xscale('log')
plt.xticks(C_vals, labels=[str(c) for c in C_vals])
plt.title("Effect of Regularization (C) on Accuracy (Multiclass)")
plt.xlabel("C (Inverse of Regularization Strength)")
plt.ylabel("Accuracy")
plt.grid(True)
plt.show()
In [36]:
# PCA for multiple classes
# Written by Klayton DalPra
# reduce data to 2 dimensions using PCA
pca_multi = PCA(n_components=2)
X_train_2D = pca_multi.fit_transform(X_train2)
X_test_2D = pca_multi.transform(X_test2)
# train a logistic regression model on the PCA-reduced data
log_reg_pca = LogisticRegression(C=0.1,multi_class='multinomial', solver='lbfgs', max_iter=5000)
log_reg_pca.fit(X_train_2D, y_train2)
# create a mesh grid for background classification
x_min, x_max = X_train_2D[:, 0].min() - 1, X_train_2D[:, 0].max() + 1
y_min, y_max = X_train_2D[:, 1].min() - 1, X_train_2D[:, 1].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 300),
np.linspace(y_min, y_max, 300))
grid_points = np.c_[xx.ravel(), yy.ravel()]
Z = log_reg_pca.predict(grid_points).reshape(xx.shape)
# plot the decision regions
plt.figure(figsize=(10, 6))
num_classes = 5
cmap_light = ListedColormap(sns.color_palette("pastel", num_classes).as_hex())
cmap_bold = ListedColormap(sns.color_palette("deep", num_classes).as_hex())
plt.contourf(xx, yy, Z, alpha=0.3, cmap=cmap_light)
# plot actual test points
scatter = plt.scatter(X_test_2D[:, 0], X_test_2D[:, 1], c=y_test2, cmap=cmap_bold, edgecolor='k', s=60)
plt.legend(*scatter.legend_elements(), title="True Class")
plt.title("Multiclass PCA Projection with Multiclass Decision Regions")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.grid(True)
plt.show()
In [37]:
for class_label in np.unique(y_test2):
plt.figure(figsize=(10, 6))
# Plot decision regions in background
plt.contourf(xx, yy, Z, alpha=0.3, cmap=cmap_light)
# Filter to show only one class at a time
mask = (y_test2 == class_label)
plt.scatter(X_test_2D[mask, 0], X_test_2D[mask, 1],
c=[cmap_bold(class_label)], edgecolor='k', s=60, label=f"Class {class_label}")
plt.title(f"Multiclass PCA Projection - Class {class_label} Only")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.legend()
plt.grid(True)
plt.show()
In [ ]: