Complete ML workflow

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix, classification_report

Exploratory analysis

  1. Understand the Data Structure and Summary

    1. Load and Inspect

    2. Descriptive Statistics

  2. Analyze the Target Variable

    1. Check Data Type (Ensure your target variable is appropriately represented)

    2. Determine the frequency of each class in your binary target variable

  3. Analyze features

    1. Visualize the distributions of features

  4. Identify and Handle Missing Values

Understand the Data Structure and Summary

# Download the test dataset
! curl https://naicno.github.io/BioNT_Module2_handson/_downloads/d072ac9ebbb200e56a8125b33b887657/TCGA_InfoWithGrade.csv -o TCGA_InfoWithGrade.csv
! ls -lh TCGA_InfoWithGrade.csv
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 86432  100 86432    0     0   426k      0 --:--:-- --:--:-- --:--:--  428k
-rw-r--r-- 1 runner docker 85K Jun  5 12:07 TCGA_InfoWithGrade.csv
# Read the dataset to get the glioma dataframe
gliomas = pd.read_csv('TCGA_InfoWithGrade.csv')
gliomas.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 840 entries, 0 to 839
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Grade             839 non-null    float64
 1   Gender            840 non-null    float64
 2   Age_at_diagnosis  839 non-null    float64
 3   Race              839 non-null    float64
 4   IDH1              839 non-null    float64
 5   TP53              839 non-null    float64
 6   ATRX              839 non-null    float64
 7   PTEN              839 non-null    float64
 8   EGFR              840 non-null    float64
 9   CIC               839 non-null    float64
 10  MUC16             839 non-null    float64
 11  PIK3CA            839 non-null    float64
 12  NF1               839 non-null    float64
 13  PIK3R1            840 non-null    float64
 14  FUBP1             839 non-null    float64
 15  RB1               839 non-null    float64
 16  NOTCH1            840 non-null    float64
 17  BCOR              839 non-null    float64
 18  CSMD3             839 non-null    float64
 19  SMARCA4           839 non-null    float64
 20  GRIN2A            839 non-null    float64
 21  IDH2              840 non-null    float64
 22  FAT4              839 non-null    float64
 23  PDGFRA            840 non-null    float64
 24  ATRX_xNA          630 non-null    float64
 25  IDH1_xNA          168 non-null    float64
dtypes: float64(26)
memory usage: 170.8 KB
# Inspect the first few rows of the DataFrame
gliomas.head()
Grade Gender Age_at_diagnosis Race IDH1 TP53 ATRX PTEN EGFR CIC ... NOTCH1 BCOR CSMD3 SMARCA4 GRIN2A IDH2 FAT4 PDGFRA ATRX_xNA IDH1_xNA
0 0.0 0.0 51.30 0.0 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN
1 0.0 0.0 38.72 0.0 1.0 0.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
2 0.0 0.0 35.17 0.0 1.0 1.0 1.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 NaN
3 0.0 1.0 32.78 0.0 1.0 1.0 1.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 NaN
4 0.0 0.0 31.51 0.0 1.0 1.0 1.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN NaN

5 rows × 26 columns

# Quick summary statistics of the DataFrame
gliomas.describe()
Grade Gender Age_at_diagnosis Race IDH1 TP53 ATRX PTEN EGFR CIC ... NOTCH1 BCOR CSMD3 SMARCA4 GRIN2A IDH2 FAT4 PDGFRA ATRX_xNA IDH1_xNA
count 839.000000 840.000000 839.000000 839.000000 839.000000 839.000000 839.000000 839.000000 840.000000 839.000000 ... 840.000000 839.000000 839.000000 839.000000 839.000000 840.000000 839.000000 840.000000 630.000000 168.000000
mean 0.419547 0.417857 50.935411 0.107271 0.481526 0.414779 0.258641 0.168057 0.133333 0.132300 ... 0.045238 0.034565 0.032181 0.032181 0.032181 0.027381 0.027414 0.026190 0.265079 0.488095
std 0.493779 0.493500 15.702339 0.369392 0.499957 0.492978 0.438149 0.374140 0.340137 0.339019 ... 0.207950 0.182784 0.176586 0.176586 0.176586 0.163288 0.163383 0.159797 0.441726 0.501353
min 0.000000 0.000000 14.420000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 38.055000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 0.000000 0.000000 51.550000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
75% 1.000000 1.000000 62.800000 0.000000 1.000000 1.000000 1.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 1.000000
max 1.000000 1.000000 89.290000 3.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 ... 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000

8 rows × 26 columns

Analyze the Target Variable

# Check data types of the target variable 'Grade'
gliomas["Grade"].dtype
dtype('float64')
# Get the counts of each grade in the 'Grade' column
gliomas["Grade"].value_counts()
Grade
0.0    487
1.0    352
Name: count, dtype: int64
# Get the fractions of each grade in the 'Grade' column
gliomas["Grade"].value_counts(normalize=True)
Grade
0.0    0.580453
1.0    0.419547
Name: proportion, dtype: float64
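A quick bar chart makes this class balance visible at a glance; a minimal sketch using the seaborn import from above:

# Sketch: visualize the class balance of the target variable 'Grade'
sns.countplot(x="Grade", data=gliomas)
plt.title("Class counts of the target variable")
plt.ylabel("Count")
plt.show()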

Analyze features

# Plot the distribution of data in all columns
fig, axs = plt.subplots(nrows=5, ncols=5, figsize=(15, 20))
axs = axs.flatten()

# Iterate over each column and plot the distribution
for i, column in enumerate(gliomas.drop(columns=["Grade"]).columns):
    axs[i].hist(gliomas[column], bins=20, color='skyblue', edgecolor='black')
    axs[i].set_title(column)
    axs[i].set_xlabel('Value')
    axs[i].set_ylabel('Frequency')

plt.tight_layout()
plt.show()
[Figure: histograms of the distributions of all features]

Identify and Handle Missing Values

  • Note that np.nan values were inserted into the original dataset specifically to demonstrate techniques for handling missing data

# Calculate the % of missing values in each column
gliomas.isna().sum()/len(gliomas)*100
Grade                0.119048
Gender               0.000000
Age_at_diagnosis     0.119048
Race                 0.119048
IDH1                 0.119048
TP53                 0.119048
ATRX                 0.119048
PTEN                 0.119048
EGFR                 0.000000
CIC                  0.119048
MUC16                0.119048
PIK3CA               0.119048
NF1                  0.119048
PIK3R1               0.000000
FUBP1                0.119048
RB1                  0.119048
NOTCH1               0.000000
BCOR                 0.119048
CSMD3                0.119048
SMARCA4              0.119048
GRIN2A               0.119048
IDH2                 0.000000
FAT4                 0.119048
PDGFRA               0.000000
ATRX_xNA            25.000000
IDH1_xNA            80.000000
dtype: float64

Delete rows with missing values in target variable

# Remove rows with missing values in the 'Grade' column
gliomas.dropna(subset=["Grade"], axis=0, inplace=True)
# % of missing values in each column
gliomas.isna().sum()/len(gliomas)*100
Grade                0.000000
Gender               0.000000
Age_at_diagnosis     0.000000
Race                 0.000000
IDH1                 0.000000
TP53                 0.000000
ATRX                 0.000000
PTEN                 0.000000
EGFR                 0.000000
CIC                  0.000000
MUC16                0.000000
PIK3CA               0.000000
NF1                  0.000000
PIK3R1               0.000000
FUBP1                0.000000
RB1                  0.000000
NOTCH1               0.000000
BCOR                 0.000000
CSMD3                0.000000
SMARCA4              0.000000
GRIN2A               0.000000
IDH2                 0.000000
FAT4                 0.000000
PDGFRA               0.000000
ATRX_xNA            24.910608
IDH1_xNA            79.976162
dtype: float64
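The next step simply drops the sparse columns. As an alternative sketch (not used in the rest of this workflow), missing values could instead be imputed, for example with scikit-learn's SimpleImputer; most_frequent is a reasonable strategy here because the affected columns are binary:

# Alternative sketch: impute missing values instead of dropping sparse columns
# (illustrative only; the workflow below continues by dropping the columns;
#  in a full pipeline the imputer would be fit on the training split only)
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="most_frequent")
gliomas_imputed = pd.DataFrame(
    imputer.fit_transform(gliomas), columns=gliomas.columns
)
print(gliomas_imputed.isna().sum().sum())  # expect 0 remaining missing values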

Keep columns with at least 95% non-missing values

threshold = int(0.95 * len(gliomas))
# Keep columns with at least 95% non-missing values
gliomas.dropna(thresh=threshold, axis=1, inplace=True)
# % of missing values in each column
gliomas.isna().sum()/len(gliomas)*100
Grade               0.0
Gender              0.0
Age_at_diagnosis    0.0
Race                0.0
IDH1                0.0
TP53                0.0
ATRX                0.0
PTEN                0.0
EGFR                0.0
CIC                 0.0
MUC16               0.0
PIK3CA              0.0
NF1                 0.0
PIK3R1              0.0
FUBP1               0.0
RB1                 0.0
NOTCH1              0.0
BCOR                0.0
CSMD3               0.0
SMARCA4             0.0
GRIN2A              0.0
IDH2                0.0
FAT4                0.0
PDGFRA              0.0
dtype: float64
# Plot the distribution of data in all columns
fig, axs = plt.subplots(nrows=5, ncols=5, figsize=(15, 20))
axs = axs.flatten()

# Iterate over each column and plot the distribution
for i, column in enumerate(gliomas.drop(columns=["Grade"]).columns):
    axs[i].hist(gliomas[column], bins=20, color='skyblue', edgecolor='black')
    axs[i].set_title(column)
    axs[i].set_xlabel('Value')
    axs[i].set_ylabel('Frequency')

plt.tight_layout()
plt.show()
[Figure: histograms of the feature distributions after removing sparse columns]
# Drop the 'Race' column
gliomas.drop(columns=["Race"], inplace=True)

When a category within a feature has very few samples (e.g., “Race”), a model might learn patterns from these specific few instances that are not generalizable to the wider population or new data. It essentially “memorizes” these rare cases and their associated outcomes, which can lead to poor performance on unseen data.
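A quick way to spot such sparse categories before deciding to drop a feature is to inspect its value counts; a minimal sketch (it re-reads the raw file, since 'Race' was already dropped in the cell above):

# Sketch: spot sparse categories before deciding to drop a feature
raw = pd.read_csv('TCGA_InfoWithGrade.csv')
print(raw["Race"].value_counts())
print(raw["Race"].value_counts(normalize=True).round(3))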

Split original dataset

X_train, X_test, y_train, y_test = train_test_split(
    gliomas.drop("Grade", axis=1),
    gliomas["Grade"],
    test_size=0.3,
    random_state=42,
    stratify=gliomas["Grade"],
)
print("X_train:", X_train.shape, "X_test:", X_test.shape, "y_train:", y_train.shape, "y_test:", y_test.shape)
X_train: (587, 22) X_test: (252, 22) y_train: (587,) y_test: (252,)
# Visualize the distribution of Age_at_diagnosis in the training and test sets

fig, axes = plt.subplots(1, 2, figsize=(10, 5))

sns.histplot(X_train[['Age_at_diagnosis']], ax=axes[0])
axes[0].set_title('Training Set')

sns.histplot(X_test[['Age_at_diagnosis']], ax=axes[1])
axes[1].set_title('Test Set')

plt.tight_layout()
plt.show()
[Figure: Age_at_diagnosis distribution in the training and test sets, before scaling]
# Scale the 'Age_at_diagnosis' column with StandardScaler:
## fit the scaler on the training column only, then apply the same
## transformation to the test column (fitting on train only avoids data leakage)
scaler = StandardScaler()
X_train['Age_at_diagnosis'] = scaler.fit_transform(X_train[['Age_at_diagnosis']])
X_test['Age_at_diagnosis'] = scaler.transform(X_test[['Age_at_diagnosis']])
# Visualize the distribution of Age_at_diagnosis in the training and test sets

fig, axes = plt.subplots(1, 2, figsize=(10, 5))

sns.histplot(X_train[['Age_at_diagnosis']], ax=axes[0])
axes[0].set_title('Training Set')

sns.histplot(X_test[['Age_at_diagnosis']], ax=axes[1])
axes[1].set_title('Test Set')

plt.suptitle('Age at Diagnosis Distribution after scaling')
plt.show()
[Figure: Age_at_diagnosis distribution in the training and test sets, after scaling]
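As a quick sanity check, the scaled training column should now have mean roughly 0 and standard deviation roughly 1; the test column will be close, but not exact, because it was transformed with the training statistics:

# Sanity check: after scaling, the training column should be approximately N(0, 1)
print(f"train mean={X_train['Age_at_diagnosis'].mean():.3f}, std={X_train['Age_at_diagnosis'].std():.3f}")
print(f"test  mean={X_test['Age_at_diagnosis'].mean():.3f}, std={X_test['Age_at_diagnosis'].std():.3f}")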

Logistic regression model

lr = LogisticRegression()

# Fit the model
lr.fit(X_train, y_train)
LogisticRegression()
# Predict the glioma grade for the unseen test set
lr.predict(X_test)
array([1., 1., 0., 1., 0., 1., 1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
       1., 1., 1., 0., 1., 0., 1., 0., 1., 1., 1., 1., 0., 0., 1., 0., 0.,
       1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 1., 1., 1., 0., 1.,
       0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
       1., 0., 1., 0., 1., 1., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 1.,
       0., 0., 1., 1., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 1., 1., 0., 1., 1., 1., 0., 1., 0., 1., 1., 0., 1., 1., 1., 1.,
       0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 1., 1.,
       0., 0., 0., 1., 1., 0., 0., 1., 1., 1., 1., 0., 0., 1., 0., 0., 0.,
       1., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 1., 1., 0., 1., 1.,
       0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0.,
       1., 0., 0., 0., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 0., 1., 1.,
       0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 0., 1., 0., 1., 1., 1.,
       1., 0., 1., 0., 1., 0., 1., 1., 0., 1., 1., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0.])
# Examine and understand the importance of features in predicting glioma type
feature_list = gliomas.columns[1:]
print("Number of Features", len(feature_list))
print("Features", feature_list)
Number of Features 22
Features Index(['Gender', 'Age_at_diagnosis', 'IDH1', 'TP53', 'ATRX', 'PTEN', 'EGFR',
       'CIC', 'MUC16', 'PIK3CA', 'NF1', 'PIK3R1', 'FUBP1', 'RB1', 'NOTCH1',
       'BCOR', 'CSMD3', 'SMARCA4', 'GRIN2A', 'IDH2', 'FAT4', 'PDGFRA'],
      dtype='object')
# Create a DataFrame of the coefficients and their corresponding features
coefficients = pd.DataFrame(lr.coef_.T, index=feature_list, columns=['Coefficient'])
coefficients.sort_values(by='Coefficient', ascending=False)
Coefficient
GRIN2A 1.056014
PIK3R1 0.946648
TP53 0.942277
PTEN 0.797627
Age_at_diagnosis 0.673566
MUC16 0.604058
PDGFRA 0.445156
BCOR 0.354251
CSMD3 0.273943
RB1 0.156225
PIK3CA 0.059625
Gender 0.033080
FAT4 -0.007242
ATRX -0.192868
FUBP1 -0.409964
SMARCA4 -0.414214
EGFR -0.537785
CIC -0.780298
NF1 -0.836192
NOTCH1 -1.459100
IDH2 -1.733579
IDH1 -3.286614
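Because this is a logistic regression, the coefficients are on the log-odds scale; exponentiating them gives odds ratios, which are often easier to interpret. A small sketch (the Odds_Ratio column is added here for illustration only):

# Sketch: convert log-odds coefficients to odds ratios for interpretation
coefficients["Odds_Ratio"] = np.exp(coefficients["Coefficient"])
coefficients.sort_values(by="Odds_Ratio", ascending=False)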

Evaluate model performance

# Predict on test set
y_pred = lr.predict(X_test)

# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Plot confusion matrix as heatmap
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()
[Figure: confusion matrix heatmap for the test set]
# Generate classification report
report = classification_report(y_test, y_pred)
print(report)
              precision    recall  f1-score   support

         0.0       0.91      0.85      0.88       146
         1.0       0.81      0.89      0.85       106

    accuracy                           0.87       252
   macro avg       0.86      0.87      0.86       252
weighted avg       0.87      0.87      0.87       252
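The imports at the top also include roc_curve and auc, which are not used above; as a sketch, an ROC curve for this test set can be drawn from the model's predicted probabilities:

# Sketch: ROC curve and AUC for the fitted logistic regression on the test set
y_scores = lr.predict_proba(X_test)[:, 1]      # probability of class 1
fpr, tpr, thresholds = roc_curve(y_test, y_scores)
roc_auc = auc(fpr, tpr)

plt.plot(fpr, tpr, label=f"ROC curve (AUC = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], linestyle="--", color="grey")  # chance level
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend(loc="lower right")
plt.show()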

The problems with a single test dataset (holdout set) in model validation

  • Using different random seeds can lead to different results even when using the same model and dataset (illustrated in the sketch after this list)

  • The variability in results makes it difficult to accurately assess the model’s true performance and generalizability

  • Cross-validation is proposed as the gold-standard solution to overcome the limitations of holdout sets in model validation
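The sketch below illustrates the first point: the same model and dataset, split with different random seeds, give noticeably different test scores (scaling of Age_at_diagnosis is omitted here for brevity, and max_iter is raised to avoid convergence warnings):

# Sketch: the same model and data, different train/test random seeds
for seed in [0, 1, 2, 3, 4]:
    X_tr, X_te, y_tr, y_te = train_test_split(
        gliomas.drop("Grade", axis=1),
        gliomas["Grade"],
        test_size=0.3,
        random_state=seed,
        stratify=gliomas["Grade"],
    )
    model = LogisticRegression(max_iter=10000)
    model.fit(X_tr, y_tr)
    print(f"seed={seed}: test F1 (macro) = {f1_score(y_te, model.predict(X_te), average='macro'):.3f}")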

Cross-validation

  • Cross-validation trains and evaluates the model on several different training/validation splits of the data, giving more reliable final metrics than a single holdout split

# Use KFold and create kf object
kf = KFold(n_splits=5, shuffle=True, random_state=1111)

# Create splits
## The `split` method of the KFold object will yield indices for training and validation sets
splits = kf.split(gliomas.drop("Grade", axis=1))

# Print the indices of the training and validation sets for each fold
for split, k in zip(splits, range(1, 6)):
    print("Fold %d" % k)
    train_index, val_index = split
    print("Number of training indices: %s; First 10: %s" % (len(train_index), train_index[:10]))
    print("Number of validation indices: %s; First 10: %s" % (len(val_index), val_index[:10]))
Fold 1
Number of training indices: 671; First 10: [ 2  3  5  6  8  9 10 11 12 13]
Number of validation indices: 168; First 10: [ 0  1  4  7 23 25 32 33 34 38]
Fold 2
Number of training indices: 671; First 10: [ 0  1  3  4  5  6  7  8  9 10]
Number of validation indices: 168; First 10: [ 2 12 16 24 35 39 45 49 57 61]
Fold 3
Number of training indices: 671; First 10: [ 0  1  2  3  4  6  7  8  9 10]
Number of validation indices: 168; First 10: [ 5 13 26 27 29 36 37 41 46 47]
Fold 4
Number of training indices: 671; First 10: [ 0  1  2  4  5  7  9 12 13 14]
Number of validation indices: 168; First 10: [ 3  6  8 10 11 19 22 31 40 43]
Fold 5
Number of training indices: 672; First 10: [ 0  1  2  3  4  5  6  7  8 10]
Number of validation indices: 167; First 10: [ 9 14 15 17 18 20 21 28 30 42]
X = gliomas.drop("Grade", axis=1)
y = gliomas["Grade"]


kf = KFold(n_splits=5, shuffle=True, random_state=1111)
splits = kf.split(gliomas.drop("Grade", axis=1))

kf_cv_scores = {
    "Fold": [],
    "Precision_Class_0": [],
    "Precision_Class_1": [],
    "Recall_Class_0": [],
    "Recall_Class_1": [],
    "F1_Class_0": [],
    "F1_Class_1": [],
}

# Initialize scaler
scaler = StandardScaler()
fold = 1

for train_index, val_index in splits:
    # Setup the training and validation data
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_val, y_val = X.iloc[val_index], y.iloc[val_index]

    # Create copies to avoid modifying original data
    X_train_scaled = X_train.copy()
    X_val_scaled = X_val.copy()
    
    # Scale only the Age_at_diagnosis column
    X_train_scaled['Age_at_diagnosis'] = scaler.fit_transform(X_train[['Age_at_diagnosis']]).flatten()
    X_val_scaled['Age_at_diagnosis'] = scaler.transform(X_val[['Age_at_diagnosis']]).flatten()
    
    # Initialize a fresh Logistic Regression model for this fold
    lr_cv = LogisticRegression()

    # Train and predict using the scaled data
    lr_cv.fit(X_train_scaled, y_train)
    predictions = lr_cv.predict(X_val_scaled)

    precision = precision_score(y_val, predictions, average=None)
    recall = recall_score(y_val, predictions, average=None)
    f1 = f1_score(y_val, predictions, average=None)

    kf_cv_scores["Fold"].append(fold)
    kf_cv_scores["Precision_Class_0"].append(round(float(precision[0]),3))
    kf_cv_scores["Recall_Class_0"].append(round(float(recall[0]),3))
    kf_cv_scores["Precision_Class_1"].append(round(float(precision[1]),3))
    kf_cv_scores["Recall_Class_1"].append(round(float(recall[1]),3))
    kf_cv_scores["F1_Class_0"].append(round(float(f1[0]),3))
    kf_cv_scores["F1_Class_1"].append(round(float(f1[1]),3))
    fold += 1

kf_cv_scores
{'Fold': [1, 2, 3, 4, 5],
 'Precision_Class_0': [0.933, 0.948, 0.937, 0.862, 0.966],
 'Precision_Class_1': [0.782, 0.78, 0.795, 0.824, 0.863],
 'Recall_Class_0': [0.832, 0.785, 0.856, 0.862, 0.884],
 'Recall_Class_1': [0.91, 0.947, 0.906, 0.824, 0.958],
 'F1_Class_0': [0.88, 0.859, 0.894, 0.862, 0.923],
 'F1_Class_1': [0.841, 0.855, 0.847, 0.824, 0.908]}
kf_cv_scores_df = pd.DataFrame(kf_cv_scores)
kf_cv_scores_df
Fold Precision_Class_0 Precision_Class_1 Recall_Class_0 Recall_Class_1 F1_Class_0 F1_Class_1
0 1 0.933 0.782 0.832 0.910 0.880 0.841
1 2 0.948 0.780 0.785 0.947 0.859 0.855
2 3 0.937 0.795 0.856 0.906 0.894 0.847
3 4 0.862 0.824 0.862 0.824 0.862 0.824
4 5 0.966 0.863 0.884 0.958 0.923 0.908
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Line plot showing variation across folds
ax1.plot(kf_cv_scores['Fold'], kf_cv_scores['Precision_Class_0'], '-o', label='Precision_Class_0')
ax1.plot(kf_cv_scores['Fold'], kf_cv_scores['Precision_Class_1'], '-*', label='Precision_Class_1')

ax1.set_xlabel('Fold')
ax1.set_xticks(kf_cv_scores['Fold'])
ax1.set_ylabel('Precision Score')
ax1.set_ylim(0, 1)
ax1.legend(loc='lower right')
ax1.grid(True, alpha=0.3)

ax2.plot(kf_cv_scores['Fold'], kf_cv_scores['Recall_Class_0'], '-s', label='Recall_Class_0')
ax2.plot(kf_cv_scores['Fold'], kf_cv_scores['Recall_Class_1'], '-*', label='Recall_Class_1')

ax2.set_xlabel('Fold')
ax2.set_xticks(kf_cv_scores['Fold'])
ax2.set_ylabel('Recall Score')
ax2.set_ylim(0, 1)
ax2.legend(loc='lower right')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
[Figure: per-class precision (left) and recall (right) across the five folds]
cv_summary = kf_cv_scores_df.aggregate(
    {
        "Precision_Class_0": ["min", "max", "mean", "std"],
        "Precision_Class_1": ["min", "max", "mean", "std"],
        "Recall_Class_0": ["min", "max", "mean", "std"],
        "Recall_Class_1": ["min", "max", "mean", "std"],
        "F1_Class_0": ["min", "max", "mean", "std"],
        "F1_Class_1": ["min", "max", "mean", "std"],
    }
)
cv_summary
Precision_Class_0 Precision_Class_1 Recall_Class_0 Recall_Class_1 F1_Class_0 F1_Class_1
min 0.862000 0.780000 0.785000 0.824000 0.859000 0.824000
max 0.966000 0.863000 0.884000 0.958000 0.923000 0.908000
mean 0.929200 0.808800 0.843800 0.909000 0.883600 0.855000
std 0.039682 0.035024 0.037725 0.052631 0.026197 0.031741
cv_summary.T
min max mean std
Precision_Class_0 0.862 0.966 0.9292 0.039682
Precision_Class_1 0.780 0.863 0.8088 0.035024
Recall_Class_0 0.785 0.884 0.8438 0.037725
Recall_Class_1 0.824 0.958 0.9090 0.052631
F1_Class_0 0.859 0.923 0.8836 0.026197
F1_Class_1 0.824 0.908 0.8550 0.031741
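The same summary is often reported in the compact "mean ± std" form; a small sketch using the cv_summary table above:

# Sketch: report each metric as "mean ± std" across folds
print(cv_summary.T.apply(lambda r: f"{r['mean']:.3f} ± {r['std']:.3f}", axis=1))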

Interpretation guidelines:

  1. Mean Performance:

    • Higher mean = better overall performance

    • Compare to baseline or business requirements

  2. Standard Deviation:

    • Low SD = consistent performance across folds (good generalization)

    • High SD = variable performance (potential overfitting or data issues)

  3. Range and Min/Max:

    • Small range = consistent across different data subsets

    • Large range = sensitive to specific data characteristics
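Before moving on to hyperparameter tuning, note that the manual loop above can be written much more compactly. A sketch using a Pipeline with cross_val_score, where a ColumnTransformer rescales only Age_at_diagnosis inside each fold; these components are standard scikit-learn pieces not used elsewhere in this section, so this is an illustrative alternative rather than the workflow's method:

# Compact alternative sketch: Pipeline + cross_val_score
# The ColumnTransformer scales only 'Age_at_diagnosis' and passes the other
# (binary) features through unchanged; scaling is refit within each CV fold.
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

preprocess = ColumnTransformer(
    [("age", StandardScaler(), ["Age_at_diagnosis"])],
    remainder="passthrough",
)
pipe = Pipeline([("preprocess", preprocess), ("lr", LogisticRegression(max_iter=10000))])

scores = cross_val_score(
    pipe, X, y,
    cv=KFold(n_splits=5, shuffle=True, random_state=1111),
    scoring="f1_macro",
)
print(f"fold scores: {scores.round(3)}, mean: {scores.mean():.3f}")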

Hyperparameter Tuning

Hyperparameters:

  • Set before training begins

  • Configure model architecture/behavior

  • Not learned; require manual tuning

  • Core hyperparameters in LogisticRegression:

    • Penalty (regularization type): ‘l1’, ‘l2’, ‘elasticnet’, None (Default: ‘l2’)

    • Regularization strength C (inverse): smaller values mean stronger regularization (Default: 1.0)

    • Solver (Algorithm to use for optimization): newton-cg, lbfgs, liblinear, sag, saga (Default: lbfgs)

  • Core hyperparameters can (see the short sketch after this list):

    • Directly influence model’s learning process and capabilities

    • Control model complexity, learning behavior, and prevention of overfitting

    • Changes significantly impact accuracy, generalization, and prediction quality
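To make the last point concrete, a small sketch: with an L1 penalty, lowering C (stronger regularization) forces more coefficients to exactly zero. The C values and the split below are arbitrary choices for illustration, and the features are used unscaled:

# Sketch: with an L1 penalty, smaller C (stronger regularization) zeroes out more coefficients
X_tr, _, y_tr, _ = train_test_split(
    gliomas.drop("Grade", axis=1), gliomas["Grade"],
    test_size=0.3, random_state=42, stratify=gliomas["Grade"],
)
for C in [1.0, 0.1, 0.05]:
    lr_l1 = LogisticRegression(penalty="l1", solver="liblinear", C=C, random_state=42)
    lr_l1.fit(X_tr, y_tr)
    n_zero = int((lr_l1.coef_ == 0).sum())
    print(f"C={C}: {n_zero} of {lr_l1.coef_.size} coefficients are exactly zero")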

Additional notes (regarding parameters used in the following cell):

  • C (Regularization Strength):

    • Controls how much to penalize complex models

    • Smaller values = more regularization (simpler model)

  • penalty (regularization type):

    • l2: Ridge regularization (shrinks coefficients toward zero)

    • l1: Lasso regularization (can set coefficients exactly to zero, good for feature selection)

    • elasticnet: combines L1 and L2 benefits (can address multicollinearity and handle groups of correlated genes)

  • solver:

    • liblinear: good for small datasets, works with both L1 and L2

    • saga: good for large datasets, works with L1, L2, and elasticnet

  • max_iter:

    • Maximum number of iterations for the algorithm to converge

    • Increase if you get convergence warnings
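As a quick check on the size of the search below: the grid contains 6 C values × 5 solvers for l2, 6 × 2 for l1, and 3 C values × 3 l1_ratio values × 1 solver for elasticnet, which gives the 51 candidates (255 fits with 5-fold CV) reported by GridSearchCV:

# Worked check of the grid size used below
n_candidates = 6 * 5 + 6 * 2 + 3 * 3 * 1   # l2 + l1 + elasticnet sub-grids
print(n_candidates, "candidates,", n_candidates * 5, "fits with 5-fold CV")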

# Get the features and target variable
X_train, X_test, y_train, y_test = train_test_split(
    gliomas.drop("Grade", axis=1),
    gliomas["Grade"],
    test_size=0.3,
    random_state=42,
    stratify=gliomas["Grade"],
)

# Create the base model
lr = LogisticRegression(random_state=42, max_iter=10000)

# Define a valid parameter grid (each penalty is paired only with compatible solvers)
param_grid = [
    # For l2 penalty (works with all solvers)
    {
        'penalty': ['l2'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'solver': ['lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga']
    },
    # For l1 penalty (only works with liblinear and saga)
    {
        'penalty': ['l1'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'solver': ['liblinear', 'saga']
    },
    # For elasticnet penalty (only works with saga)
    # Note: l1_ratio must be between 0 and 1
    {
        'penalty': ['elasticnet'],
        'l1_ratio': [0.1, 0.5, 0.9],
        'solver': ['saga'],
        'C': [0.1, 1, 10]

    }
]

# Create GridSearchCV object
grid_search = GridSearchCV(
    estimator=lr,
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='f1_macro',  # Assessment metric
    verbose=1,  # Print progress
    n_jobs=-1  # Use all CPU cores
)

# Fit the grid search
grid_search.fit(X_train, y_train)

# Access results for different metrics
print(f"Best parameters (based on f1):", grid_search.best_params_)
print(f"Best f1 score:", grid_search.best_score_)

# Make predictions with the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Evaluate the best model
print("\nTest set accuracy:", best_model.score(X_test, y_test))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
Fitting 5 folds for each of 51 candidates, totalling 255 fits
Best parameters (based on f1): {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
Best f1 score: 0.8708974463864253

Test set accuracy: 0.8690476190476191

Classification Report:
              precision    recall  f1-score   support

         0.0       0.92      0.85      0.88       146
         1.0       0.81      0.90      0.85       106

    accuracy                           0.87       252
   macro avg       0.87      0.87      0.87       252
weighted avg       0.87      0.87      0.87       252
# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Plot confusion matrix as heatmap
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()
[Figure: confusion matrix heatmap for the tuned model]
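GridSearchCV also records per-candidate cross-validation results in its cv_results_ attribute; a short sketch for inspecting the top-ranked parameter combinations:

# Sketch: inspect the best-ranked candidates from the grid search
cv_results = pd.DataFrame(grid_search.cv_results_)
cols = ["param_penalty", "param_C", "param_solver", "mean_test_score", "std_test_score", "rank_test_score"]
cv_results.sort_values("rank_test_score")[cols].head(10)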