Complete ML workflow

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix, classification_report

Exploratory analysis

  1. Understand the Data Structure and Summary

    1. Load and Inspect

    2. Descriptive Statistics

  2. Analyze the Target Variable

    1. Check Data Type (Ensure your target variable is appropriately represented)

    2. Determine the frequency of each class in your binary target variable

  3. Analyze features

    1. Visualize the distributions of features

  4. Identify and Handle Missing Values

Understand the Data Structure and Summary

# Download the test dataset
! curl https://naicno.github.io/BioNT_Module2_handson/_downloads/d072ac9ebbb200e56a8125b33b887657/TCGA_InfoWithGrade.csv -o TCGA_InfoWithGrade.csv
! ls -lh TCGA_InfoWithGrade.csv
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 86432  100 86432    0     0   426k      0 --:--:-- --:--:-- --:--:--  428k
-rw-r--r-- 1 runner docker 85K Jun  5 12:07 TCGA_InfoWithGrade.csv
# Read the dataset to get the glioma dataframe
gliomas = pd.read_csv('TCGA_InfoWithGrade.csv')
gliomas.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 840 entries, 0 to 839
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Grade             839 non-null    float64
 1   Gender            840 non-null    float64
 2   Age_at_diagnosis  839 non-null    float64
 3   Race              839 non-null    float64
 4   IDH1              839 non-null    float64
 5   TP53              839 non-null    float64
 6   ATRX              839 non-null    float64
 7   PTEN              839 non-null    float64
 8   EGFR              840 non-null    float64
 9   CIC               839 non-null    float64
 10  MUC16             839 non-null    float64
 11  PIK3CA            839 non-null    float64
 12  NF1               839 non-null    float64
 13  PIK3R1            840 non-null    float64
 14  FUBP1             839 non-null    float64
 15  RB1               839 non-null    float64
 16  NOTCH1            840 non-null    float64
 17  BCOR              839 non-null    float64
 18  CSMD3             839 non-null    float64
 19  SMARCA4           839 non-null    float64
 20  GRIN2A            839 non-null    float64
 21  IDH2              840 non-null    float64
 22  FAT4              839 non-null    float64
 23  PDGFRA            840 non-null    float64
 24  ATRX_xNA          630 non-null    float64
 25  IDH1_xNA          168 non-null    float64
dtypes: float64(26)
memory usage: 170.8 KB
# Inspect the first few rows of the DataFrame
gliomas.head()
Grade Gender Age_at_diagnosis Race IDH1 TP53 ATRX PTEN EGFR CIC ... NOTCH1 BCOR CSMD3 SMARCA4 GRIN2A IDH2 FAT4 PDGFRA ATRX_xNA IDH1_xNA
0 0.0 0.0 51.30 0.0 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN
1 0.0 0.0 38.72 0.0 1.0 0.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
2 0.0 0.0 35.17 0.0 1.0 1.0 1.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 NaN
3 0.0 1.0 32.78 0.0 1.0 1.0 1.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 1.0 NaN
4 0.0 0.0 31.51 0.0 1.0 1.0 1.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 NaN NaN

5 rows × 26 columns

# Quick summary statistics of the DataFrame
gliomas.describe()
Grade Gender Age_at_diagnosis Race IDH1 TP53 ATRX PTEN EGFR CIC ... NOTCH1 BCOR CSMD3 SMARCA4 GRIN2A IDH2 FAT4 PDGFRA ATRX_xNA IDH1_xNA
count 839.000000 840.000000 839.000000 839.000000 839.000000 839.000000 839.000000 839.000000 840.000000 839.000000 ... 840.000000 839.000000 839.000000 839.000000 839.000000 840.000000 839.000000 840.000000 630.000000 168.000000
mean 0.419547 0.417857 50.935411 0.107271 0.481526 0.414779 0.258641 0.168057 0.133333 0.132300 ... 0.045238 0.034565 0.032181 0.032181 0.032181 0.027381 0.027414 0.026190 0.265079 0.488095
std 0.493779 0.493500 15.702339 0.369392 0.499957 0.492978 0.438149 0.374140 0.340137 0.339019 ... 0.207950 0.182784 0.176586 0.176586 0.176586 0.163288 0.163383 0.159797 0.441726 0.501353
min 0.000000 0.000000 14.420000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 38.055000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 0.000000 0.000000 51.550000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
75% 1.000000 1.000000 62.800000 0.000000 1.000000 1.000000 1.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 1.000000
max 1.000000 1.000000 89.290000 3.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 ... 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000

8 rows × 26 columns

Analyze the Target Variable

# Check data types of the target variable 'Grade'
gliomas["Grade"].dtype
dtype('float64')
# Get the counts of each grade in the 'Grade' column
gliomas["Grade"].value_counts()
Grade
0.0    487
1.0    352
Name: count, dtype: int64
# Get the fractions of each grade in the 'Grade' column
gliomas["Grade"].value_counts(normalize=True)
Grade
0.0    0.580453
1.0    0.419547
Name: proportion, dtype: float64
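A quick bar chart makes this class balance visible at a glance; a minimal sketch using the seaborn import from above:

# Sketch: visualize the class balance of the target variable 'Grade'
sns.countplot(x="Grade", data=gliomas)
plt.title("Class counts of the target variable")
plt.ylabel("Count")
plt.show()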

Analyze features

# Plot the distribution of data in all columns
fig, axs = plt.subplots(nrows=5, ncols=5, figsize=(15, 20))
axs = axs.flatten()

# Iterate over each column and plot the distribution
for i, column in enumerate(gliomas.drop(columns=["Grade"]).columns):
    axs[i].hist(gliomas[column], bins=20, color='skyblue', edgecolor='black')
    axs[i].set_title(column)
    axs[i].set_xlabel('Value')
    axs[i].set_ylabel('Frequency')

plt.tight_layout()
plt.show()
[Figure: histograms of the distributions of all features]

Identify and Handle Missing Values

  • Note that np.nan values were inserted into the original dataset specifically to demonstrate techniques for handling missing data

# Calculate the % of missing values in each column
gliomas.isna().sum()/len(gliomas)*100
Grade                0.119048
Gender               0.000000
Age_at_diagnosis     0.119048
Race                 0.119048
IDH1                 0.119048
TP53                 0.119048
ATRX                 0.119048
PTEN                 0.119048
EGFR                 0.000000
CIC                  0.119048
MUC16                0.119048
PIK3CA               0.119048
NF1                  0.119048
PIK3R1               0.000000
FUBP1                0.119048
RB1                  0.119048
NOTCH1               0.000000
BCOR                 0.119048
CSMD3                0.119048
SMARCA4              0.119048
GRIN2A               0.119048
IDH2                 0.000000
FAT4                 0.119048
PDGFRA               0.000000
ATRX_xNA            25.000000
IDH1_xNA            80.000000
dtype: float64

Delete rows with missing values in target variable

# Remove rows with missing values in the 'Grade' column
gliomas.dropna(subset=["Grade"], axis=0, inplace=True)
# % of missing values in each column
gliomas.isna().sum()/len(gliomas)*100
Grade                0.000000
Gender               0.000000
Age_at_diagnosis     0.000000
Race                 0.000000
IDH1                 0.000000
TP53                 0.000000
ATRX                 0.000000
PTEN                 0.000000
EGFR                 0.000000
CIC                  0.000000
MUC16                0.000000
PIK3CA               0.000000
NF1                  0.000000
PIK3R1               0.000000
FUBP1                0.000000
RB1                  0.000000
NOTCH1               0.000000
BCOR                 0.000000
CSMD3                0.000000
SMARCA4              0.000000
GRIN2A               0.000000
IDH2                 0.000000
FAT4                 0.000000
PDGFRA               0.000000
ATRX_xNA            24.910608
IDH1_xNA            79.976162
dtype: float64
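The next step simply drops the sparse columns. As an alternative sketch (not used in the rest of this workflow), missing values could instead be imputed, for example with scikit-learn's SimpleImputer; most_frequent is a reasonable strategy here because the affected columns are binary:

# Alternative sketch: impute missing values instead of dropping sparse columns
# (illustrative only; the workflow below continues by dropping the columns;
#  in a full pipeline the imputer would be fit on the training split only)
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="most_frequent")
gliomas_imputed = pd.DataFrame(
    imputer.fit_transform(gliomas), columns=gliomas.columns
)
print(gliomas_imputed.isna().sum().sum())  # expect 0 remaining missing values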

Keep columns with at least 95% non-missing values

threshold = int(0.95 * len(gliomas))
# Keep columns with at least 95% non-missing values
gliomas.dropna(thresh=threshold, axis=1, inplace=True)
# % of missing values in each column
gliomas.isna().sum()/len(gliomas)*100
Grade               0.0
Gender              0.0
Age_at_diagnosis    0.0
Race                0.0
IDH1                0.0
TP53                0.0
ATRX                0.0
PTEN                0.0
EGFR                0.0
CIC                 0.0
MUC16               0.0
PIK3CA              0.0
NF1                 0.0
PIK3R1              0.0
FUBP1               0.0
RB1                 0.0
NOTCH1              0.0
BCOR                0.0
CSMD3               0.0
SMARCA4             0.0
GRIN2A              0.0
IDH2                0.0
FAT4                0.0
PDGFRA              0.0
dtype: float64
# Plot the distribution of data in all columns
fig, axs = plt.subplots(nrows=5, ncols=5, figsize=(15, 20))
axs = axs.flatten()

# Iterate over each column and plot the distribution
for i, column in enumerate(gliomas.drop(columns=["Grade"]).columns):
    axs[i].hist(gliomas[column], bins=20, color='skyblue', edgecolor='black')
    axs[i].set_title(column)
    axs[i].set_xlabel('Value')
    axs[i].set_ylabel('Frequency')

plt.tight_layout()
plt.show()
[Figure: histograms of the feature distributions after removing sparse columns]
# Drop the 'Race' column
gliomas.drop(columns=["Race"], inplace=True)

When a category within a feature has very few samples (e.g., “Race”), a model might learn patterns from these specific few instances that are not generalizable to the wider population or new data. It essentially “memorizes” these rare cases and their associated outcomes, which can lead to poor performance on unseen data.
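A quick way to spot such sparse categories before deciding to drop a feature is to inspect its value counts; a minimal sketch (it re-reads the raw file, since 'Race' was already dropped in the cell above):

# Sketch: spot sparse categories before deciding to drop a feature
raw = pd.read_csv('TCGA_InfoWithGrade.csv')
print(raw["Race"].value_counts())
print(raw["Race"].value_counts(normalize=True).round(3))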

Split original dataset

X_train, X_test, y_train, y_test = train_test_split(
    gliomas.drop("Grade", axis=1),
    gliomas["Grade"],
    test_size=0.3,
    random_state=42,
    stratify=gliomas["Grade"],
)
print("X_train:", X_train.shape, "X_test:", X_test.shape, "y_train:", y_train.shape, "y_test:", y_test.shape)
X_train: (587, 22) X_test: (252, 22) y_train: (587,) y_test: (252,)
# Visualize the distribution of Age_at_diagnosis in the training and test sets

fig, axes = plt.subplots(1, 2, figsize=(10, 5))

sns.histplot(X_train[['Age_at_diagnosis']], ax=axes[0])
axes[0].set_title('Training Set')

sns.histplot(X_test[['Age_at_diagnosis']], ax=axes[1])
axes[1].set_title('Test Set')

plt.tight_layout()
plt.show()
[Figure: Age_at_diagnosis distribution in the training and test sets, before scaling]
# Scale the 'Age_at_diagnosis' column with StandardScaler:
## fit the scaler on the training column only, then apply the same
## transformation to the test column (fitting on train only avoids data leakage)
scaler = StandardScaler()
X_train['Age_at_diagnosis'] = scaler.fit_transform(X_train[['Age_at_diagnosis']])
X_test['Age_at_diagnosis'] = scaler.transform(X_test[['Age_at_diagnosis']])
# Visualize the distribution of Age_at_diagnosis in the training and test sets

fig, axes = plt.subplots(1, 2, figsize=(10, 5))

sns.histplot(X_train[['Age_at_diagnosis']], ax=axes[0])
axes[0].set_title('Training Set')

sns.histplot(X_test[['Age_at_diagnosis']], ax=axes[1])
axes[1].set_title('Test Set')

plt.suptitle('Age at Diagnosis Distribution after scaling')
plt.show()
[Figure: Age_at_diagnosis distribution in the training and test sets, after scaling]
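As a quick sanity check, the scaled training column should now have mean roughly 0 and standard deviation roughly 1; the test column will be close, but not exact, because it was transformed with the training statistics:

# Sanity check: after scaling, the training column should be approximately N(0, 1)
print(f"train mean={X_train['Age_at_diagnosis'].mean():.3f}, std={X_train['Age_at_diagnosis'].std():.3f}")
print(f"test  mean={X_test['Age_at_diagnosis'].mean():.3f}, std={X_test['Age_at_diagnosis'].std():.3f}")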

Logistic regression model

lr = LogisticRegression()

# Fit the model
lr.fit(X_train, y_train)
LogisticRegression()
# Predict the glioma grade for the unseen test set
lr.predict(X_test)
array([1., 1., 0., 1., 0., 1., 1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
       1., 1., 1., 0., 1., 0., 1., 0., 1., 1., 1., 1., 0., 0., 1., 0., 0.,
       1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 1., 1., 1., 0., 1.,
       0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
       1., 0., 1., 0., 1., 1., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 1.,
       0., 0., 1., 1., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 1., 1., 0., 1., 1., 1., 0., 1., 0., 1., 1., 0., 1., 1., 1., 1.,
       0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 1., 1.,
       0., 0., 0., 1., 1., 0., 0., 1., 1., 1., 1., 0., 0., 1., 0., 0., 0.,
       1., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 1., 1., 0., 1., 1.,
       0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0.,
       1., 0., 0., 0., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 0., 1., 1.,
       0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 0., 1., 0., 1., 1., 1.,
       1., 0., 1., 0., 1., 0., 1., 1., 0., 1., 1., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0.])
# Examine and understand the importance of features in predicting glioma type
feature_list = gliomas.columns[1:]
print("Number of Features", len(feature_list))
print("Features", feature_list)
Number of Features 22
Features Index(['Gender', 'Age_at_diagnosis', 'IDH1', 'TP53', 'ATRX', 'PTEN', 'EGFR',
       'CIC', 'MUC16', 'PIK3CA', 'NF1', 'PIK3R1', 'FUBP1', 'RB1', 'NOTCH1',
       'BCOR', 'CSMD3', 'SMARCA4', 'GRIN2A', 'IDH2', 'FAT4', 'PDGFRA'],
      dtype='object')
# Create a DataFrame of the coefficients and their corresponding features
coefficients = pd.DataFrame(lr.coef_.T, index=feature_list, columns=['Coefficient'])
coefficients.sort_values(by='Coefficient', ascending=False)
Coefficient
GRIN2A 1.056014
PIK3R1 0.946648
TP53 0.942277
PTEN 0.797627
Age_at_diagnosis 0.673566
MUC16 0.604058
PDGFRA 0.445156
BCOR 0.354251
CSMD3 0.273943
RB1 0.156225
PIK3CA 0.059625
Gender 0.033080
FAT4 -0.007242
ATRX -0.192868
FUBP1 -0.409964
SMARCA4 -0.414214
EGFR -0.537785
CIC -0.780298
NF1 -0.836192
NOTCH1 -1.459100
IDH2 -1.733579
IDH1 -3.286614
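Because this is a logistic regression, the coefficients are on the log-odds scale; exponentiating them gives odds ratios, which are often easier to interpret. A small sketch (the Odds_Ratio column is added here for illustration only):

# Sketch: convert log-odds coefficients to odds ratios for interpretation
coefficients["Odds_Ratio"] = np.exp(coefficients["Coefficient"])
coefficients.sort_values(by="Odds_Ratio", ascending=False)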

Evaluate model performance

# Predict on test set
y_pred = lr.predict(X_test)

# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Plot confusion matrix as heatmap
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()
[Figure: confusion matrix heatmap for the test set]
# Generate classification report
report = classification_report(y_test, y_pred)
print(report)
              precision    recall  f1-score   support

         0.0       0.91      0.85      0.88       146
         1.0       0.81      0.89      0.85       106

    accuracy                           0.87       252
   macro avg       0.86      0.87      0.86       252
weighted avg       0.87      0.87      0.87       252
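The imports at the top also include roc_curve and auc, which are not used above; as a sketch, an ROC curve for this test set can be drawn from the model's predicted probabilities:

# Sketch: ROC curve and AUC for the fitted logistic regression on the test set
y_scores = lr.predict_proba(X_test)[:, 1]      # probability of class 1
fpr, tpr, thresholds = roc_curve(y_test, y_scores)
roc_auc = auc(fpr, tpr)

plt.plot(fpr, tpr, label=f"ROC curve (AUC = {roc_auc:.2f})")
plt.plot([0, 1], [0, 1], linestyle="--", color="grey")  # chance level
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend(loc="lower right")
plt.show()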

The problems with a single test dataset (holdout set) in model validation

  • Using different random seeds can lead to different results even when using the same model and dataset (illustrated in the sketch after this list)

  • The variability in results makes it difficult to accurately assess the model’s true performance and generalizability

  • Cross-validation is proposed as the gold-standard solution to overcome the limitations of holdout sets in model validation
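The sketch below illustrates the first point: the same model and dataset, split with different random seeds, give noticeably different test scores (scaling of Age_at_diagnosis is omitted here for brevity, and max_iter is raised to avoid convergence warnings):

# Sketch: the same model and data, different train/test random seeds
for seed in [0, 1, 2, 3, 4]:
    X_tr, X_te, y_tr, y_te = train_test_split(
        gliomas.drop("Grade", axis=1),
        gliomas["Grade"],
        test_size=0.3,
        random_state=seed,
        stratify=gliomas["Grade"],
    )
    model = LogisticRegression(max_iter=10000)
    model.fit(X_tr, y_tr)
    print(f"seed={seed}: test F1 (macro) = {f1_score(y_te, model.predict(X_te), average='macro'):.3f}")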

Cross-validation

  • Cross-validation trains and evaluates the model on several different training/validation splits of the data, giving more reliable final metrics than a single holdout split

# Use KFold and create kf object
kf = KFold(n_splits=5, shuffle=True, random_state=1111)

# Create splits
## The `split` method of the KFold object will yield indices for training and validation sets
splits = kf.split(gliomas.drop("Grade", axis=1))

# Print the indices of the training and validation sets for each fold
for split, k in zip(splits, range(1, 6)):
    print("Fold %d" % k)
    train_index, val_index = split
    print("Number of training indices: %s; First 10: %s" % (len(train_index), train_index[:10]))
    print("Number of validation indices: %s; First 10: %s" % (len(val_index), val_index[:10]))
Fold 1
Number of training indices: 671; First 10: [ 2  3  5  6  8  9 10 11 12 13]
Number of validation indices: 168; First 10: [ 0  1  4  7 23 25 32 33 34 38]
Fold 2
Number of training indices: 671; First 10: [ 0  1  3  4  5  6  7  8  9 10]
Number of validation indices: 168; First 10: [ 2 12 16 24 35 39 45 49 57 61]
Fold 3
Number of training indices: 671; First 10: [ 0  1  2  3  4  6  7  8  9 10]
Number of validation indices: 168; First 10: [ 5 13 26 27 29 36 37 41 46 47]
Fold 4
Number of training indices: 671; First 10: [ 0  1  2  4  5  7  9 12 13 14]
Number of validation indices: 168; First 10: [ 3  6  8 10 11 19 22 31 40 43]
Fold 5
Number of training indices: 672; First 10: [ 0  1  2  3  4  5  6  7  8 10]
Number of validation indices: 167; First 10: [ 9 14 15 17 18 20 21 28 30 42]
X = gliomas.drop("Grade", axis=1)
y = gliomas["Grade"]


kf = KFold(n_splits=5, shuffle=True, random_state=1111)
splits = kf.split(gliomas.drop("Grade", axis=1))

kf_cv_scores = {
    "Fold": [],
    "Precision_Class_0": [],
    "Precision_Class_1": [],
    "Recall_Class_0": [],
    "Recall_Class_1": [],
    "F1_Class_0": [],
    "F1_Class_1": [],
}

# Initialize scaler
scaler = StandardScaler()
fold = 1

for train_index, val_index in splits:
    # Setup the training and validation data
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_val, y_val = X.iloc[val_index], y.iloc[val_index]

    # Create copies to avoid modifying original data
    X_train_scaled = X_train.copy()
    X_val_scaled = X_val.copy()
    
    # Scale only the Age_at_diagnosis column
    X_train_scaled['Age_at_diagnosis'] = scaler.fit_transform(X_train[['Age_at_diagnosis']]).flatten()
    X_val_scaled['Age_at_diagnosis'] = scaler.transform(X_val[['Age_at_diagnosis']]).flatten()
    
    # Initialize a fresh Logistic Regression model for this fold
    lr_cv = LogisticRegression()

    # Train and predict using the scaled data
    lr_cv.fit(X_train_scaled, y_train)
    predictions = lr_cv.predict(X_val_scaled)

    precision = precision_score(y_val, predictions, average=None)
    recall = recall_score(y_val, predictions, average=None)
    f1 = f1_score(y_val, predictions, average=None)

    kf_cv_scores["Fold"].append(fold)
    kf_cv_scores["Precision_Class_0"].append(round(float(precision[0]),3))
    kf_cv_scores["Recall_Class_0"].append(round(float(recall[0]),3))
    kf_cv_scores["Precision_Class_1"].append(round(float(precision[1]),3))
    kf_cv_scores["Recall_Class_1"].append(round(float(recall[1]),3))
    kf_cv_scores["F1_Class_0"].append(round(float(f1[0]),3))
    kf_cv_scores["F1_Class_1"].append(round(float(f1[1]),3))
    fold += 1

kf_cv_scores
{'Fold': [1, 2, 3, 4, 5],
 'Precision_Class_0': [0.933, 0.948, 0.937, 0.862, 0.966],
 'Precision_Class_1': [0.782, 0.78, 0.795, 0.824, 0.863],
 'Recall_Class_0': [0.832, 0.785, 0.856, 0.862, 0.884],
 'Recall_Class_1': [0.91, 0.947, 0.906, 0.824, 0.958],
 'F1_Class_0': [0.88, 0.859, 0.894, 0.862, 0.923],
 'F1_Class_1': [0.841, 0.855, 0.847, 0.824, 0.908]}
kf_cv_scores_df = pd.DataFrame(kf_cv_scores)
kf_cv_scores_df
Fold Precision_Class_0 Precision_Class_1 Recall_Class_0 Recall_Class_1 F1_Class_0 F1_Class_1
0 1 0.933 0.782 0.832 0.910 0.880 0.841
1 2 0.948 0.780 0.785 0.947 0.859 0.855
2 3 0.937 0.795 0.856 0.906 0.894 0.847
3 4 0.862 0.824 0.862 0.824 0.862 0.824
4 5 0.966 0.863 0.884 0.958 0.923 0.908
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Line plot showing variation across folds
ax1.plot(kf_cv_scores['Fold'], kf_cv_scores['Precision_Class_0'], '-o', label='Precision_Class_0')
ax1.plot(kf_cv_scores['Fold'], kf_cv_scores['Precision_Class_1'], '-*', label='Precision_Class_1')

ax1.set_xlabel('Fold')
ax1.set_xticks(kf_cv_scores['Fold'])
ax1.set_ylabel('Precision Score')
ax1.set_ylim(0, 1)
ax1.legend(loc='lower right')
ax1.grid(True, alpha=0.3)

ax2.plot(kf_cv_scores['Fold'], kf_cv_scores['Recall_Class_0'], '-s', label='Recall_Class_0')
ax2.plot(kf_cv_scores['Fold'], kf_cv_scores['Recall_Class_1'], '-*', label='Recall_Class_1')

ax2.set_xlabel('Fold')
ax2.set_xticks(kf_cv_scores['Fold'])
ax2.set_ylabel('Recall Score')
ax2.set_ylim(0, 1)
ax2.legend(loc='lower right')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
[Figure: per-class precision (left) and recall (right) across the five folds]
cv_summary = kf_cv_scores_df.aggregate(
    {
        "Precision_Class_0": ["min", "max", "mean", "std"],
        "Precision_Class_1": ["min", "max", "mean", "std"],
        "Recall_Class_0": ["min", "max", "mean", "std"],
        "Recall_Class_1": ["min", "max", "mean", "std"],
        "F1_Class_0": ["min", "max", "mean", "std"],
        "F1_Class_1": ["min", "max", "mean", "std"],
    }
)
cv_summary
Precision_Class_0 Precision_Class_1 Recall_Class_0 Recall_Class_1 F1_Class_0 F1_Class_1
min 0.862000 0.780000 0.785000 0.824000 0.859000 0.824000
max 0.966000 0.863000 0.884000 0.958000 0.923000 0.908000
mean 0.929200 0.808800 0.843800 0.909000 0.883600 0.855000
std 0.039682 0.035024 0.037725 0.052631 0.026197 0.031741
cv_summary.T
min max mean std
Precision_Class_0 0.862 0.966 0.9292 0.039682
Precision_Class_1 0.780 0.863 0.8088 0.035024
Recall_Class_0 0.785 0.884 0.8438 0.037725
Recall_Class_1 0.824 0.958 0.9090 0.052631
F1_Class_0 0.859 0.923 0.8836 0.026197
F1_Class_1 0.824 0.908 0.8550 0.031741
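The same summary is often reported in the compact "mean ± std" form; a small sketch using the cv_summary table above:

# Sketch: report each metric as "mean ± std" across folds
print(cv_summary.T.apply(lambda r: f"{r['mean']:.3f} ± {r['std']:.3f}", axis=1))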

Interpretation guidelines:

  1. Mean Performance:

    • Higher mean = better overall performance

    • Compare to baseline or business requirements

  2. Standard Deviation:

    • Low SD = consistent performance across folds (good generalization)

    • High SD = variable performance (potential overfitting or data issues)

  3. Range and Min/Max:

    • Small range = consistent across different data subsets

    • Large range = sensitive to specific data characteristics
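Before moving on to hyperparameter tuning, note that the manual loop above can be written much more compactly. A sketch using a Pipeline with cross_val_score, where a ColumnTransformer rescales only Age_at_diagnosis inside each fold; these components are standard scikit-learn pieces not used elsewhere in this section, so this is an illustrative alternative rather than the workflow's method:

# Compact alternative sketch: Pipeline + cross_val_score
# The ColumnTransformer scales only 'Age_at_diagnosis' and passes the other
# (binary) features through unchanged; scaling is refit within each CV fold.
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

preprocess = ColumnTransformer(
    [("age", StandardScaler(), ["Age_at_diagnosis"])],
    remainder="passthrough",
)
pipe = Pipeline([("preprocess", preprocess), ("lr", LogisticRegression(max_iter=10000))])

scores = cross_val_score(
    pipe, X, y,
    cv=KFold(n_splits=5, shuffle=True, random_state=1111),
    scoring="f1_macro",
)
print(f"fold scores: {scores.round(3)}, mean: {scores.mean():.3f}")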

Hyperparameter Tuning

Hyperparameters:

  • Set before training begins

  • Configure model architecture/behavior

  • Not learned; require manual tuning

  • Core hyperparameters in LogisticRegression:

    • Penalty (regularization type): ‘l1’, ‘l2’, ‘elasticnet’, None (Default: ‘l2’)

    • Regularization strength C (inverse): smaller values mean stronger regularization (Default: 1.0)

    • Solver (Algorithm to use for optimization): newton-cg, lbfgs, liblinear, sag, saga (Default: lbfgs)

  • Core hyperparameters can (see the short sketch after this list):

    • Directly influence model’s learning process and capabilities

    • Control model complexity, learning behavior, and prevention of overfitting

    • Changes significantly impact accuracy, generalization, and prediction quality
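To make the last point concrete, a small sketch: with an L1 penalty, lowering C (stronger regularization) forces more coefficients to exactly zero. The C values and the split below are arbitrary choices for illustration, and the features are used unscaled:

# Sketch: with an L1 penalty, smaller C (stronger regularization) zeroes out more coefficients
X_tr, _, y_tr, _ = train_test_split(
    gliomas.drop("Grade", axis=1), gliomas["Grade"],
    test_size=0.3, random_state=42, stratify=gliomas["Grade"],
)
for C in [1.0, 0.1, 0.05]:
    lr_l1 = LogisticRegression(penalty="l1", solver="liblinear", C=C, random_state=42)
    lr_l1.fit(X_tr, y_tr)
    n_zero = int((lr_l1.coef_ == 0).sum())
    print(f"C={C}: {n_zero} of {lr_l1.coef_.size} coefficients are exactly zero")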

Additional notes (regarding parameters used in the following cell):

  • C (Regularization Strength):

    • Controls how much to penalize complex models

    • Smaller values = more regularization (simpler model)

  • penalty (regularization type):

    • l2: Ridge regularization (shrinks coefficients toward zero)

    • l1: Lasso regularization (can set coefficients exactly to zero, good for feature selection)

    • elasticnet: combines L1 and L2 benefits (can address multicollinearity and handle groups of correlated genes)

  • solver:

    • liblinear: good for small datasets, works with both L1 and L2

    • saga: good for large datasets, works with L1, L2, and elasticnet

  • max_iter:

    • Maximum number of iterations for the algorithm to converge

    • Increase if you get convergence warnings
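As a quick check on the size of the search below: the grid contains 6 C values × 5 solvers for l2, 6 × 2 for l1, and 3 C values × 3 l1_ratio values × 1 solver for elasticnet, which gives the 51 candidates (255 fits with 5-fold CV) reported by GridSearchCV:

# Worked check of the grid size used below
n_candidates = 6 * 5 + 6 * 2 + 3 * 3 * 1   # l2 + l1 + elasticnet sub-grids
print(n_candidates, "candidates,", n_candidates * 5, "fits with 5-fold CV")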

# Get the features and target variable
X_train, X_test, y_train, y_test = train_test_split(
    gliomas.drop("Grade", axis=1),
    gliomas["Grade"],
    test_size=0.3,
    random_state=42,
    stratify=gliomas["Grade"],
)

# Create the base model
lr = LogisticRegression(random_state=42, max_iter=10000)

# Define a valid parameter grid (each penalty is paired only with compatible solvers)
param_grid = [
    # For l2 penalty (works with all solvers)
    {
        'penalty': ['l2'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'solver': ['lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga']
    },
    # For l1 penalty (only works with liblinear and saga)
    {
        'penalty': ['l1'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'solver': ['liblinear', 'saga']
    },
    # For elasticnet penalty (only works with saga)
    # Note: l1_ratio must be between 0 and 1
    {
        'penalty': ['elasticnet'],
        'l1_ratio': [0.1, 0.5, 0.9],
        'solver': ['saga'],
        'C': [0.1, 1, 10]

    }
]

# Create GridSearchCV object
grid_search = GridSearchCV(
    estimator=lr,
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='f1_macro',  # Assessment metric
    verbose=1,  # Print progress
    n_jobs=-1  # Use all CPU cores
)

# Fit the grid search
grid_search.fit(X_train, y_train)

# Access results for different metrics
print(f"Best parameters (based on f1):", grid_search.best_params_)
print(f"Best f1 score:", grid_search.best_score_)

# Make predictions with the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Evaluate the best model
print("\nTest set accuracy:", best_model.score(X_test, y_test))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
Fitting 5 folds for each of 51 candidates, totalling 255 fits
Best parameters (based on f1): {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
Best f1 score: 0.8708974463864253

Test set accuracy: 0.8690476190476191

Classification Report:
              precision    recall  f1-score   support

         0.0       0.92      0.85      0.88       146
         1.0       0.81      0.90      0.85       106

    accuracy                           0.87       252
   macro avg       0.87      0.87      0.87       252
weighted avg       0.87      0.87      0.87       252
# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Plot confusion matrix as heatmap
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()
[Figure: confusion matrix heatmap for the tuned model]
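GridSearchCV also records per-candidate cross-validation results in its cv_results_ attribute; a short sketch for inspecting the top-ranked parameter combinations:

# Sketch: inspect the best-ranked candidates from the grid search
cv_results = pd.DataFrame(grid_search.cv_results_)
cols = ["param_penalty", "param_C", "param_solver", "mean_test_score", "std_test_score", "rank_test_score"]
cv_results.sort_values("rank_test_score")[cols].head(10)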