Complete ML workflow
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix, classification_report
from sklearn.model_selection import KFold
Exploratory analysis
Understand the Data Structure and Summary
Load and Inspect
Descriptive Statistics
Analyze the Target Variable
Check Data Type (Ensure your target variable is appropriately represented)
Determine the frequency of each class in your binary target variable
Analyze features
Visualize the distributions of features
Identify and Handle Missing Values
Understand the Data Structure and Summary
# Download the test dataset
! curl https://naicno.github.io/BioNT_Module2_handson/_downloads/d072ac9ebbb200e56a8125b33b887657/TCGA_InfoWithGrade.csv -o TCGA_InfoWithGrade.csv
! ls -lh TCGA_InfoWithGrade.csv
% Total % Received % Xferd Average Speed Time Time Time Current
Dload Upload Total Spent Left Speed
0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0
100 86432 100 86432 0 0 426k 0 --:--:-- --:--:-- --:--:-- 428k
-rw-r--r-- 1 runner docker 85K Jun 5 12:07 TCGA_InfoWithGrade.csv
# Read the dataset to get the glioma dataframe
gliomas = pd.read_csv('TCGA_InfoWithGrade.csv')
gliomas.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 840 entries, 0 to 839
Data columns (total 26 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Grade 839 non-null float64
1 Gender 840 non-null float64
2 Age_at_diagnosis 839 non-null float64
3 Race 839 non-null float64
4 IDH1 839 non-null float64
5 TP53 839 non-null float64
6 ATRX 839 non-null float64
7 PTEN 839 non-null float64
8 EGFR 840 non-null float64
9 CIC 839 non-null float64
10 MUC16 839 non-null float64
11 PIK3CA 839 non-null float64
12 NF1 839 non-null float64
13 PIK3R1 840 non-null float64
14 FUBP1 839 non-null float64
15 RB1 839 non-null float64
16 NOTCH1 840 non-null float64
17 BCOR 839 non-null float64
18 CSMD3 839 non-null float64
19 SMARCA4 839 non-null float64
20 GRIN2A 839 non-null float64
21 IDH2 840 non-null float64
22 FAT4 839 non-null float64
23 PDGFRA 840 non-null float64
24 ATRX_xNA 630 non-null float64
25 IDH1_xNA 168 non-null float64
dtypes: float64(26)
memory usage: 170.8 KB
# Inspect the first few rows of the DataFrame
gliomas.head()
Grade | Gender | Age_at_diagnosis | Race | IDH1 | TP53 | ATRX | PTEN | EGFR | CIC | ... | NOTCH1 | BCOR | CSMD3 | SMARCA4 | GRIN2A | IDH2 | FAT4 | PDGFRA | ATRX_xNA | IDH1_xNA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0.0 | 51.30 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN |
1 | 0.0 | 0.0 | 38.72 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
2 | 0.0 | 0.0 | 35.17 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | NaN |
3 | 0.0 | 1.0 | 32.78 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | NaN |
4 | 0.0 | 0.0 | 31.51 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | NaN |
5 rows × 26 columns
# Quick summary statistics of the DataFrame
gliomas.describe()
Grade | Gender | Age_at_diagnosis | Race | IDH1 | TP53 | ATRX | PTEN | EGFR | CIC | ... | NOTCH1 | BCOR | CSMD3 | SMARCA4 | GRIN2A | IDH2 | FAT4 | PDGFRA | ATRX_xNA | IDH1_xNA | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 839.000000 | 840.000000 | 839.000000 | 839.000000 | 839.000000 | 839.000000 | 839.000000 | 839.000000 | 840.000000 | 839.000000 | ... | 840.000000 | 839.000000 | 839.000000 | 839.000000 | 839.000000 | 840.000000 | 839.000000 | 840.000000 | 630.000000 | 168.000000 |
mean | 0.419547 | 0.417857 | 50.935411 | 0.107271 | 0.481526 | 0.414779 | 0.258641 | 0.168057 | 0.133333 | 0.132300 | ... | 0.045238 | 0.034565 | 0.032181 | 0.032181 | 0.032181 | 0.027381 | 0.027414 | 0.026190 | 0.265079 | 0.488095 |
std | 0.493779 | 0.493500 | 15.702339 | 0.369392 | 0.499957 | 0.492978 | 0.438149 | 0.374140 | 0.340137 | 0.339019 | ... | 0.207950 | 0.182784 | 0.176586 | 0.176586 | 0.176586 | 0.163288 | 0.163383 | 0.159797 | 0.441726 | 0.501353 |
min | 0.000000 | 0.000000 | 14.420000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 0.000000 | 0.000000 | 38.055000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
50% | 0.000000 | 0.000000 | 51.550000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
75% | 1.000000 | 1.000000 | 62.800000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 |
max | 1.000000 | 1.000000 | 89.290000 | 3.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
8 rows × 26 columns
Analyze the Target Variable
# Check data types of the target variable 'Grade'
gliomas["Grade"].dtype
dtype('float64')
# Get the counts of each grade in the 'Grade' column
gliomas["Grade"].value_counts()
Grade
0.0 487
1.0 352
Name: count, dtype: int64
# Get the fractions of each grade in the 'Grade' column
gliomas["Grade"].value_counts(normalize=True)
Grade
0.0 0.580453
1.0 0.419547
Name: proportion, dtype: float64
Analyze features
# Plot the distribution of data in all columns
fig, axs = plt.subplots(nrows=5, ncols=5, figsize=(15, 20))
axs = axs.flatten()
# Iterate over each column and plot the distribution
for i, column in enumerate(gliomas.drop(columns=["Grade"]).columns):
axs[i].hist(gliomas[column], bins=20, color='skyblue', edgecolor='black')
axs[i].set_title(column)
axs[i].set_xlabel('Value')
axs[i].set_ylabel('Frequency')
plt.tight_layout()
plt.show()

Identify and Handle Missing Values
Note that np.nan values are inserted into the original dataset specifically to demonstrate techniques that help handel these datasets
# Calculate the % of missing values in each column
gliomas.isna().sum()/len(gliomas)*100
Grade 0.119048
Gender 0.000000
Age_at_diagnosis 0.119048
Race 0.119048
IDH1 0.119048
TP53 0.119048
ATRX 0.119048
PTEN 0.119048
EGFR 0.000000
CIC 0.119048
MUC16 0.119048
PIK3CA 0.119048
NF1 0.119048
PIK3R1 0.000000
FUBP1 0.119048
RB1 0.119048
NOTCH1 0.000000
BCOR 0.119048
CSMD3 0.119048
SMARCA4 0.119048
GRIN2A 0.119048
IDH2 0.000000
FAT4 0.119048
PDGFRA 0.000000
ATRX_xNA 25.000000
IDH1_xNA 80.000000
dtype: float64
Delete rows with missing values in target variable
# Remove rows with missing values in the 'Grade' column
gliomas.dropna(subset=["Grade"], axis=0, inplace=True)
# % of missing values in each column
gliomas.isna().sum()/len(gliomas)*100
Grade 0.000000
Gender 0.000000
Age_at_diagnosis 0.000000
Race 0.000000
IDH1 0.000000
TP53 0.000000
ATRX 0.000000
PTEN 0.000000
EGFR 0.000000
CIC 0.000000
MUC16 0.000000
PIK3CA 0.000000
NF1 0.000000
PIK3R1 0.000000
FUBP1 0.000000
RB1 0.000000
NOTCH1 0.000000
BCOR 0.000000
CSMD3 0.000000
SMARCA4 0.000000
GRIN2A 0.000000
IDH2 0.000000
FAT4 0.000000
PDGFRA 0.000000
ATRX_xNA 24.910608
IDH1_xNA 79.976162
dtype: float64
Keep columns with at least 95% non-missing values
threshold = int(0.95 * len(gliomas))
# Keep columns with at least 95% non-missing values
gliomas.dropna(thresh=threshold, axis=1, inplace=True)
# % of missing values in each column
gliomas.isna().sum()/len(gliomas)*100
Grade 0.0
Gender 0.0
Age_at_diagnosis 0.0
Race 0.0
IDH1 0.0
TP53 0.0
ATRX 0.0
PTEN 0.0
EGFR 0.0
CIC 0.0
MUC16 0.0
PIK3CA 0.0
NF1 0.0
PIK3R1 0.0
FUBP1 0.0
RB1 0.0
NOTCH1 0.0
BCOR 0.0
CSMD3 0.0
SMARCA4 0.0
GRIN2A 0.0
IDH2 0.0
FAT4 0.0
PDGFRA 0.0
dtype: float64
# Plot the distribution of data in all columns
fig, axs = plt.subplots(nrows=5, ncols=5, figsize=(15, 20))
axs = axs.flatten()
# Iterate over each column and plot the distribution
for i, column in enumerate(gliomas.drop(columns=["Grade"]).columns):
axs[i].hist(gliomas[column], bins=20, color='skyblue', edgecolor='black')
axs[i].set_title(column)
axs[i].set_xlabel('Value')
axs[i].set_ylabel('Frequency')
plt.tight_layout()
plt.show()

# Drop the 'Race' column
gliomas.drop(columns=["Race"], inplace=True)
When a category within a feature has very few samples (e.g., “Race”), a model might learn patterns from these specific few instances that are not generalizable to the wider population or new data. It essentially “memorizes” these rare cases and their associated outcomes, which can lead to poor performance on unseen data.
Split original dataset
X_train, X_test, y_train, y_test = train_test_split(
gliomas.drop("Grade", axis=1),
gliomas["Grade"],
test_size=0.3,
random_state=42,
stratify=gliomas["Grade"],
)
print("X_train:", X_train.shape, "X_test:", X_test.shape, "y_train:", y_train.shape, "y_test:", y_test.shape)
X_train: (587, 22) X_test: (252, 22) y_train: (587,) y_test: (252,)
# Visulize the distribution of the target variable in the training and training set
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
sns.histplot(X_train[['Age_at_diagnosis']], ax=axes[0])
axes[0].set_title('Training Set')
sns.histplot(X_test[['Age_at_diagnosis']], ax=axes[1])
axes[1].set_title('Test Set')
plt.tight_layout()
plt.show()

# Fit and transform the 'age' column
## Apply simple transformation using the StandardScaler `scaler = StandardScaler()`
## directly on the 'Age_at_diagnosis' column in train and test datasets
scaler = StandardScaler()
X_train['Age_at_diagnosis'] = scaler.fit_transform(X_train[['Age_at_diagnosis']])
X_test['Age_at_diagnosis'] = scaler.transform(X_test[['Age_at_diagnosis']])
# Visulize the distribution of the target variable in the training and training set
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
sns.histplot(X_train[['Age_at_diagnosis']], ax=axes[0])
axes[0].set_title('Training Set')
sns.histplot(X_test[['Age_at_diagnosis']], ax=axes[1])
axes[1].set_title('Test Set')
plt.suptitle('Age at Diagnosis Distribution after scaling')
plt.show()

Logistic regression model
lr = LogisticRegression()
# Fit the model
lr.fit(X_train, y_train)
LogisticRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression()
# Predict the Glioma type of the new dataset
lr.predict(X_test)
array([1., 1., 0., 1., 0., 1., 1., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
1., 1., 1., 0., 1., 0., 1., 0., 1., 1., 1., 1., 0., 0., 1., 0., 0.,
1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 1., 1., 1., 0., 1.,
0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 1., 0.,
1., 0., 1., 0., 1., 1., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 1.,
0., 0., 1., 1., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0.,
0., 1., 1., 0., 1., 1., 1., 0., 1., 0., 1., 1., 0., 1., 1., 1., 1.,
0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 1., 1.,
0., 0., 0., 1., 1., 0., 0., 1., 1., 1., 1., 0., 0., 1., 0., 0., 0.,
1., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 1., 1., 0., 1., 1.,
0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0.,
1., 0., 0., 0., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 0., 1., 1.,
0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 0., 1., 0., 1., 1., 1.,
1., 0., 1., 0., 1., 0., 1., 1., 0., 1., 1., 0., 0., 1., 0., 0., 0.,
0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0.])
# Examine and understand the importance of features in predicting glioma type
feature_list = gliomas.columns[1:]
print("Number of Features", len(feature_list))
print("Features", feature_list)
Number of Features 22
Features Index(['Gender', 'Age_at_diagnosis', 'IDH1', 'TP53', 'ATRX', 'PTEN', 'EGFR',
'CIC', 'MUC16', 'PIK3CA', 'NF1', 'PIK3R1', 'FUBP1', 'RB1', 'NOTCH1',
'BCOR', 'CSMD3', 'SMARCA4', 'GRIN2A', 'IDH2', 'FAT4', 'PDGFRA'],
dtype='object')
# Create a DataFrame of the coefficients an their corresponding features
coefficients = pd.DataFrame(lr.coef_.T, index=feature_list, columns=['Coefficient'])
coefficients.sort_values(by='Coefficient', ascending=False)
Coefficient | |
---|---|
GRIN2A | 1.056014 |
PIK3R1 | 0.946648 |
TP53 | 0.942277 |
PTEN | 0.797627 |
Age_at_diagnosis | 0.673566 |
MUC16 | 0.604058 |
PDGFRA | 0.445156 |
BCOR | 0.354251 |
CSMD3 | 0.273943 |
RB1 | 0.156225 |
PIK3CA | 0.059625 |
Gender | 0.033080 |
FAT4 | -0.007242 |
ATRX | -0.192868 |
FUBP1 | -0.409964 |
SMARCA4 | -0.414214 |
EGFR | -0.537785 |
CIC | -0.780298 |
NF1 | -0.836192 |
NOTCH1 | -1.459100 |
IDH2 | -1.733579 |
IDH1 | -3.286614 |
Evaluate model performance
# Predict on test set
y_pred = lr.predict(X_test)
# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
# Plot confusion matrix as heatmap
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()

# Generate classification report
report = classification_report(y_test, y_pred)
print(report)
precision recall f1-score support
0.0 0.91 0.85 0.88 146
1.0 0.81 0.89 0.85 106
accuracy 0.87 252
macro avg 0.86 0.87 0.86 252
weighted avg 0.87 0.87 0.87 252
The problems with single test dataset (holdout sets) in model validation
Using different random seeds can lead to different results even when using the same model and dataset
The variability in results makes it difficult to accurately assess the model’s true performance and generalizability
Cross-validation is proposed as the gold-standard solution to overcome the limitations of holdout sets in model validation
Cross-validation
Cross-validation is a method that involves running a single model on various training/validation combinations to get more confident final metrics
# Use KFold and create kf object
kf = KFold(n_splits=5, shuffle=True, random_state=1111)
# Create splits
## The `split` method of the KFold object will yield indices for training and validation sets
splits = kf.split(gliomas.drop("Grade", axis=1))
# Print the indices of the training and validation sets for each fold
for split, k in zip(splits, range(1, 6)):
print("Fold %d" % k)
train_index, val_index = split
print("Number of training indices: %s; First 10: %s" % (len(train_index), train_index[:10]))
print("Number of validation indices: %s; First 10: %s" % (len(val_index), val_index[:10]))
Fold 1
Number of training indices: 671; First 10: [ 2 3 5 6 8 9 10 11 12 13]
Number of validation indices: 168; First 10: [ 0 1 4 7 23 25 32 33 34 38]
Fold 2
Number of training indices: 671; First 10: [ 0 1 3 4 5 6 7 8 9 10]
Number of validation indices: 168; First 10: [ 2 12 16 24 35 39 45 49 57 61]
Fold 3
Number of training indices: 671; First 10: [ 0 1 2 3 4 6 7 8 9 10]
Number of validation indices: 168; First 10: [ 5 13 26 27 29 36 37 41 46 47]
Fold 4
Number of training indices: 671; First 10: [ 0 1 2 4 5 7 9 12 13 14]
Number of validation indices: 168; First 10: [ 3 6 8 10 11 19 22 31 40 43]
Fold 5
Number of training indices: 672; First 10: [ 0 1 2 3 4 5 6 7 8 10]
Number of validation indices: 167; First 10: [ 9 14 15 17 18 20 21 28 30 42]
X = gliomas.drop("Grade", axis=1)
y = gliomas["Grade"]
kf = KFold(n_splits=5, shuffle=True, random_state=1111)
splits = kf.split(gliomas.drop("Grade", axis=1))
kf_cv_scores = {
"Fold": [],
"Precision_Class_0": [],
"Precision_Class_1": [],
"Recall_Class_0": [],
"Recall_Class_1": [],
"F1_Class_0": [],
"F1_Class_1": [],
}
# Initialize scaler
scaler = StandardScaler()
fold = 1
for train_index, val_index in splits:
# Setup the training and validation data
X_train, y_train = X.iloc[train_index], y.iloc[train_index]
X_val, y_val = X.iloc[val_index], y.iloc[val_index]
# Create copies to avoid modifying original data
X_train_scaled = X_train.copy()
X_val_scaled = X_val.copy()
# Scale only the Age_at_diagnosis column
X_train_scaled['Age_at_diagnosis'] = scaler.fit_transform(X_train[['Age_at_diagnosis']]).flatten()
X_val_scaled['Age_at_diagnosis'] = scaler.transform(X_val[['Age_at_diagnosis']]).flatten()
# Initialize the Logistic Regression model with cross-validation
lr_cv = LogisticRegression()
# Use the SCALED data for training and prediction
lr_cv.fit(X_train_scaled, y_train) # ← Now using scaled data
predictions = lr_cv.predict(X_val_scaled) # ← Now using scaled data
precision = precision_score(y_val, predictions, average=None)
recall = recall_score(y_val, predictions, average=None)
f1 = f1_score(y_val, predictions, average=None)
kf_cv_scores["Fold"].append(fold)
kf_cv_scores["Precision_Class_0"].append(round(float(precision[0]),3))
kf_cv_scores["Recall_Class_0"].append(round(float(recall[0]),3))
kf_cv_scores["Precision_Class_1"].append(round(float(precision[1]),3))
kf_cv_scores["Recall_Class_1"].append(round(float(recall[1]),3))
kf_cv_scores["F1_Class_0"].append(round(float(f1[0]),3))
kf_cv_scores["F1_Class_1"].append(round(float(f1[1]),3))
fold += 1
kf_cv_scores
{'Fold': [1, 2, 3, 4, 5],
'Precision_Class_0': [0.933, 0.948, 0.937, 0.862, 0.966],
'Precision_Class_1': [0.782, 0.78, 0.795, 0.824, 0.863],
'Recall_Class_0': [0.832, 0.785, 0.856, 0.862, 0.884],
'Recall_Class_1': [0.91, 0.947, 0.906, 0.824, 0.958],
'F1_Class_0': [0.88, 0.859, 0.894, 0.862, 0.923],
'F1_Class_1': [0.841, 0.855, 0.847, 0.824, 0.908]}
kf_cv_scores_df = pd.DataFrame(kf_cv_scores)
kf_cv_scores_df
Fold | Precision_Class_0 | Precision_Class_1 | Recall_Class_0 | Recall_Class_1 | F1_Class_0 | F1_Class_1 | |
---|---|---|---|---|---|---|---|
0 | 1 | 0.933 | 0.782 | 0.832 | 0.910 | 0.880 | 0.841 |
1 | 2 | 0.948 | 0.780 | 0.785 | 0.947 | 0.859 | 0.855 |
2 | 3 | 0.937 | 0.795 | 0.856 | 0.906 | 0.894 | 0.847 |
3 | 4 | 0.862 | 0.824 | 0.862 | 0.824 | 0.862 | 0.824 |
4 | 5 | 0.966 | 0.863 | 0.884 | 0.958 | 0.923 | 0.908 |
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
# Line plot showing variation across folds
ax1.plot(kf_cv_scores['Fold'], kf_cv_scores['Precision_Class_0'], '-o', label='Precision_Class_0')
ax1.plot(kf_cv_scores['Fold'], kf_cv_scores['Precision_Class_1'], '-*', label='Precision_Class_1')
ax1.set_xlabel('Fold')
ax1.set_xticks(kf_cv_scores['Fold'])
ax1.set_ylabel('Precision Score')
ax1.set_ylim(0, 1)
ax1.legend(loc='lower right')
ax1.grid(True, alpha=0.3)
ax2.plot(kf_cv_scores['Fold'], kf_cv_scores['Recall_Class_0'], '-s', label='Recall_Class_0')
ax2.plot(kf_cv_scores['Fold'], kf_cv_scores['Recall_Class_1'], '-*', label='Recall_Class_1')
ax2.set_xlabel('Fold')
ax2.set_xticks(kf_cv_scores['Fold'])
ax2.set_ylabel('Recall Score')
ax2.set_ylim(0, 1)
ax2.legend(loc='lower right')
ax2.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

cv_summary = kf_cv_scores_df.aggregate(
{
"Precision_Class_0": ["min", "max", "mean", "std"],
"Precision_Class_1": ["min", "max", "mean", "std"],
"Recall_Class_0": ["min", "max", "mean", "std"],
"Recall_Class_1": ["min", "max", "mean", "std"],
"F1_Class_0": ["min", "max", "mean", "std"],
"F1_Class_1": ["min", "max", "mean", "std"],
}
)
cv_summary
Precision_Class_0 | Precision_Class_1 | Recall_Class_0 | Recall_Class_1 | F1_Class_0 | F1_Class_1 | |
---|---|---|---|---|---|---|
min | 0.862000 | 0.780000 | 0.785000 | 0.824000 | 0.859000 | 0.824000 |
max | 0.966000 | 0.863000 | 0.884000 | 0.958000 | 0.923000 | 0.908000 |
mean | 0.929200 | 0.808800 | 0.843800 | 0.909000 | 0.883600 | 0.855000 |
std | 0.039682 | 0.035024 | 0.037725 | 0.052631 | 0.026197 | 0.031741 |
cv_summary.T
min | max | mean | std | |
---|---|---|---|---|
Precision_Class_0 | 0.862 | 0.966 | 0.9292 | 0.039682 |
Precision_Class_1 | 0.780 | 0.863 | 0.8088 | 0.035024 |
Recall_Class_0 | 0.785 | 0.884 | 0.8438 | 0.037725 |
Recall_Class_1 | 0.824 | 0.958 | 0.9090 | 0.052631 |
F1_Class_0 | 0.859 | 0.923 | 0.8836 | 0.026197 |
F1_Class_1 | 0.824 | 0.908 | 0.8550 | 0.031741 |
Interpretation guidelines:
Mean Performance:
Higher mean = better overall performance
Compare to baseline or business requirements
Standard Deviation:
Low SD = consistent performance across folds (good generalization)
High SD = variable performance (potential overfitting or data issues)
Range and Min/Max:
Small range = consistent across different data subsets
Large range = sensitive to specific data characteristics
Hyperparameter Tuning
Hyperparameters:
Set before training begins
Configure model architecture/behavior
Not learned; require manual tuning
Core hyperparameters in
LogisticRegression
:Penalty (loss): ‘l1’, ‘l2’, ‘elasticnet’, None (Default: ‘l2’)
Regularization strength: smaller values mean stronger regularization (Default: 1.0)
Solver (Algorithm to use for optimization):
newton-cg
,lbfgs
,liblinear
,sag
,saga
(Default:lbfgs
)
Core Hyperparameters can
Directly influence model’s learning process and capabilities
Control model complexity, learning behavior, and prevention of overfitting
Changes significantly impact accuracy, generalization, and prediction quality
Additional notes (regarding parameters used in following cell):
C
(Regularization Strength):Controls how much to penalize complex models
Smaller values = more regularization (simpler model)
Loss function (penalty):
l2
: Ridge regularization (shrinks coefficients toward zero)l1
: Lasso regularization (can set coefficients exactly to zero, good for feature selection)
solver:
liblinear
: Good for small datasets, works with both L1 and L2saga
: Good for large datasets, works with both L1 and L2ElasticNet
: CombinesL1
andL2
benefits (address multicollinearity handle groups of correlated genes)
max_iter:
Maximum number of iterations for the algorithm to converge
Increase if you get convergence warnings
# Get the features and target variable
X_train, X_test, y_train, y_test = train_test_split(
gliomas.drop("Grade", axis=1),
gliomas["Grade"],
test_size=0.3,
random_state=42,
stratify=gliomas["Grade"],
)
# Create the base model
lr = LogisticRegression(random_state=42, max_iter=10000)
# Define the parameter grid
# Define a VALID parameter grid
param_grid = [
# For l2 penalty (works with all solvers)
{
'penalty': ['l2'],
'C': [0.001, 0.01, 0.1, 1, 10, 100],
'solver': ['lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga']
},
# For l1 penalty (only works with liblinear and saga)
{
'penalty': ['l1'],
'C': [0.001, 0.01, 0.1, 1, 10, 100],
'solver': ['liblinear', 'saga']
},
# For elasticnet penalty (only works with saga)
# Note: l1_ratio must be between 0 and 1
{
'penalty': ['elasticnet'],
'l1_ratio': [0.1, 0.5, 0.9],
'solver': ['saga'],
'C': [0.1, 1, 10]
}
]
# Create GridSearchCV object
grid_search = GridSearchCV(
estimator=lr,
param_grid=param_grid,
cv=5, # 5-fold cross-validation
scoring='f1_macro', # Assessment metric
verbose=1, # Print progress
n_jobs=-1 # Use all CPU cores
)
# Fit the grid search
grid_search.fit(X_train, y_train)
# Access results for different metrics
print(f"Best parameters (based on f1):", grid_search.best_params_)
print(f"Best f1 score:", grid_search.best_score_)
# Make predictions with the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
# Evaluate the best model
print("\nTest set accuracy:", best_model.score(X_test, y_test))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
Fitting 5 folds for each of 51 candidates, totalling 255 fits
Best parameters (based on f1): {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
Best f1 score: 0.8708974463864253
Test set accuracy: 0.8690476190476191
Classification Report:
precision recall f1-score support
0.0 0.92 0.85 0.88 146
1.0 0.81 0.90 0.85 106
accuracy 0.87 252
macro avg 0.87 0.87 0.87 252
weighted avg 0.87 0.87 0.87 252
# Compute confusion matrix
cm = confusion_matrix(y_test, y_pred)
# Plot confusion matrix as heatmap
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()
