Context-Aware Nutritional Assessment¶

Predicting Food Processing Tiers through Machine Learning¶

Notebook: 03 - Model Training and Evaluation¶

AAI-590 Capstone Project - University of San Diego¶

Team Members:¶

  • Jamshed Nabizada
  • Swapnil Patil

Objective¶

This notebook trains and evaluates four candidate classifiers for multiclass NOVA food processing group prediction. The goal is to identify the most suitable model for further hyperparameter optimization by comparing validation performance under consistent evaluation conditions.

All models are trained on the same preprocessed features produced by the Feature Engineering notebook and evaluated using a shared metrics pipeline. Class imbalance is handled explicitly through sample/class weighting. The test set is reserved for final evaluation and is not used for model selection.

Models evaluated:

  1. Random Forest
  2. XGBoost
  3. MLP Neural Network
  4. LightGBM

Environment Setup and Configuration¶

In [1]:
%pip install -q -r requirements.txt lightgbm
Note: you may need to restart the kernel to use updated packages.
In [2]:
# suppress warnings for cleaner notebook output
import warnings
warnings.filterwarnings("ignore")

# force-reload src.modeling modules so code changes are always picked up
%load_ext autoreload
%autoreload 2

import json
import gc
import time
from pathlib import Path
from typing import Any, Dict, List, Optional
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from IPython.display import display
from src.modeling.config import (
    FEATURES_ARTIFACTS_DIRECTORY,
    MODEL_TRAINING_ARTIFACTS_DIRECTORY,
    API_MODEL_RESULTS_DIRECTORY,
    N_CLASSES,
    NOVA_LABELS,
    PRIMARY_METRIC,
    RANDOM_STATE
)
from src.modeling import (
    MetricsEvaluator,
    ModelPlotter,
    ModelResult,
    ModelRunner,
)

import shutil 

np.random.seed(RANDOM_STATE)

Load Prepared Data¶

Load the train/validation/test splits, feature names, class weights, and scaler produced by the Feature Engineering notebook (Notebook 02). A stratified train-validation-test split was used to preserve the class distribution across all NOVA groups, with the training set used for model fitting, the validation set for model comparison and selection, and the test set reserved for final unbiased evaluation.
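The two-stage stratified split described above can be sketched as follows. This is a minimal illustration with synthetic labels and an assumed 60/20/20 ratio (the real splits, sizes, and random state come from Notebook 02):

```python
# Sketch of a stratified 60/20/20 train/validation/test split.
# Illustrative only: the project's real splits are produced in Notebook 02.
import numpy as np
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(42)
X = rng.normal(size=(1000, 13))                            # 13 features, as in this project
y = rng.choice(4, size=1000, p=[0.12, 0.04, 0.20, 0.64])   # NOVA-like imbalance

# first stage: carve off 40% for validation + test, stratified on y
X_train, X_tmp, y_train, y_tmp = train_test_split(
    X, y, test_size=0.40, stratify=y, random_state=42
)
# second stage: split the held-out 40% evenly into validation and test
X_val, X_test, y_val, y_test = train_test_split(
    X_tmp, y_tmp, test_size=0.50, stratify=y_tmp, random_state=42
)

# class proportions are preserved (up to rounding) in every split
for name, arr in [("train", y_train), ("val", y_val), ("test", y_test)]:
    print(name, np.round(np.bincount(arr, minlength=4) / len(arr), 2))
```

Stratifying both stages is what keeps the NOVA proportions nearly identical across all three splits, as the distribution check below confirms on the real data.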

Expected Features: 13 total (10 original + 3 engineered)

  • Original features exclude sparse features removed in EDA (sodium, trans fat, mono/polyunsaturated fats, starch)
  • Engineered features: sugar_fiber_ratio, fat_protein_ratio, additives_per_energy
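The exact construction of these ratio features lives in the Feature Engineering notebook; the following is a hypothetical reconstruction (the +1 denominator guard against division by zero is an assumption, not necessarily the smoothing actually used):

```python
# Hypothetical reconstruction of the engineered ratio features.
# Assumption: a +1 denominator guard avoids division by zero; Notebook 02
# may use a different smoothing.
import pandas as pd

df = pd.DataFrame({
    "sugars_100g": [25.0, 2.0],
    "fiber_100g": [0.5, 4.0],
    "fat_100g": [18.0, 1.0],
    "proteins_100g": [6.0, 8.0],
    "additives_n": [7, 0],
    "energy_100g": [2100.0, 350.0],
})

df["sugar_fiber_ratio"] = df["sugars_100g"] / (df["fiber_100g"] + 1)
df["fat_protein_ratio"] = df["fat_100g"] / (df["proteins_100g"] + 1)
df["additives_per_energy"] = df["additives_n"] / (df["energy_100g"] + 1)
print(df[["sugar_fiber_ratio", "fat_protein_ratio", "additives_per_energy"]].round(3))
```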
In [3]:
# load saved splits and metadata from Feature Engineering notebook (Notebook 02)
splits = joblib.load(FEATURES_ARTIFACTS_DIRECTORY / "data_splits.joblib")

# unpack train/validation/test splits (unscaled and scaled)
X_train = splits["X_train"]
X_val = splits["X_val"]
X_test = splits["X_test"]
y_train = splits["y_train"]
y_val = splits["y_val"]
y_test = splits["y_test"]

# scaled versions needed for the MLP neural network
X_train_scaled = splits["X_train_scaled"]
X_val_scaled = splits["X_val_scaled"]
X_test_scaled = splits["X_test_scaled"]

# feature names and class weights computed during feature engineering
feature_names = splits["feature_names"]
class_weights_dict = splits["class_weights"]

# load the fitted StandardScaler for potential downstream use
scaler = joblib.load(FEATURES_ARTIFACTS_DIRECTORY / "standard_scaler.joblib")

# load feature engineering metadata for validation
with open(FEATURES_ARTIFACTS_DIRECTORY / "feature_engineering_metadata.json", "r") as f:
    fe_metadata = json.load(f)

# validate feature consistency between saved splits and metadata
expected_feature_count = len(fe_metadata["original_features"]) + len(fe_metadata["engineered_features"])
assert len(feature_names) == expected_feature_count, (
    f"Feature count mismatch: expected {expected_feature_count}, got {len(feature_names)}"
)

print("Data Loading Summary:")
print(f"  Features loaded: {len(feature_names)}")
print(f"  Original features: {len(fe_metadata['original_features'])}")
print(f"  Engineered features: {len(fe_metadata['engineered_features'])}")
print(f"  Features removed in EDA: {fe_metadata['features_removed_in_eda']}")
print(f"\nFeature names: {feature_names}")
print(f"\nData shapes:")
print(f"  Train: {X_train.shape}  Val: {X_val.shape}  Test: {X_test.shape}")
print(f"\nClass weights: {class_weights_dict}")
Data Loading Summary:
  Features loaded: 13
  Original features: 10
  Engineered features: 3
  Features removed in EDA: ['sodium_100g', 'trans_fat_100g', 'monounsaturated_fat_100g', 'polyunsaturated_fat_100g', 'starch_100g']

Feature names: ['energy_100g', 'fat_100g', 'carbohydrates_100g', 'sugars_100g', 'fiber_100g', 'proteins_100g', 'salt_100g', 'saturated_fat_100g', 'additives_n', 'added_sugars_100g', 'sugar_fiber_ratio', 'fat_protein_ratio', 'additives_per_energy']

Data shapes:
  Train: (184159, 13)  Val: (61386, 13)  Test: (61387, 13)

Class weights: {0: 2.0718490117723296, 1: 5.653355927208429, 2: 1.2284552455053392, 3: 0.3958165686577943}
In [4]:
# verify that class distributions are consistent across all three splits
# stratified splitting should produce similar NOVA group proportions
for name, y in [("Train", y_train), ("Val", y_val), ("Test", y_test)]:
    counts = y.value_counts().sort_index()
    pcts = (counts / len(y) * 100).round(1)
    dist = ", ".join(f"NOVA {i+1}: {c:,} ({pcts[i]}%)" for i, c in counts.items())
    print(f"{name:5s} ({len(y):,}) -- {dist}")
Train (184,159) -- NOVA 1: 22,221 (12.1%), NOVA 2: 8,144 (4.4%), NOVA 3: 37,478 (20.4%), NOVA 4: 116,316 (63.2%)
Val   (61,386) -- NOVA 1: 7,407 (12.1%), NOVA 2: 2,715 (4.4%), NOVA 3: 12,492 (20.3%), NOVA 4: 38,772 (63.2%)
Test  (61,387) -- NOVA 1: 7,408 (12.1%), NOVA 2: 2,714 (4.4%), NOVA 3: 12,493 (20.4%), NOVA 4: 38,772 (63.2%)

Class Imbalance Strategy¶

The NOVA target is heavily imbalanced toward NOVA 4. All models use class-weight-based handling to give higher importance to underrepresented classes during training. No resampling is applied, which avoids both data leakage and excessive memory usage.
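The "balanced" weighting scheme follows scikit-learn's inverse-frequency formula, w_c = n_samples / (n_classes * count_c), which can be verified directly on a NOVA-like label mix:

```python
# "balanced" class weights: w_c = n_samples / (n_classes * count_c).
# Rare classes (like NOVA 2) receive proportionally larger weights.
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

y = np.array([0] * 121 + [1] * 44 + [2] * 204 + [3] * 632)  # roughly the NOVA shares
classes = np.unique(y)

manual = len(y) / (len(classes) * np.bincount(y))
sklearn_w = compute_class_weight("balanced", classes=classes, y=y)

print(np.round(manual, 4))     # manual inverse-frequency weights
print(np.round(sklearn_w, 4))  # identical to sklearn's computation
```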

In [5]:
# compute sklearn-style balanced class weights (inverse frequency)
# this gives higher weight to underrepresented classes (e.g., NOVA 2)
classes = np.array(sorted(y_train.unique()))
sklearn_class_weights = compute_class_weight("balanced", classes=classes, y=y_train)
sklearn_cw_dict = dict(zip(classes, sklearn_class_weights))

# per-sample weights for train set (used by XGBoost and LightGBM via sample_weight)
train_sample_weights = compute_sample_weight("balanced", y_train)

print("Balanced class weights (sklearn):")
for cls, w in sklearn_cw_dict.items():
    print(f"  NOVA {cls + 1}: {w:.4f}")
Balanced class weights (sklearn):
  NOVA 1: 2.0719
  NOVA 2: 5.6532
  NOVA 3: 1.2284
  NOVA 4: 0.3958

Model Pipeline¶

Reusable classes for metrics computation, model execution, and plotting to ensure consistent evaluation across all models.
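The real implementations live in src.modeling; as a rough sketch of the runner pattern only (all names and fields below are illustrative assumptions, not the project's actual API):

```python
# Illustrative sketch of the fit/time/evaluate runner pattern.
# NOT the project's src.modeling implementation; names are assumptions.
import time
from dataclasses import dataclass, field
from typing import Any, Dict

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, f1_score
from sklearn.model_selection import train_test_split


@dataclass
class SketchResult:
    name: str
    metrics: Dict[str, float] = field(default_factory=dict)


def run_model(name: str, model: Any, X_tr, y_tr, X_va, y_va) -> SketchResult:
    """Fit, time, and evaluate one model with a shared metric set."""
    start = time.perf_counter()
    model.fit(X_tr, y_tr)
    elapsed = time.perf_counter() - start
    preds = model.predict(X_va)
    return SketchResult(name, {
        "Macro F1": f1_score(y_va, preds, average="macro"),
        "Balanced Accuracy": balanced_accuracy_score(y_va, preds),
        "Train Time (s)": elapsed,
    })


X, y = make_classification(n_samples=500, n_classes=3, n_informative=6, random_state=0)
X_tr, X_va, y_tr, y_va = train_test_split(X, y, stratify=y, random_state=0)
res = run_model("LogReg", LogisticRegression(max_iter=500), X_tr, y_tr, X_va, y_va)
print(res.name, {k: round(v, 3) for k, v in res.metrics.items()})
```

Centralizing fit, timing, and metric computation this way is what guarantees the four models below are compared under identical conditions.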

In [6]:
# instantiate shared evaluation objects used by all four models
# MetricsEvaluator: computes accuracy, F1, precision, recall, log loss, ROC-AUC, etc.
# ModelRunner: wraps fit/predict with timing and calls MetricsEvaluator
# ModelPlotter: generates confusion matrices, bar charts, heatmaps, etc.
evaluator = MetricsEvaluator(labels=NOVA_LABELS, n_classes=N_CLASSES)
runner = ModelRunner(evaluator=evaluator)
plotter = ModelPlotter()

# dictionary to collect ModelResult objects from each trained model
all_results: Dict[str, ModelResult] = {}

Random Forest Model¶

Random Forest is used because it is a strong and reliable baseline for structured tabular data like nutritional and additive features. It can capture nonlinear relationships, is robust to noise and overfitting, and provides feature importance, making it useful both for performance comparison and interpretability in multiclass food processing classification.

In [7]:
# Random Forest: tree-based, uses unscaled data (invariant to feature scaling)
# class_weight="balanced" counters NOVA 4 dominance by weighting samples inversely to class frequency
rf_model = RandomForestClassifier(
    n_estimators=400,         # number of trees in the ensemble
    max_depth=25,             # limit tree depth to control overfitting
    min_samples_split=5,      # minimum samples to split an internal node
    min_samples_leaf=2,       # minimum samples at a leaf node
    max_features="sqrt",      # random subset of features per split for diversity
    class_weight="balanced",  # auto-adjust weights inversely proportional to class frequency
    n_jobs=-1,                # use all CPU cores for parallel training
    random_state=RANDOM_STATE,
)

# train and evaluate using the shared ModelRunner pipeline
rf_result = runner.run(
    name="Random Forest",
    model=rf_model,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
)
all_results[rf_result.name] = rf_result

print(f"\nRandom Forest Model Results:")
print(f"  Macro F1: {rf_result.metrics['Macro F1']:.4f}")
print(f"  Balanced Accuracy: {rf_result.metrics['Balanced Accuracy']:.4f}")
print(f"  Train Time: {rf_result.metrics['Train Time (s)']:.1f}s")
display(rf_result.per_class)
Random Forest Model Results:
  Macro F1: 0.8637
  Balanced Accuracy: 0.8822
  Train Time: 11.7s
Class Precision Recall F1 Support
0 NOVA 1 0.810229 0.915350 0.859588 7407.0
1 NOVA 2 0.894214 0.927808 0.910701 2715.0
2 NOVA 3 0.744270 0.787704 0.765372 12492.0
3 NOVA 4 0.941509 0.897993 0.919236 38772.0
In [8]:
# confusion matrix and per-class F1 bar chart for Random Forest
plotter.plot_confusion_matrix(rf_result.confusion, NOVA_LABELS, "Random Forest -- Confusion Matrix (Validation)")
plotter.plot_per_class_f1(rf_result, NOVA_LABELS)

Interpretation¶

  • Random Forest delivered strong overall validation performance, with a Macro F1 of 0.8637 and Balanced Accuracy of 0.8822, showing that it handled the multiclass and imbalanced NOVA classification task effectively rather than relying only on majority-class accuracy.
  • The model performed best on NOVA 4 (F1 = 0.919) and NOVA 2 (F1 = 0.911), and also showed strong results for NOVA 1 (F1 = 0.860), indicating that nutritional and additive features provide clear signal for these classes.
  • NOVA 3 was the most difficult class (F1 = 0.765), with notable confusion against NOVA 4, suggesting overlap in feature patterns between moderately processed and ultra-processed foods, which remains a key challenge for further model improvement.

XGBoost Model¶

Gradient-boosted tree classifier with sample-weight-based imbalance handling. XGBoost is used because it is one of the most effective algorithms for imbalanced tabular classification, as it can capture complex nonlinear relationships among nutritional and additive features while often delivering stronger multiclass predictive performance than standard tree-based methods.

In [9]:
# XGBoost: gradient-boosted trees, uses unscaled data (tree-based, scale-invariant)
# sample_weight handles class imbalance (XGBoost does not support class_weight directly)
xgb_model = XGBClassifier(
    objective="multi:softprob",  # multiclass with probability outputs (needed for log loss)
    num_class=N_CLASSES,
    eval_metric="mlogloss",      # multiclass log loss for early stopping monitoring
    n_estimators=500,            # boosting rounds
    max_depth=8,                 # moderate tree depth; boosting builds capacity over sequential rounds
    learning_rate=0.1,           # step size shrinkage to prevent overfitting
    subsample=0.8,               # row subsampling per tree for regularization
    colsample_bytree=0.8,        # column subsampling per tree for regularization
    reg_alpha=0.1,               # L1 regularization on leaf weights
    reg_lambda=1.0,              # L2 regularization on leaf weights
    tree_method="hist",          # histogram-based splitting for faster training
    random_state=RANDOM_STATE,
    verbosity=0,                 # suppress XGBoost warnings
)

# train with sample weights and validation set for monitoring
xgb_result = runner.run(
    name="XGBoost",
    model=xgb_model,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    fit_kwargs={
        "sample_weight": train_sample_weights,
        "eval_set": [(X_val, y_val)],
        "verbose": False,
    },
)
all_results[xgb_result.name] = xgb_result

print(f"\nXGBoost Model Results:")
print(f"  Macro F1: {xgb_result.metrics['Macro F1']:.4f}")
print(f"  Balanced Accuracy: {xgb_result.metrics['Balanced Accuracy']:.4f}")
print(f"  Train Time: {xgb_result.metrics['Train Time (s)']:.1f}s")
display(xgb_result.per_class)
XGBoost Model Results:
  Macro F1: 0.8601
  Balanced Accuracy: 0.8897
  Train Time: 13.4s
Class Precision Recall F1 Support
0 NOVA 1 0.800869 0.920346 0.856461 7407.0
1 NOVA 2 0.887114 0.940700 0.913121 2715.0
2 NOVA 3 0.702131 0.828370 0.760044 12492.0
3 NOVA 4 0.956094 0.869416 0.910697 38772.0
In [10]:
# confusion matrix and per-class F1 bar chart for XGBoost
plotter.plot_confusion_matrix(xgb_result.confusion, NOVA_LABELS, "XGBoost -- Confusion Matrix (Validation)")
plotter.plot_per_class_f1(xgb_result, NOVA_LABELS)

Interpretation¶

  • XGBoost achieved the best Balanced Accuracy (0.8897) among the evaluated models, together with a Macro F1 of 0.8601, indicating strong effectiveness for this imbalanced multiclass NOVA classification task.
  • The model performed especially well for NOVA 2 (F1 = 0.913) and NOVA 4 (F1 = 0.911), while also maintaining solid results for NOVA 1 (F1 = 0.856), showing that the nutritional and additive features provide strong discriminatory signal for these classes.
  • NOVA 3 remained the most challenging class (F1 = 0.760), with confusion still occurring mainly with NOVA 4, but XGBoost lifted NOVA 3 recall to 0.828 (versus 0.788 for Random Forest), which is one reason it was selected as the strongest candidate for final tuning.

MLP Neural Network Model¶

The MLP used in this project is a feedforward neural network designed for structured tabular input data. It consists of an input layer that receives the normalized nutritional and additive features, followed by multiple fully connected hidden layers that learn nonlinear relationships between these variables, and a final output layer with softmax activation to predict the four NOVA classes.

To improve learning stability and generalization, the network uses ReLU activations in the hidden layers together with early stopping and L2 weight decay (scikit-learn's MLPClassifier does not implement dropout). This architecture is suitable for multiclass classification, but in this project it performed below the tree-based models, which is common for tabular datasets where boosted ensemble methods often capture feature interactions more effectively.

In [11]:
# MLP: feedforward neural network; requires scaled features (gradient-based training is sensitive to feature magnitudes)
# three hidden layers with decreasing width to progressively compress representations
mlp_model = MLPClassifier(
    hidden_layer_sizes=(256, 128, 64),  # 3-layer architecture: wide → narrow
    activation="relu",                   # ReLU avoids vanishing gradient problem
    solver="adam",                       # Adam optimizer with adaptive learning rates
    learning_rate="adaptive",            # reduce learning rate when loss plateaus
    learning_rate_init=1e-3,             # initial learning rate
    max_iter=200,                        # maximum training epochs
    early_stopping=True,                 # stop when validation loss stops improving
    validation_fraction=0.15,            # 15% of training data held out for early stopping
    n_iter_no_change=15,                 # patience: epochs without improvement before stopping
    batch_size=512,                      # mini-batch size for stochastic gradient descent
    random_state=RANDOM_STATE,
    verbose=False,
)

# train on SCALED data (MLP is sensitive to feature magnitude)
mlp_result = runner.run(
    name="MLP",
    model=mlp_model,
    X_train=X_train_scaled,
    y_train=y_train,
    X_val=X_val_scaled,
    y_val=y_val,
)
all_results[mlp_result.name] = mlp_result

print(f"\nMLP Model Results:")
print(f"  Macro F1: {mlp_result.metrics['Macro F1']:.4f}")
print(f"  Balanced Accuracy: {mlp_result.metrics['Balanced Accuracy']:.4f}")
print(f"  Train Time: {mlp_result.metrics['Train Time (s)']:.1f}s")
print(f"  Iterations: {mlp_model.n_iter_}")
display(mlp_result.per_class)
MLP Model Results:
  Macro F1: 0.8373
  Balanced Accuracy: 0.8452
  Train Time: 215.4s
  Iterations: 85
Class Precision Recall F1 Support
0 NOVA 1 0.792116 0.892534 0.839332 7407.0
1 NOVA 2 0.885984 0.898711 0.892302 2715.0
2 NOVA 3 0.740499 0.681636 0.709850 12492.0
3 NOVA 4 0.907649 0.908001 0.907825 38772.0
In [12]:
# training loss curve shows convergence behavior and early stopping point
plotter.plot_mlp_loss_curve(mlp_model.loss_curve_)

# confusion matrix and per-class F1 bar chart for MLP
plotter.plot_confusion_matrix(mlp_result.confusion, NOVA_LABELS, "MLP -- Confusion Matrix (Validation)")
plotter.plot_per_class_f1(mlp_result, NOVA_LABELS)
In [13]:
# free memory after MLP training
gc.collect()
Out[13]:
30512

Interpretation¶

  • The MLP produced reasonable overall performance, with a Macro F1 of 0.8373 and Balanced Accuracy of 0.8452, showing that a neural network can learn meaningful patterns from the structured nutrition data, but it did not outperform the tree-based models.
  • The model performed best on NOVA 4 (F1 = 0.908) and NOVA 2 (F1 = 0.892), while performance for NOVA 1 (F1 = 0.839) remained acceptable, indicating that the network captured general class structure but with weaker separation than XGBoost and Random Forest.
  • NOVA 3 was again the most difficult class (F1 = 0.710), with substantial confusion into NOVA 4, and the far longer training time (215.4 seconds) combined with lower validation performance suggests that the MLP was less efficient and less effective than the boosted tree approaches for this tabular classification problem.

LightGBM Model¶

Gradient-boosted tree classifier optimized for speed. LightGBM uses histogram-based splitting and supports direct class weight handling for multiclass problems.

In [14]:
# LightGBM callbacks for logging and early stopping
from lightgbm import log_evaluation, early_stopping

# LightGBM: histogram-based gradient boosting, uses unscaled data (tree-based)
# class_weight="balanced" handles imbalance natively within the loss function
lgbm_model = LGBMClassifier(
    objective="multiclass",
    num_class=N_CLASSES,
    metric="multi_logloss",       # validation metric for early stopping
    n_estimators=500,             # maximum boosting rounds
    max_depth=10,                 # slightly deeper trees than XGBoost
    learning_rate=0.1,            # step size shrinkage
    num_leaves=63,                # max leaves per tree (LightGBM-specific, leaf-wise growth)
    subsample=0.8,                # row subsampling for regularization
    colsample_bytree=0.8,         # column subsampling for regularization
    reg_alpha=0.1,                # L1 regularization
    reg_lambda=1.0,               # L2 regularization
    class_weight="balanced",      # auto-adjust weights inversely proportional to class frequency
    n_jobs=-1,                    # use all CPU cores
    random_state=RANDOM_STATE,
    verbose=-1,                   # suppress LightGBM training output
)

# train with early stopping on validation multiclass log loss
lgbm_result = runner.run(
    name="LightGBM",
    model=lgbm_model,
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    fit_kwargs={
        "eval_set": [(X_val, y_val)],
        "callbacks": [log_evaluation(period=-1), early_stopping(stopping_rounds=50)],
    },
)
all_results[lgbm_result.name] = lgbm_result

print(f"\nLightGBM Model Results:")
print(f"  Macro F1: {lgbm_result.metrics['Macro F1']:.4f}")
print(f"  Balanced Accuracy: {lgbm_result.metrics['Balanced Accuracy']:.4f}")
print(f"  Train Time: {lgbm_result.metrics['Train Time (s)']:.1f}s")
display(lgbm_result.per_class)
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's multi_logloss: 0.33919

LightGBM Model Results:
  Macro F1: 0.8578
  Balanced Accuracy: 0.8895
  Train Time: 13.4s
Class Precision Recall F1 Support
0 NOVA 1 0.795835 0.923586 0.854965 7407.0
1 NOVA 2 0.889739 0.942173 0.915206 2715.0
2 NOVA 3 0.690681 0.831172 0.754441 12492.0
3 NOVA 4 0.957285 0.861240 0.906726 38772.0
In [15]:
# confusion matrix and per-class F1 bar chart for LightGBM
plotter.plot_confusion_matrix(lgbm_result.confusion, NOVA_LABELS, "LightGBM -- Confusion Matrix (Validation)")
plotter.plot_per_class_f1(lgbm_result, NOVA_LABELS)

Interpretation¶

  • LightGBM produced strong overall validation performance, with a Macro F1 of 0.8578 and Balanced Accuracy of 0.8895, showing that it handled the imbalanced multiclass NOVA classification problem effectively and performed close to the top models.
  • The model performed especially well on NOVA 2 (F1 = 0.915), NOVA 4 (F1 = 0.907), and NOVA 1 (F1 = 0.855), indicating that boosted tree methods can distinguish the majority of food processing classes reliably using the engineered nutritional and additive features.
  • NOVA 3 remained the most difficult class (F1 = 0.754), with continued confusion mainly into NOVA 4, and although LightGBM was competitive, its class-balanced performance was still marginally below XGBoost on both Macro F1 and Balanced Accuracy, which is why XGBoost remained the stronger final candidate.

Comparative Performance Analysis¶

Side-by-side comparison of all candidate models on validation data using the same metrics pipeline.

In [16]:
# build summary dataframe from all collected ModelResult objects
summary_rows = []
for name, res in all_results.items():
    row = {"Model": name}
    row.update(res.metrics)
    summary_rows.append(row)

# sort by primary metric (Macro F1) in descending order — best model first
summary_df = pd.DataFrame(summary_rows).sort_values(PRIMARY_METRIC, ascending=False)
summary_df = summary_df.reset_index(drop=True)

# styled display: highlight the best value of each key metric
print("Model Comparison Summary (sorted by Macro F1)")
display(summary_df.style.format({
    c: "{:.4f}" for c in summary_df.columns if c != "Model"
}).highlight_max(
    subset=[PRIMARY_METRIC, "Balanced Accuracy", "Weighted F1"],
    color="#f7376d"
))
Model Comparison Summary (sorted by Macro F1)
  Model Accuracy Balanced Accuracy Macro Precision Macro Recall Macro F1 Weighted Precision Weighted Recall Weighted F1 Log Loss ROC-AUC (OVR) Train Time (s) Inference Time (s)
0 Random Forest 0.8790 0.8822 0.8476 0.8822 0.8637 0.8834 0.8790 0.8804 0.3176 0.9744 11.6900 0.4143
1 XGBoost 0.8704 0.8897 0.8366 0.8897 0.8601 0.8826 0.8704 0.8736 0.3310 0.9741 13.4400 0.2949
2 LightGBM 0.8662 0.8895 0.8334 0.8895 0.8578 0.8806 0.8662 0.8699 0.3392 0.9733 13.4200 1.2026
3 MLP 0.8597 0.8452 0.8316 0.8452 0.8373 0.8587 0.8597 0.8586 0.3489 0.9662 215.3700 0.1129
In [17]:
# grouped bar chart of key metrics
plotter.plot_comparison_bars(
    summary_df,
    metrics=["Macro F1", "Balanced Accuracy", "Weighted F1", "Macro Precision", "Macro Recall"],
    title="Model Comparison -- Key Metrics (Validation Set)",
)
In [18]:
# individual metric comparisons and training time efficiency
plotter.plot_metric_comparison(summary_df, "Macro F1", "Macro F1 Comparison")
plotter.plot_metric_comparison(summary_df, "Balanced Accuracy", "Balanced Accuracy Comparison")
plotter.plot_training_time(summary_df)
In [19]:
# heatmap of per-class F1 across all models — reveals which classes each model struggles with
plotter.plot_per_class_f1_heatmap(all_results, NOVA_LABELS)

Interpretation¶

  • Overall, the tree-based ensemble models outperformed the MLP, confirming that this structured tabular nutrition dataset is better suited to boosting and bagging approaches than a neural network.
  • XGBoost achieved the best Balanced Accuracy (0.8897) together with a strong Macro F1 (0.8601), making it the strongest candidate on the class-balanced metrics that matter most for this imbalanced multiclass problem.
  • Random Forest posted the highest raw accuracy (0.8790) and Macro F1 (0.8637), but its lower Balanced Accuracy (0.8822) indicates weaker recall on the minority classes than XGBoost.
  • LightGBM was highly competitive, nearly matching XGBoost's Balanced Accuracy (0.8895 vs. 0.8897), but remained slightly behind in Macro F1 (0.8578) and overall class discrimination.
  • MLP delivered the weakest overall performance and required by far the longest training time (215.4 seconds), suggesting that deep learning was less efficient and less effective than ensemble tree methods for this project.
  • Across all four models, NOVA 3 was consistently the most difficult class, while NOVA 4 was the easiest, indicating that moderately processed foods have greater overlap with other categories, especially ultra-processed foods.
  • From an efficiency perspective, XGBoost offered a strong tradeoff: 13.4 seconds of training with the fastest inference among the tree models (0.29 s), making it the most suitable model to carry forward for final hyperparameter tuning.

Final Model Selection¶

The best model is selected using the following criteria hierarchy:

  1. Primary: Balanced Accuracy (mean per-class recall; weights all classes equally, critical under imbalance)
  2. Secondary: Macro F1
  3. Tertiary: Minority class F1 performance, Log Loss, training efficiency
In [20]:
# select the best model -- the dataset is imbalanced (NOVA 4 heavy), so selection
# re-sorts by Balanced Accuracy to ensure fair performance across all NOVA classes
best_row = summary_df.sort_values("Balanced Accuracy", ascending=False).iloc[0]
best_model_name = best_row["Model"]
best_result = all_results[best_model_name]

# print selection summary
print(f"Selected model: {best_model_name}")
print(f"  Macro F1:          {best_row['Macro F1']:.4f}")
print(f"  Balanced Accuracy: {best_row['Balanced Accuracy']:.4f}")
print(f"  Weighted F1:       {best_row['Weighted F1']:.4f}")
if "Log Loss" in best_row:
    print(f"  Log Loss:          {best_row['Log Loss']:.4f}")
print(f"  Train Time:        {best_row['Train Time (s)']:.1f}s")
Selected model: XGBoost
  Macro F1:          0.8601
  Balanced Accuracy: 0.8897
  Weighted F1:       0.8736
  Log Loss:          0.3310
  Train Time:        13.4s
In [21]:
# minority class analysis — identify the weakest class for targeted tuning
print("Per-class F1 for selected model:")
display(best_result.per_class)

# find the class with the lowest F1 score (most room for improvement)
min_f1_class = best_result.per_class.loc[best_result.per_class["F1"].idxmin()]
print(f"\nWeakest class: {min_f1_class['Class']} (F1={min_f1_class['F1']:.4f})")
print("This class should receive focused attention during hyperparameter tuning.")
Per-class F1 for selected model:
Class Precision Recall F1 Support
0 NOVA 1 0.800869 0.920346 0.856461 7407.0
1 NOVA 2 0.887114 0.940700 0.913121 2715.0
2 NOVA 3 0.702131 0.828370 0.760044 12492.0
3 NOVA 4 0.956094 0.869416 0.910697 38772.0
Weakest class: NOVA 3 (F1=0.7600)
This class should receive focused attention during hyperparameter tuning.

Selection Rationale¶

The model ranked first by Balanced Accuracy is selected as the best candidate for the next stage. Balanced Accuracy is the primary selection metric because it equally penalizes poor recall on any class, which is essential given the severe NOVA 4 dominance in the dataset: Random Forest edged ahead on Macro F1 (0.8637 vs. 0.8601), but XGBoost achieved a clearly higher Balanced Accuracy (0.8897 vs. 0.8822). Macro F1 serves as a secondary check to confirm that strong recall is not achieved at the cost of precision or by trivially predicting the majority class.

The selected model should advance to detailed hyperparameter tuning in the next notebook, with particular attention to improving performance on the weakest class identified above.
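To illustrate why class-balanced metrics drive selection under this kind of imbalance, consider a degenerate baseline that always predicts the majority class on a NOVA-like label distribution:

```python
# A degenerate "always predict NOVA 4" classifier on a NOVA-like label mix:
# accuracy looks respectable, but Balanced Accuracy and Macro F1 expose it.
import numpy as np
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score

y_true = np.array([0] * 12 + [1] * 4 + [2] * 20 + [3] * 64)  # ~NOVA class shares (%)
y_pred = np.full_like(y_true, 3)                             # majority-class baseline

print("Accuracy:         ", accuracy_score(y_true, y_pred))           # 0.64
print("Balanced Accuracy:", balanced_accuracy_score(y_true, y_pred))  # 0.25
print("Macro F1:         ", round(f1_score(y_true, y_pred, average="macro"), 3))
```

The baseline reaches 64% accuracy while recalling nothing from three of the four classes, which is exactly the failure mode the selection criteria above are designed to rule out.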


Export Results¶

In [22]:
# create output directory if it doesn't exist
if MODEL_TRAINING_ARTIFACTS_DIRECTORY.exists():
    shutil.rmtree(MODEL_TRAINING_ARTIFACTS_DIRECTORY)

MODEL_TRAINING_ARTIFACTS_DIRECTORY.mkdir(parents=True, exist_ok=True)

# save comparison summary as CSV for easy reference and downstream analysis
summary_path = MODEL_TRAINING_ARTIFACTS_DIRECTORY / "model_comparison_summary.csv"
summary_df.to_csv(summary_path, index=False)
print(f"Saved: {summary_path}")

# save selected model metadata as JSON for the hyperparameter tuning notebook
selection_meta = {
    "selected_model": best_model_name,
    "primary_metric": PRIMARY_METRIC,
    "primary_metric_value": round(float(best_row[PRIMARY_METRIC]), 4),
    "balanced_accuracy": round(float(best_row["Balanced Accuracy"]), 4),
    "weighted_f1": round(float(best_row["Weighted F1"]), 4),
    "all_models": list(all_results.keys()),
    "feature_count": len(feature_names),
    "n_classes": N_CLASSES,
    "original_features": fe_metadata["original_features"],
    "engineered_features": fe_metadata["engineered_features"],
    "features_removed_in_eda": fe_metadata["features_removed_in_eda"],
}
meta_path = MODEL_TRAINING_ARTIFACTS_DIRECTORY / "selected_model_meta.json"
with open(meta_path, "w") as f:
    json.dump(selection_meta, f, indent=2)
print(f"Saved: {meta_path}")

# save per-class metrics for all models (useful for detailed analysis and reporting)
per_class_path = MODEL_TRAINING_ARTIFACTS_DIRECTORY / "per_class_metrics.csv"
per_class_all = []
for name, res in all_results.items():
    if res.per_class is not None:
        df_pc = res.per_class.copy()
        df_pc.insert(0, "Model", name)
        per_class_all.append(df_pc)
if per_class_all:
    pd.concat(per_class_all, ignore_index=True).to_csv(per_class_path, index=False)
    print(f"Saved: {per_class_path}")
Saved: results\training\model_comparison_summary.csv
Saved: results\training\selected_model_meta.json
Saved: results\training\per_class_metrics.csv

Summary¶

XGBoost was selected as the final model because it best aligns with the project objective of predicting food processing tiers accurately across all NOVA classes. Since this is an imbalanced multiclass classification problem, the model was chosen based on class-balanced performance rather than accuracy alone, and XGBoost achieved the strongest overall results with the highest Macro F1 and Balanced Accuracy among the evaluated models.

It is also the most suitable technical choice for this project because the dataset is structured and tabular, with nutritional, additive, and engineered features that are well handled by gradient-boosted trees. In addition to strong predictive performance, XGBoost provides useful explainability through feature importance and SHAP-based interpretation, making the results easier to justify for both the report and the intended food quality assessment use case. Although some class-level bias remains, especially for NOVA 3, the model offers the best overall balance of performance, interpretability, and practical value.
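As a sketch of the kind of feature-importance readout mentioned above, model-agnostic permutation importance can be computed for any fitted classifier. The model and features below are synthetic stand-ins; in the project this would target the fitted XGBoost model and the 13 real features:

```python
# Model-agnostic importance sketch via permutation importance.
# Synthetic stand-in data and model; the project would pass the fitted
# XGBClassifier and the real feature matrix instead.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

X, y = make_classification(n_samples=400, n_features=5, n_informative=3,
                           random_state=0)
model = RandomForestClassifier(n_estimators=50, random_state=0).fit(X, y)

# shuffle each feature in turn and measure the drop in macro F1
result = permutation_importance(model, X, y, n_repeats=5, random_state=0,
                                scoring="f1_macro")
order = np.argsort(result.importances_mean)[::-1]
for i in order:
    print(f"feature_{i}: {result.importances_mean[i]:.3f}")
```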

AI Use Disclosure¶

AI assistance tools were used in the following capacities during the development of this project:

  • Research and Planning: AI tools were used to search for code snippets, explore modeling approaches, and identify applicable machine learning techniques (e.g., NOVA classification strategies, anomaly detection methods, SHAP explainability patterns). These suggestions were reviewed, adapted, and validated by the team before implementation.

  • Copy Editing and Report Refinement: An AI assistant was used to copy edit written documentation and the final report draft, check for redundancy, and provide feedback on areas that could be tightened up or that required additional clarification. The prompt provided to the tool included context about the project purpose, target audience (academic evaluators for the AAI-590 Capstone), and formatting guidelines.

All AI-generated suggestions were critically reviewed by the team. Final decisions regarding methodology, implementation, and written content remain the work of the authors.