Objective¶
This notebook identifies nutritionally anomalous food products by measuring how much they deviate from a whole-food baseline (NOVA 1, unprocessed foods).
All three anomaly models are trained exclusively on NOVA 1 samples from the training set. Products with high anomaly scores deviate significantly from what a whole food looks like nutritionally, independent of their NOVA classification tier.
Environment Setup and Configuration¶
%pip install -q -r requirements.txt
Note: you may need to restart the kernel to use updated packages.
# suppress warnings for cleaner notebook output
import warnings
warnings.filterwarnings("ignore")
# force-reload src modules so code changes are always picked up
%load_ext autoreload
%autoreload 2
import json
import time
from pathlib import Path
from typing import Dict, List, Tuple
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from IPython.display import display
from src.modeling import (
FEATURES_ARTIFACTS_DIRECTORY,
NOVA_LABELS,
RANDOM_STATE,
MODEL_TRAINING_ARTIFACTS_DIRECTORY,
MODEL_TUNING_ARTIFACTS_DIRECTORY,
API_MODEL_RESULTS_DIRECTORY,
ANOMALY_DETECTION_ARTIFACTS_DIRECTORY
)
from src.anomaly import (
NOVA_COLORS,
MODEL_COLORS,
AnomalyTrainer,
AnomalyScorer,
AnomalyPlotter,
predict_nova_and_anomaly,
)
np.random.seed(RANDOM_STATE)
plt.rcParams.update({"figure.dpi": 100, "font.size": 11})
# shared instances used throughout the notebook
trainer = AnomalyTrainer(random_state=RANDOM_STATE)
scorer = AnomalyScorer(ae_percentile=95)
plotter = AnomalyPlotter()
print("Environment ready.")
The autoreload extension is already loaded. To reload it, use: %reload_ext autoreload
Environment ready.
Load Data and Artifacts¶
Load the train/validation/test splits produced by the Feature Engineering notebook (Notebook 02), and the fitted StandardScaler. The scaled feature matrices are used for all anomaly models, because distance-based and kernel methods are sensitive to feature scale.
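The scale sensitivity is easy to demonstrate on a toy example (hypothetical numbers, not the notebook's data): without standardisation, Euclidean distances are dominated by whichever feature has the largest raw range.

```python
import numpy as np
from sklearn.preprocessing import StandardScaler

# Two features on very different raw scales, e.g. energy (kcal/100g)
# vs an additives count — illustrative values only.
X = np.array([[500.0,  2.0],
              [100.0,  2.0],
              [500.0, 10.0]], dtype=float)

# Unscaled distances: the energy column dominates.
d_energy = np.linalg.norm(X[0] - X[1])     # rows differ only in energy → 400.0
d_additives = np.linalg.norm(X[0] - X[2])  # rows differ only in additives → 8.0

# After standardisation, the two single-feature gaps become comparable.
Xs = StandardScaler().fit_transform(X)
ds_energy = np.linalg.norm(Xs[0] - Xs[1])
ds_additives = np.linalg.norm(Xs[0] - Xs[2])

print(d_energy, d_additives)    # energy gap 50x larger before scaling
print(ds_energy, ds_additives)  # equal z-score gaps after scaling
```

This is why the scaled matrices (via the fitted StandardScaler) feed all three detectors, while the unscaled ones are kept for inspection.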
# load saved splits and metadata from Feature Engineering notebook (Notebook 02)
splits = joblib.load(FEATURES_ARTIFACTS_DIRECTORY / "data_splits.joblib")
# unscaled splits (used for feature inspection and the final inference function)
X_train = splits["X_train"]
X_val = splits["X_val"]
X_test = splits["X_test"]
y_train = splits["y_train"]
y_val = splits["y_val"]
y_test = splits["y_test"]
# scaled versions — used for all three anomaly detection models
X_train_scaled = splits["X_train_scaled"]
X_val_scaled = splits["X_val_scaled"]
X_test_scaled = splits["X_test_scaled"]
# feature names and the fitted scaler
feature_names = splits["feature_names"]
scaler = joblib.load(FEATURES_ARTIFACTS_DIRECTORY / "standard_scaler.joblib")
# load feature engineering metadata to validate consistency
with open(FEATURES_ARTIFACTS_DIRECTORY / "feature_engineering_metadata.json") as f:
fe_meta = json.load(f)
# validate feature consistency
expected_num_features = len(fe_meta["original_features"]) + len(fe_meta["engineered_features"])
assert len(feature_names) == expected_num_features, \
f"Feature count mismatch: expected {expected_num_features}, got {len(feature_names)}"
print("Feature Engineering Metadata Validation")
print(f"Original features: {len(fe_meta['original_features'])}")
print(f"Engineered features: {len(fe_meta['engineered_features'])}")
print(f"Total features: {len(feature_names)}")
print(f"Removed from EDA: {', '.join(fe_meta['features_removed_in_eda'])}")
# combine train + val scaled data to give anomaly models more NOVA 1 examples
X_trainval_scaled = np.vstack([X_train_scaled, X_val_scaled])
y_trainval = pd.concat([y_train, y_val], ignore_index=True)
# verify y values: nova labels are 0-indexed (0=NOVA1, 1=NOVA2, 2=NOVA3, 3=NOVA4)
print(f"\ny unique values: {sorted(y_test.unique())} → NOVA groups 1-4 (0-indexed)")
print(f"\nFeatures ({len(feature_names)}):")
for feat in feature_names:
print(f" - {feat}")
print(f"\nData Splits:")
print(f" Train: {X_train.shape} Val: {X_val.shape} Test: {X_test.shape}")
Feature Engineering Metadata Validation
Original features: 10
Engineered features: 3
Total features: 13
Removed from EDA: sodium_100g, trans_fat_100g, monounsaturated_fat_100g, polyunsaturated_fat_100g, starch_100g

y unique values: [np.int64(0), np.int64(1), np.int64(2), np.int64(3)] → NOVA groups 1-4 (0-indexed)

Features (13):
 - energy_100g
 - fat_100g
 - carbohydrates_100g
 - sugars_100g
 - fiber_100g
 - proteins_100g
 - salt_100g
 - saturated_fat_100g
 - additives_n
 - added_sugars_100g
 - sugar_fiber_ratio
 - fat_protein_ratio
 - additives_per_energy

Data Splits:
 Train: (184159, 13) Val: (61386, 13) Test: (61387, 13)
# Extract NOVA 1 (whole-food) training data
# All three anomaly models train ONLY on NOVA 1 samples.
# y == 0 corresponds to NOVA 1 (unprocessed foods) in the 0-indexed encoding.
nova1_mask_trainval = (y_trainval == 0)
nova1_mask_test = (y_test == 0)
X_nova1_train = X_trainval_scaled[nova1_mask_trainval.values] # training baseline
X_nova1_test = X_test_scaled[nova1_mask_test.values] # NOVA 1 test portion
print(f"NOVA 1 training samples (train + val combined): {X_nova1_train.shape[0]:,}")
print(f"NOVA 1 test samples: {X_nova1_test.shape[0]:,}")
print()
# split test set by NOVA group for per-group evaluation
for g in range(4):
mask = (y_test.values == g)
count = mask.sum()
pct = count / len(y_test) * 100
print(f" NOVA {g+1} test: {count:,} samples ({pct:.1f}%)")
NOVA 1 training samples (train + val combined): 29,628
NOVA 1 test samples: 7,408

 NOVA 1 test: 7,408 samples (12.1%)
 NOVA 2 test: 2,714 samples (4.4%)
 NOVA 3 test: 12,493 samples (20.4%)
 NOVA 4 test: 38,772 samples (63.2%)
NOVA 1 Whole-Food Baseline Profile¶
Before training the anomaly models, we characterise the NOVA 1 distribution: this is the "normal" the detectors learn. Products that deviate sharply from these statistics will receive high anomaly scores.
plotter.plot_feature_distributions(X_test_scaled, y_test.values, feature_names)
Model 1: Autoencoder (MLPRegressor Auto-Associative Network)¶
An autoencoder is a neural network trained to reconstruct its own input through a compressed bottleneck representation. When trained on NOVA 1 data only, the network learns a compact encoding of what whole foods look like nutritionally.
Architecture: Input (n_features) → 64 → 32 → 8 (bottleneck) → 32 → 64 → Output (n_features)
Anomaly score: Mean squared error between the original feature vector and its reconstruction. The model reconstructs NOVA 1 patterns accurately but struggles with out-of-distribution NOVA 4 products, resulting in higher reconstruction error for anomalous samples.
Threshold: 95th percentile of NOVA 1 training reconstruction errors (i.e., products more extreme than 95% of whole foods are flagged).
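The setup above can be sketched end to end on synthetic data (the notebook's actual training is encapsulated in AnomalyTrainer.train_autoencoder, whose exact hyperparameters may differ): fit an MLPRegressor with its own input as the regression target, score by per-sample MSE, and threshold at the 95th percentile of training errors.

```python
import numpy as np
from sklearn.neural_network import MLPRegressor

rng = np.random.default_rng(0)
# Synthetic stand-ins for scaled 13-dim features: a "normal" cluster
# playing the role of NOVA 1, and a shifted out-of-distribution cluster.
X_normal = rng.normal(0.0, 1.0, size=(2000, 13))
X_shifted = rng.normal(3.0, 1.0, size=(200, 13))

# Auto-associative network with the bottleneck architecture described above
ae = MLPRegressor(hidden_layer_sizes=(64, 32, 8, 32, 64),
                  activation="relu", early_stopping=True,
                  max_iter=200, random_state=0)
ae.fit(X_normal, X_normal)  # target == input → learns to reconstruct

def reconstruction_mse(model, X):
    """Per-sample mean squared reconstruction error."""
    return np.mean((X - model.predict(X)) ** 2, axis=1)

# Threshold at the 95th percentile of training errors, so ~5% of the
# normal data is flagged by construction.
train_err = reconstruction_mse(ae, X_normal)
threshold = np.percentile(train_err, 95)

# Out-of-distribution samples reconstruct poorly and exceed the threshold
flags = reconstruction_mse(ae, X_shifted) > threshold
print(f"flagged {flags.mean():.0%} of shifted samples")
```

The 8-unit bottleneck is what forces the network to learn a compressed encoding: inputs that fit the training distribution survive the compression; inputs that do not come back distorted.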
autoencoder = trainer.train_autoencoder(X_nova1_train)
Training Autoencoder on NOVA 1 samples … Done in 6.4s | iterations: 80 | best val loss: 0.961703
plotter.plot_loss_curve(autoencoder)
# fit threshold on NOVA 1 training errors, then score the full test set
ae_threshold = scorer.fit_ae_threshold(autoencoder, X_nova1_train)
ae_scores_test, ae_anomalies = scorer.score_autoencoder(autoencoder, X_test_scaled)
Autoencoder anomaly threshold (95th pct of NOVA 1 train errors): 0.003760
Autoencoder — anomalies flagged: 43,988 / 61,387 (71.7%)
plotter.plot_score_violin_and_rate(
scores = ae_scores_test,
anomalies = ae_anomalies,
y_test_0indexed = y_test.values,
title_prefix = "Autoencoder",
score_ylabel = "Reconstruction Error (MSE)",
threshold = ae_threshold,
)
plotter.plot_per_feature_reconstruction_error(
autoencoder = autoencoder,
X_test_scaled = X_test_scaled,
y_test_0indexed = y_test.values,
feature_names = feature_names,
)
Top 5 features with highest reconstruction error for NOVA 4:
 additives_n 2.084405
 added_sugars_100g 0.258714
 carbohydrates_100g 0.127027
 fat_100g 0.109643
 saturated_fat_100g 0.078277
Model 2: Isolation Forest¶
Isolation Forest detects anomalies by recursively partitioning the feature space with random splits. Anomalous points are isolated in fewer splits (shorter average path length) because they are sparse and different from the bulk of the training distribution.
Key advantages:
- No distance metric required — works well in higher-dimensional spaces
- Efficient: O(n log n) training and scoring
- contamination=0.05 sets the expected fraction of outliers in the NOVA 1 training set
Anomaly score: Negated decision_function output (higher = more anomalous). The natural threshold is 0: samples with decision_function < 0 are flagged as anomalies.
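A compact sketch of this behaviour on synthetic data, with hyperparameters chosen to mirror those reported by the trainer output (the notebook's model is built inside AnomalyTrainer.train_isolation_forest):

```python
import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.default_rng(0)
X_normal = rng.normal(0.0, 1.0, size=(2000, 13))   # stand-in for NOVA 1
X_outlier = rng.normal(4.0, 1.0, size=(100, 13))   # far-away cluster

# contamination=0.05 shifts the decision threshold so roughly 5% of the
# training data falls on the anomalous side of decision_function.
iso = IsolationForest(n_estimators=300, max_samples=256,
                      contamination=0.05, random_state=0)
iso.fit(X_normal)

# decision_function: > 0 inlier, < 0 outlier; negate so higher = more anomalous
scores = -iso.decision_function(X_outlier)
flags = iso.predict(X_outlier) == -1  # predict() returns -1 for anomalies
print(f"flagged {flags.mean():.0%} of the outlier cluster")
```

Because isolation depends on split counts rather than pairwise distances, the method stays cheap even as dimensionality grows.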
iso_forest = trainer.train_isolation_forest(X_nova1_train)
Training Isolation Forest on NOVA 1 samples … Done in 0.9s | n_estimators: 300 | max_samples per tree: 256
if_scores_test, if_anomalies = scorer.score_isolation_forest(iso_forest, X_test_scaled)
Isolation Forest — anomalies flagged: 36,563 / 61,387 (59.6%)
plotter.plot_score_violin_and_rate(
scores = if_scores_test,
anomalies = if_anomalies,
y_test_0indexed = y_test.values,
title_prefix = "Isolation Forest",
score_ylabel = "Anomaly Score (decision_function)",
)
Model 3: One-Class SVM (OCSVM)¶
One-Class SVM maps training data into a high-dimensional feature space via an RBF kernel and finds the maximum-margin hyperplane separating the origin from the data. New samples falling outside this boundary are flagged as anomalies.
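A minimal sketch on synthetic data, with the kernel and nu value mirroring those reported by the trainer output (the notebook's model comes from AnomalyTrainer.train_one_class_svm). nu upper-bounds the fraction of training points treated as outliers and lower-bounds the fraction kept as support vectors.

```python
import numpy as np
from sklearn.svm import OneClassSVM

rng = np.random.default_rng(0)
X_normal = rng.normal(0.0, 1.0, size=(2000, 13))   # stand-in for NOVA 1
X_outlier = rng.normal(4.0, 1.0, size=(100, 13))   # far-away cluster

# RBF kernel wraps a tight boundary around the training cloud
svm = OneClassSVM(kernel="rbf", nu=0.05, gamma="scale").fit(X_normal)

# Negate decision_function so higher = more anomalous, matching the plots
scores = -svm.decision_function(X_outlier)
flags = svm.predict(X_outlier) == -1  # -1 marks samples outside the boundary
print(f"flagged {flags.mean():.0%} of the outlier cluster")
```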
oc_svm = trainer.train_one_class_svm(X_nova1_train)
Training One-Class SVM on NOVA 1 samples … Done in 1.5s | kernel: rbf | nu: 0.05 | gamma: scale | support vectors: 1,495
svm_scores_test, svm_anomalies = scorer.score_one_class_svm(oc_svm, X_test_scaled)
One-Class SVM — anomalies flagged: 25,554 / 61,387 (41.6%)
plotter.plot_score_violin_and_rate(
scores = svm_scores_test,
anomalies = svm_anomalies,
y_test_0indexed = y_test.values,
title_prefix = "One-Class SVM",
score_ylabel = "Anomaly Score (−decision_function)",
)
Ensemble: Majority Vote and Score Fusion¶
Combining all three models reduces the individual false-positive rate and produces a more robust anomaly signal:
- Binary ensemble (majority vote): a sample is flagged if ≥ 2 out of 3 models independently flag it as an anomaly
- Continuous ensemble score: each model's raw score is MinMax-normalised to [0, 1] over the test batch, then averaged — giving a calibrated 0→1 "distance from whole-food baseline"
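Both combination rules are simple to write down. The sketch below uses toy scores (hypothetical values, not the notebook's); the notebook's actual fusion happens inside AnomalyScorer.build_ensemble, whose internals may differ.

```python
import numpy as np

def fuse_scores(ae, iso, svm):
    """MinMax-normalise each model's scores to [0, 1] over the batch, then average."""
    stacked = np.vstack([ae, iso, svm]).astype(float)
    mins = stacked.min(axis=1, keepdims=True)
    spans = stacked.max(axis=1, keepdims=True) - mins
    normed = (stacked - mins) / np.where(spans == 0, 1.0, spans)
    return normed.mean(axis=0)

def majority_vote(ae_flag, if_flag, svm_flag):
    """A sample is anomalous if at least 2 of the 3 models flag it."""
    votes = ae_flag.astype(int) + if_flag.astype(int) + svm_flag.astype(int)
    return votes, votes >= 2

# Toy raw scores for 4 products (each model has its own scale)
ae_s  = np.array([0.1, 5.0, 0.2, 9.0])
if_s  = np.array([-0.1, 0.2, -0.05, 0.3])
svm_s = np.array([-20.0, 30.0, -5.0, 80.0])

fused = fuse_scores(ae_s, if_s, svm_s)
votes, flagged = majority_vote(ae_s > 1.0, if_s > 0.0, svm_s > 0.0)
print(fused.round(3), votes, flagged)
```

Normalising per model before averaging prevents the One-Class SVM's large raw magnitudes from drowning out the autoencoder's small MSE values.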
flags_df = scorer.build_ensemble(
ae_anomalies = ae_anomalies,
if_anomalies = if_anomalies,
svm_anomalies = svm_anomalies,
ae_scores = ae_scores_test,
if_scores = if_scores_test,
svm_scores = svm_scores_test,
y_test_0indexed = y_test.values,
)
Ensemble anomaly rates by NOVA group (majority vote ≥ 2/3):
 NOVA 1: 3.7% (275 / 7,408)
 NOVA 2: 90.6% (2,458 / 2,714)
 NOVA 3: 35.6% (4,442 / 12,493)
 NOVA 4: 76.5% (29,653 / 38,772)
Overall ensemble anomaly rate: 60.0%
plotter.plot_cross_model_anomaly_rates(
all_flags = [ae_anomalies, if_anomalies, svm_anomalies,
flags_df["ensemble_anomaly"].values],
y_test_0indexed = y_test.values,
)
plotter.plot_jaccard_heatmap(
binary_flags = [ae_anomalies, if_anomalies, svm_anomalies],
)
Autoencoder ∩ Isolation Forest: 34,807 samples flagged by both
Autoencoder ∩ One-Class SVM: 23,982 samples flagged by both
Isolation Forest ∩ One-Class SVM: 23,999 samples flagged by both
plotter.plot_ensemble_score_distribution(flags_df, y_test.values)
Anomalous Product Analysis¶
We inspect the top-ranked anomalies from the ensemble to understand why they deviate from the whole-food baseline. Comparing mean feature values for
- NOVA 1 whole foods,
- the top-20 most anomalous products, and
- NOVA 4 ultra-processed products

reveals which nutritional dimensions drive the anomaly signal.
# Annotated test DataFrame
X_test_df = pd.DataFrame(X_test_scaled, columns=feature_names)
X_test_df["nova_group"] = y_test.values + 1
X_test_df["ensemble_anomaly"] = flags_df["ensemble_anomaly"].values
X_test_df["ensemble_score"] = flags_df["ensemble_score"].values
X_test_df["ae_score"] = ae_scores_test
X_test_df["if_score"] = if_scores_test
X_test_df["svm_score"] = svm_scores_test
X_test_df["votes"] = flags_df["votes"].values
# top-20 anomalous products by ensemble score (all 3 models must agree → votes == 3)
top_20 = X_test_df.sort_values("ensemble_score", ascending=False).head(20)
print("Top 20 most anomalous products — NOVA group distribution:")
print(top_20["nova_group"].value_counts().sort_index().rename("count").to_frame().to_string())
print()
print("Top 10 anomalies (ensemble score + model votes):")
display(top_20[["nova_group", "ensemble_score", "votes", "ae_score", "if_score", "svm_score"]].head(10).round(4))
Top 20 most anomalous products — NOVA group distribution:
            count
nova_group
3               1
4              19
Top 10 anomalies (ensemble score + model votes):
| | nova_group | ensemble_score | votes | ae_score | if_score | svm_score |
|---|---|---|---|---|---|---|
| 42888 | 4 | 0.9214 | 3 | 1091.6477 | 0.1759 | 106.5768 |
| 35082 | 4 | 0.6691 | 3 | 8.0377 | 0.2873 | 106.5611 |
| 29509 | 4 | 0.6642 | 3 | 1.3701 | 0.2832 | 106.5758 |
| 907 | 4 | 0.6466 | 3 | 1.5305 | 0.2732 | 101.5060 |
| 22580 | 4 | 0.6449 | 3 | 0.4943 | 0.2571 | 106.3021 |
| 42461 | 4 | 0.6412 | 3 | 0.4538 | 0.2520 | 106.2473 |
| 36211 | 4 | 0.6370 | 3 | 0.8090 | 0.2504 | 104.7195 |
| 10344 | 4 | 0.6364 | 3 | 1.0928 | 0.2449 | 106.2521 |
| 27906 | 4 | 0.6355 | 3 | 3.0689 | 0.2491 | 104.1095 |
| 10658 | 4 | 0.6331 | 3 | 2.0013 | 0.2432 | 105.1434 |
plotter.plot_feature_profile(X_test_df, top_20, feature_names)
Top 5 features by absolute deviation (Anomaly − NOVA 1):
| | Anomaly − NOVA1 | NOVA4 − NOVA1 |
|---|---|---|
| additives_n | 8.515 | 1.113 |
| sugar_fiber_ratio | 3.494 | 0.288 |
| sugars_100g | 2.725 | 0.536 |
| fat_100g | 2.294 | 0.292 |
| saturated_fat_100g | 2.215 | 0.471 |
Combined Inference: NOVA Classification + Anomaly Detection¶
This section demonstrates how both use cases operate together in a single inference pipeline. Given raw (unscaled) nutritional feature vectors, the function:
- Scales the input using the fitted StandardScaler from Notebook 02
- Predicts the NOVA processing tier and confidence via the XGBoost classifier (Use Case 1)
- Scores each product with all three anomaly models (Use Case 2)
- Returns a unified result with the NOVA prediction, per-model anomaly scores, majority-vote anomaly flag, and an ensemble score
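The four steps above can be sketched as a single function. This is an illustrative simplification under the stated assumptions; the notebook's actual implementation is src.anomaly.predict_nova_and_anomaly, whose internals (score signs, column names) may differ.

```python
import numpy as np
import pandas as pd

def predict_nova_and_anomaly_sketch(X_raw, scaler, nova_classifier,
                                    autoencoder, ae_threshold,
                                    iso_forest, oc_svm):
    """Hypothetical sketch of the combined NOVA + anomaly pipeline."""
    X = scaler.transform(np.asarray(X_raw, dtype=float))   # 1) scale raw inputs

    nova_pred = nova_classifier.predict(X) + 1             # 2) NOVA tier (1-4)
    nova_confidence = nova_classifier.predict_proba(X).max(axis=1)

    # 3) per-model anomaly scores, oriented so higher = more anomalous
    ae_score = np.mean((X - autoencoder.predict(X)) ** 2, axis=1)
    if_score = -iso_forest.decision_function(X)
    svm_score = -oc_svm.decision_function(X)

    # 4) majority vote plus a MinMax-fused ensemble score over this batch
    votes = ((ae_score > ae_threshold).astype(int)
             + (if_score > 0).astype(int)
             + (svm_score > 0).astype(int))
    fused = np.zeros(len(X))
    for s in (ae_score, if_score, svm_score):
        span = s.max() - s.min()
        fused += (s - s.min()) / (span if span > 0 else 1.0)
    fused /= 3.0

    return pd.DataFrame({
        "nova_pred": nova_pred, "nova_confidence": nova_confidence,
        "ae_score": ae_score, "if_score": if_score, "svm_score": svm_score,
        "ensemble_score": fused, "anomaly_votes": votes,
        "is_anomalous": votes >= 2,
    })
```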
# Load the trained NOVA classifier (XGBoost from Notebook 04)
nova_classifier = XGBClassifier()
nova_classifier.load_model(FEATURES_ARTIFACTS_DIRECTORY / "xgb_baseline.json")
print(f"NOVA classifier loaded from {FEATURES_ARTIFACTS_DIRECTORY / 'xgb_baseline.json'}")
NOVA classifier loaded from results\features\xgb_baseline.json
# run combined inference on the full test set using the imported function
combined_df = predict_nova_and_anomaly(
X_raw = X_test.values,
scaler = scaler,
nova_classifier = nova_classifier,
autoencoder = autoencoder,
ae_threshold = ae_threshold,
iso_forest = iso_forest,
oc_svm = oc_svm,
)
combined_df["true_nova"] = y_test.values + 1
print("Combined inference results (first 10 rows):")
display(combined_df.head(10))
Combined inference results (first 10 rows):
| | nova_pred | nova_confidence | ae_score | if_score | svm_score | ensemble_score | anomaly_votes | is_anomalous | true_nova |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 4 | 0.9939 | 0.123682 | 0.005758 | -12.184005 | 0.2196 | 2 | True | 4 |
| 1 | 1 | 0.9073 | 0.002661 | 0.035214 | -6.204489 | 0.2529 | 1 | False | 1 |
| 2 | 4 | 0.6071 | 0.008547 | 0.026295 | 23.172528 | 0.3080 | 3 | True | 4 |
| 3 | 4 | 0.7757 | 0.009509 | -0.026850 | -40.989087 | 0.1363 | 1 | False | 4 |
| 4 | 4 | 0.9919 | 0.289508 | 0.138132 | 9.759058 | 0.3590 | 3 | True | 4 |
| 5 | 4 | 0.9993 | 0.371213 | 0.162320 | 50.123095 | 0.4605 | 3 | True | 4 |
| 6 | 3 | 0.9171 | 0.589789 | 0.143428 | 57.364467 | 0.4624 | 3 | True | 4 |
| 7 | 4 | 0.9993 | 0.357665 | 0.125528 | 25.384071 | 0.3828 | 3 | True | 4 |
| 8 | 4 | 0.9949 | 0.214717 | 0.032837 | 4.054722 | 0.2727 | 3 | True | 4 |
| 9 | 4 | 0.9893 | 0.121216 | 0.026472 | 1.032826 | 0.2618 | 3 | True | 4 |
# Summary table: NOVA confidence + anomaly rate by true NOVA group
summary = (
combined_df.groupby("true_nova")
.agg(
n_samples = ("is_anomalous", "count"),
anomaly_rate_pct = ("is_anomalous", lambda x: round(x.mean() * 100, 1)),
mean_ens_score = ("ensemble_score", lambda x: round(x.mean(), 4)),
mean_nova_conf = ("nova_confidence", lambda x: round(x.mean(), 4)),
nova_correct_pct = ("nova_pred",
lambda x: round((x == combined_df.loc[x.index, "true_nova"]).mean() * 100, 1)),
)
.rename(index={g: f"NOVA {g}" for g in range(1, 5)})
.rename(columns={
"n_samples": "N Samples",
"anomaly_rate_pct": "Anomaly Rate (%)",
"mean_ens_score": "Mean Ensemble Score",
"mean_nova_conf": "Mean NOVA Confidence",
"nova_correct_pct": "NOVA Accuracy (%)",
})
)
print("Combined Inference Summary by True NOVA Group")
display(summary)
Combined Inference Summary by True NOVA Group
| true_nova | N Samples | Anomaly Rate (%) | Mean Ensemble Score | Mean NOVA Confidence | NOVA Accuracy (%) |
|---|---|---|---|---|---|
| NOVA 1 | 7408 | 3.7 | 0.1214 | 0.8888 | 93.3 |
| NOVA 2 | 2714 | 90.6 | 0.4437 | 0.9624 | 96.3 |
| NOVA 3 | 12493 | 35.6 | 0.1961 | 0.8065 | 83.2 |
| NOVA 4 | 38772 | 76.5 | 0.3001 | 0.8746 | 83.1 |
plotter.plot_confidence_vs_anomaly_score(combined_df)
Save Artifacts¶
# export anomaly models to backend/models for API consumption
API_MODEL_RESULTS_DIRECTORY.mkdir(parents=True, exist_ok=True)
print("Exporting anomaly detection artifacts to backend/models for API...")
# save all trained anomaly models and metadata in a single joblib file
anomaly_artifact = {
"autoencoder": autoencoder, # MLPRegressor auto-associative network
"isolation_forest": iso_forest, # IsolationForest (contamination=0.05)
"one_class_svm": oc_svm, # OneClassSVM (rbf, nu=0.05)
"ae_threshold": float(ae_threshold), # 95th percentile of NOVA 1 train errors
"ae_train_errors": scorer.ae_train_errors, # NOVA 1 training reconstruction errors
"feature_names": feature_names, # column order for inference
"nova1_train_size": int(X_nova1_train.shape[0]),
"features": {
"total_count": len(feature_names),
"original_features": fe_meta["original_features"],
"engineered_features": fe_meta["engineered_features"],
"removed_features": fe_meta["features_removed_in_eda"],
},
}
artifact_path = API_MODEL_RESULTS_DIRECTORY / "anomaly_models.joblib"
joblib.dump(anomaly_artifact, artifact_path)
print(f"Saved: {artifact_path}")
# save test-set anomaly scores as a parquet for downstream analysis
ANOMALY_DETECTION_ARTIFACTS_DIRECTORY.mkdir(parents=True, exist_ok=True)
scores_out = pd.DataFrame({
"nova_group": y_test.values + 1,
"ae_score": ae_scores_test,
"if_score": if_scores_test,
"svm_score": svm_scores_test,
"ae_anomaly": ae_anomalies,
"if_anomaly": if_anomalies,
"svm_anomaly": svm_anomalies,
"ensemble_score": flags_df["ensemble_score"].values,
"ensemble_anomaly": flags_df["ensemble_anomaly"].values,
"anomaly_votes": flags_df["votes"].values,
})
scores_path = ANOMALY_DETECTION_ARTIFACTS_DIRECTORY / "anomaly_scores_test.parquet"
scores_out.to_parquet(scores_path, index=False)
print(f"Saved: {scores_path}")
Exporting anomaly detection artifacts to backend/models for API...
Saved: backend\models\anomaly_models.joblib
Saved: results\anomaly_detection\anomaly_scores_test.parquet
AI Use Disclosure¶
AI assistance tools were used in the following capacities during the development of this project:
Research and Planning: AI tools were used to search for code snippets, explore modeling approaches, and identify applicable machine learning techniques (e.g., NOVA classification strategies, anomaly detection methods, SHAP explainability patterns). These suggestions were reviewed, adapted, and validated by the team before implementation.
Copy Editing and Report Refinement: An AI assistant was used to copy edit written documentation and the final report draft, check for redundancy, and provide feedback on areas that could be tightened up or that required additional clarification. The prompt provided to the tool included context about the project purpose, target audience (academic evaluators for the AAI-590 Capstone), and formatting guidelines.
All AI-generated suggestions were critically reviewed by the team. Final decisions regarding methodology, implementation, and written content remain the work of the authors.