import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
accuracy_score,
classification_report,
confusion_matrix,
ConfusionMatrixDisplay,
)
from xgboost import XGBClassifier
from pathlib import Path
import shutil
import json
# Load the cleaned Open Food Facts dataset produced by the EDA pipeline.
df = pd.read_parquet("dataset/processed/open_food_facts_cleaned.parquet")
print(f"Dataset shape: {df.shape}")
df.head(3)
Dataset shape: (306932, 23)
| code | product_name | brands | categories_en | countries_en | ingredients_text | ingredients_analysis_tags | additives_n | additives_tags | nutriscore_score | ... | pnns_groups_2 | energy_100g | fat_100g | saturated_fat_100g | carbohydrates_100g | sugars_100g | added_sugars_100g | fiber_100g | proteins_100g | salt_100g | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0000101903605 | Le Petit Bridé | Teyssier | Meats and their products,Meats,Prepared meats,... | France | Viande de porc 151g pour 100g, sel, sucres (LA... | en:palm-oil-free,en:non-vegan,en:vegetarian-st... | 0.0 | NaN | NaN | ... | Processed meat | 802.000000 | 11.0 | 3.0 | 0.9 | 0.5 | 12.50000 | 0.0 | 19.0 | 1.50 |
| 1 | 0000105753078 | Bread | NaN | Plant-based foods and beverages,Plant-based fo... | United States | Water, white flour, honey, corn syrup, yeast, ... | en:palm-oil-free,en:non-vegan,en:vegetarian-st... | 0.0 | NaN | NaN | ... | Bread | 1451.219512 | 3.5 | 0.5 | 54.0 | 3.0 | 16.40625 | 3.9 | 9.3 | 0.71 |
| 2 | 00001065 | boisson à l'aloe vera | Herbalife | Plant-based foods and beverages,Beverages,Meal... | France,Spain | NGREDIENTES: agua, ácido cítrico anhidro, sabo... | en:palm-oil-free,en:vegan-status-unknown,en:ve... | 3.0 | en:e211,en:e330,en:e955 | NaN | ... | Artificially sweetened beverages | 51.000000 | 0.0 | 0.0 | 3.0 | 0.0 | 0.00000 | 0.0 | 0.0 | 0.10 |
3 rows × 23 columns
Feature Selection¶
We select the key numerical features that represent nutritional composition and processing indicators. These features will be used as inputs to the machine learning models, while nova_group is used as the target variable.
# --- Feature selection ------------------------------------------------------
# Target variable: NOVA processing group (expected values 1-4).
TARGET = "nova_group"

# Numeric nutritional features used as model inputs.
NUMERIC_FEATURES = [
    "energy_100g",
    "fat_100g",
    "carbohydrates_100g",
    "sugars_100g",
    "fiber_100g",
    "proteins_100g",
    "salt_100g",
    "saturated_fat_100g",
    "additives_n",
    "added_sugars_100g",
]

# Only keep features that exist in the loaded parquet; warn about the rest.
available = [c for c in NUMERIC_FEATURES if c in df.columns]
missing = [c for c in NUMERIC_FEATURES if c not in df.columns]
if missing:
    print(f"Note: {len(missing)} features not in dataset (re-run EDA to add): {missing}")
NUMERIC_FEATURES = available

# Modeling dataframe: selected features plus the target column.
df_model = df[NUMERIC_FEATURES + [TARGET]].copy()

# Validate target: drop rows with missing or unexpected nova_group values
# (isin is False for NaN, so missing targets are dropped here too).
valid_nova = {1, 2, 3, 4}
n_before = len(df_model)
df_model = df_model[df_model[TARGET].isin(valid_nova)]
print(f"Rows dropped (invalid/missing nova_group): {n_before - len(df_model):,}")

# Drop rows where ALL numeric features are NaN (no usable data).
n_before = len(df_model)
df_model = df_model.dropna(subset=NUMERIC_FEATURES, how="all")
print(f"Rows dropped (all features NaN): {n_before - len(df_model):,}")

# Fill remaining NaN in sparse columns with 0 (absence of data = no recorded
# value). Of the candidates below only added_sugars_100g is currently in
# NUMERIC_FEATURES; the rest are kept in case the EDA re-adds them.
sparse_cols = [
    c for c in [
        "trans_fat_100g",
        "added_sugars_100g",
        "monounsaturated_fat_100g",
        "polyunsaturated_fat_100g",
        "starch_100g",
    ] if c in NUMERIC_FEATURES
]
if sparse_cols:
    df_model[sparse_cols] = df_model[sparse_cols].fillna(0)

# Fill any remaining NaNs in core nutrient columns with their median.
core_cols = [c for c in NUMERIC_FEATURES if c not in sparse_cols and c != "nutriscore_score"]
for col in core_cols:
    if df_model[col].isna().any():
        df_model[col] = df_model[col].fillna(df_model[col].median())

# Verify no NaN remains. Fix: raise explicitly instead of `assert`, which is
# silently stripped when Python runs with -O.
if df_model[NUMERIC_FEATURES].isna().sum().sum() != 0:
    raise ValueError("NaN values remain in features!")
if df_model[TARGET].isna().sum() != 0:
    raise ValueError("NaN values remain in target!")

print(f"\nFeatures used ({len(NUMERIC_FEATURES)}): {NUMERIC_FEATURES}")
print(f"Model dataset shape: {df_model.shape}")
print(f"NaN remaining: {df_model.isna().sum().sum()}")
df_model.head(3)
Rows dropped (invalid/missing nova_group): 0 Rows dropped (all features NaN): 0 Features used (10): ['energy_100g', 'fat_100g', 'carbohydrates_100g', 'sugars_100g', 'fiber_100g', 'proteins_100g', 'salt_100g', 'saturated_fat_100g', 'additives_n', 'added_sugars_100g'] Model dataset shape: (306932, 11) NaN remaining: 0
| energy_100g | fat_100g | carbohydrates_100g | sugars_100g | fiber_100g | proteins_100g | salt_100g | saturated_fat_100g | additives_n | added_sugars_100g | nova_group | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 802.000000 | 11.0 | 0.9 | 0.5 | 0.0 | 19.0 | 1.50 | 3.0 | 0.0 | 12.50000 | 4 |
| 1 | 1451.219512 | 3.5 | 54.0 | 3.0 | 3.9 | 9.3 | 0.71 | 0.5 | 0.0 | 16.40625 | 3 |
| 2 | 51.000000 | 0.0 | 3.0 | 0.0 | 0.0 | 0.0 | 0.10 | 0.0 | 3.0 | 0.00000 | 4 |
Feature Engineering¶
To improve our model performance, we create the additional features below.
- sugar_fiber_ratio
- fat_protein_ratio
- additives_per_energy
- trans_fat_ratio — trans fat as a share of total fat (marker of hydrogenation)
- unsaturated_fat_ratio — mono+poly unsaturated fat share of total fat
# Guard against division by zero: nutrition-label values are often exactly 0,
# so ratio denominators get a small epsilon (or a 0 -> 1 replacement below).
epsilon = 1e-5

# Sugar-to-fiber ratio (processed foods tend to be high sugar, low fiber).
# NOTE(review): this ratio uses replace(0, 1) for zero fiber while the other
# ratios use epsilon; replace(0, 1) keeps the ratio bounded by the sugar value
# instead of exploding — confirm this asymmetry is intentional.
df_model["sugar_fiber_ratio"] = df_model["sugars_100g"] / (df_model["fiber_100g"].replace(0, 1))

# Fat-to-protein ratio (helps distinguish nutrient balance);
# processed foods tend to have higher fat relative to protein content.
df_model["fat_protein_ratio"] = df_model["fat_100g"] / (df_model["proteins_100g"] + epsilon)

# Additive density relative to calories;
# a higher value can indicate many additives for the caloric count.
df_model["additives_per_energy"] = df_model["additives_n"] / (df_model["energy_100g"] + 1)

# Trans fat as a share of total fat — marker of industrial hydrogenation.
# Only computed when the column survived EDA.
if "trans_fat_100g" in df_model.columns:
    df_model["trans_fat_ratio"] = df_model["trans_fat_100g"] / (df_model["fat_100g"] + epsilon)

# Unsaturated fat share of total fat — natural fats vs processed.
if "monounsaturated_fat_100g" in df_model.columns and "polyunsaturated_fat_100g" in df_model.columns:
    df_model["unsaturated_fat_ratio"] = (
        df_model["monounsaturated_fat_100g"] + df_model["polyunsaturated_fat_100g"]
    ) / (df_model["fat_100g"] + epsilon)

# Cap engineered ratios at the 99th percentile to prevent extreme outliers
# from dominating the model (near-zero denominators can produce huge values).
ENGINEERED_RATIOS = [
    c for c in [
        "sugar_fiber_ratio",
        "fat_protein_ratio",
        "additives_per_energy",
        "trans_fat_ratio",
        "unsaturated_fat_ratio",
    ] if c in df_model.columns
]
for col in ENGINEERED_RATIOS:
    cap = df_model[col].quantile(0.99)
    n_capped = (df_model[col] > cap).sum()
    df_model[col] = df_model[col].clip(upper=cap)
    if n_capped > 0:
        print(f" {col}: capped {n_capped:,} values at {cap:.2f}")

print(f"\nUpdated dataset shape: {df_model.shape}")
df_model.head(3)
sugar_fiber_ratio: capped 2,990 values at 160.00 additives_per_energy: capped 3,068 values at 0.50 Updated dataset shape: (306932, 14)
| energy_100g | fat_100g | carbohydrates_100g | sugars_100g | fiber_100g | proteins_100g | salt_100g | saturated_fat_100g | additives_n | added_sugars_100g | nova_group | sugar_fiber_ratio | fat_protein_ratio | additives_per_energy | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 802.000000 | 11.0 | 0.9 | 0.5 | 0.0 | 19.0 | 1.50 | 3.0 | 0.0 | 12.50000 | 4 | 0.500000 | 0.578947 | 0.000000 |
| 1 | 1451.219512 | 3.5 | 54.0 | 3.0 | 3.9 | 9.3 | 0.71 | 0.5 | 0.0 | 16.40625 | 3 | 0.769231 | 0.376344 | 0.000000 |
| 2 | 51.000000 | 0.0 | 3.0 | 0.0 | 0.0 | 0.0 | 0.10 | 0.0 | 3.0 | 0.00000 | 4 | 0.000000 | 0.000000 | 0.057692 |
Post-Engineering Correlation Check¶
After creating engineered features, we verify that no pair of features is excessively correlated. Highly correlated features (|r| > 0.85) would add redundancy rather than signal and could potentially be dropped.
# Post-engineering correlation check to detect redundant features.
corr = df_model.drop(columns=[TARGET]).corr()

# Lower-triangle heatmap (the upper triangle is masked to avoid duplication).
fig, ax = plt.subplots(figsize=(14, 10))
upper_mask = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, mask=upper_mask, annot=True, fmt=".2f", cmap="coolwarm",
            center=0, vmin=-1, vmax=1, linewidths=0.5, ax=ax)
ax.set_title("Feature Correlation Matrix (Post-Engineering)", fontsize=14)
plt.tight_layout()
plt.show()

# Collect all feature pairs with |r| > 0.85 — candidates for removal.
cols = corr.columns
high_corr = [
    (cols[i], cols[j], round(corr.iloc[i, j], 3))
    for i in range(len(cols))
    for j in range(i + 1, len(cols))
    if abs(corr.iloc[i, j]) > 0.85
]

if high_corr:
    print("Highly correlated pairs (|r| > 0.85):")
    for a, b, r in high_corr:
        print(f" {a} <-> {b}: {r}")
else:
    print("No feature pairs exceed |r| = 0.85.")
No feature pairs exceed |r| = 0.85.
Class Distribution¶
Before splitting, we check the target variable distribution to understand the class imbalance severity. This informs whether class weights are needed during model training.
# Target class distribution: counts and percentages per NOVA group.
class_counts = df_model[TARGET].value_counts().sort_index()
class_pct = (class_counts / len(df_model) * 100).round(1)

fig, (ax_bar, ax_txt) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: bar chart of raw counts, one fixed colour per NOVA group.
nova_palette = ["#2ecc71", "#3498db", "#f39c12", "#e74c3c"]
class_counts.plot(kind="bar", ax=ax_bar, color=nova_palette)
ax_bar.set_title("NOVA Group Distribution (Counts)")
ax_bar.set_xlabel("NOVA Group")
ax_bar.set_ylabel("Count")
ax_bar.tick_params(axis="x", rotation=0)

# Right panel: text summary of counts and percentages.
for row, (nova, count) in enumerate(class_counts.items()):
    y_pos = 0.8 - row * 0.2
    ax_txt.text(0.2, y_pos, f"NOVA {nova}:", fontsize=12, fontweight="bold",
                transform=ax_txt.transAxes)
    ax_txt.text(0.6, y_pos, f"{count:,} ({class_pct[nova]}%)", fontsize=12,
                transform=ax_txt.transAxes)
ax_txt.axis("off")
ax_txt.set_title("Class Balance Summary")
plt.tight_layout()
plt.show()

# Inverse-frequency class weights, re-keyed to 0-indexed labels for XGBoost.
total = len(df_model)
n_classes = len(class_counts)
class_weights = {}
for nova, count in class_counts.items():
    class_weights[int(nova - 1)] = total / (n_classes * count)
print("Class weights (0-indexed for XGBoost):", {k: round(v, 3) for k, v in class_weights.items()})
Class weights (0-indexed for XGBoost): {0: 2.072, 1: 5.653, 2: 1.228, 3: 0.396}
Train/Test/Validation Split¶
# Split features and target. Fix: use the TARGET constant defined in the
# feature-selection cell instead of a second hard-coded column name, so the
# two cells cannot drift apart.
X = df_model.drop(columns=[TARGET])
y = df_model[TARGET]

# XGBoost requires class labels to start at 0, so shift NOVA 1-4 -> 0-3.
y = y - 1

# Step 1: stratified split into Train (60%) and Temp (40%).
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.4,
    stratify=y,
    random_state=42,
)

# Step 2: split Temp evenly into Validation (20%) and Test (20%).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

print("Train shape:", X_train.shape)
print("Validation shape:", X_val.shape)
print("Test shape:", X_test.shape)
Train shape: (184159, 13) Validation shape: (61386, 13) Test shape: (61387, 13)
Feature Scaling¶
- Feature scaling is applied after splitting our data, so that statistics from the training set do not leak into the validation or test sets.
- Standardization is required because we will use a Deep Learning model to perform anomaly detection of the processed foods.
# Standardize features: fit statistics on the training split only (no leakage
# into validation/test), then apply the same transform to all three splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

for split_name, split_arr in (("train", X_train_scaled),
                              ("validation", X_val_scaled),
                              ("test", X_test_scaled)):
    print(f"Scaled {split_name} shape:", split_arr.shape)
Scaled train shape: (184159, 13) Scaled validation shape: (61386, 13) Scaled test shape: (61387, 13)
XGBoost Baseline Model¶
- XGBoost classifier trained as supervised baseline for predicting NOVA processing groups.
- Because the model captures non-linear relationships among features, it is well suited to our complex nutrient interactions.
- XGBoost is tree-based and invariant to feature scaling, so we use the unscaled training data here. The scaled data is reserved for the deep learning anomaly detection model.
- Class weights (`sample_weight`) are applied to address the NOVA 4 class imbalance identified above.
# Per-sample weights derived from the inverse-frequency class weights above
# (y_train is already 0-indexed, matching the class_weights keys).
train_sample_weights = y_train.map(class_weights)

# Baseline XGBoost classifier with class-weight-aware training.
xgb_model = XGBClassifier(
    objective="multi:softmax",
    num_class=4,
    eval_metric="mlogloss",
    random_state=42,
)

# Fix: the notebook text states the UNSCALED data is used for XGBoost (trees
# are invariant to feature scaling; the scaled arrays are reserved for the
# deep-learning model), but the code trained on the scaled arrays. Train and
# predict on the raw splits — this also preserves feature names on the model.
xgb_model.fit(X_train, y_train, sample_weight=train_sample_weights)
y_val_pred = xgb_model.predict(X_val)
print("Baseline XGBoost model trained.")
Baseline XGBoost model trained.
Model Evaluation¶
Classification report and confusion matrix to assess baseline XGBoost performance, particularly across imbalanced NOVA groups.
# Human-readable class labels, index-aligned with the 0-based y encoding.
nova_labels = [f"NOVA {g}" for g in range(1, 5)]

# Per-class precision/recall/F1 on the validation split.
print("Validation Set — Classification Report")
print(classification_report(y_val, y_val_pred, target_names=nova_labels))

# Overall accuracy.
val_acc = accuracy_score(y_val, y_val_pred)
print(f"Overall Validation Accuracy: {val_acc:.4f}")

# Confusion matrix, plotted with the NOVA labels.
cm = confusion_matrix(y_val, y_val_pred)
fig, ax_cm = plt.subplots(figsize=(8, 6))
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=nova_labels).plot(
    cmap="Blues", ax=ax_cm, values_format="d"
)
ax_cm.set_title("XGBoost Baseline — Confusion Matrix (Validation Set)")
plt.tight_layout()
plt.show()

# Per-class accuracy (diagonal over row sums, i.e. per-class recall).
per_class_acc = cm.diagonal() / cm.sum(axis=1)
for label, acc in zip(nova_labels, per_class_acc):
    print(f" {label} accuracy: {acc:.3f}")
Validation Set — Classification Report
precision recall f1-score support
NOVA 1 0.77 0.93 0.84 7407
NOVA 2 0.83 0.95 0.88 2715
NOVA 3 0.65 0.82 0.73 12492
NOVA 4 0.96 0.83 0.89 38772
accuracy 0.85 61386
macro avg 0.80 0.88 0.84 61386
weighted avg 0.87 0.85 0.85 61386
Overall Validation Accuracy: 0.8479
NOVA 1 accuracy: 0.929 NOVA 2 accuracy: 0.950 NOVA 3 accuracy: 0.824 NOVA 4 accuracy: 0.833
Feature Importance¶
XGBoost feature importance shows which nutritional features and engineered ratios contribute most to predicting NOVA processing groups. This helps us understand the model's decision-making and validates that the engineered features add predictive value.
# Feature importances from the trained XGBoost model, sorted ascending so the
# most important feature lands at the top of the horizontal bar chart.
importance = pd.Series(
    xgb_model.feature_importances_,
    index=X_train.columns,
).sort_values(ascending=True)

fig, ax = plt.subplots(figsize=(10, 8))
importance.plot(kind="barh", ax=ax, color="#3498db")
ax.set_title("XGBoost Feature Importance (Gain)", fontsize=14)
ax.set_xlabel("Importance Score")
plt.tight_layout()
plt.show()

# Print the ten most important features, highest first.
print("Top 10 features:")
for feat, score in importance.sort_values(ascending=False).head(10).items():
    print(f" {feat}: {score:.4f}")
Top 10 features: additives_n: 0.4656 fat_100g: 0.0982 added_sugars_100g: 0.0950 proteins_100g: 0.0747 salt_100g: 0.0632 energy_100g: 0.0411 sugars_100g: 0.0409 additives_per_energy: 0.0305 fiber_100g: 0.0220 fat_protein_ratio: 0.0200
Save Artifacts¶
Save the trained model, scaler, and train/val/test splits so downstream notebooks (anomaly detection, evaluation) can load them without re-running this pipeline.
# Artifacts directory for this notebook's outputs.
ARTIFACTS_DIR = Path("results/features")

# Delete existing artifacts to avoid stale files, then recreate cleanly.
if ARTIFACTS_DIR.exists():
    shutil.rmtree(ARTIFACTS_DIR)
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

# Save the trained model in XGBoost's native JSON format.
xgb_model.save_model(ARTIFACTS_DIR / "xgb_baseline.json")
print(f"Saved: {ARTIFACTS_DIR / 'xgb_baseline.json'}")

# Save the fitted scaler for the downstream deep-learning notebook.
joblib.dump(scaler, ARTIFACTS_DIR / "standard_scaler.joblib")
print(f"Saved: {ARTIFACTS_DIR / 'standard_scaler.joblib'}")

# Save feature engineering metadata. Fix: derive the feature lists from the
# live NUMERIC_FEATURES / ENGINEERED_RATIOS variables instead of hard-coding
# copies, so the saved metadata cannot drift out of sync with what was
# actually used above.
fe_metadata = {
    "original_features": list(NUMERIC_FEATURES),
    "engineered_features": list(ENGINEERED_RATIOS),
    # Columns dropped during EDA; kept as a static record of that decision.
    "features_removed_in_eda": [
        "sodium_100g", "trans_fat_100g",
        "monounsaturated_fat_100g", "polyunsaturated_fat_100g", "starch_100g"
    ],
}
with open(ARTIFACTS_DIR / "feature_engineering_metadata.json", "w") as f:
    json.dump(fe_metadata, f, indent=2)
print(f"Saved: {ARTIFACTS_DIR / 'feature_engineering_metadata.json'}")

# Save the splits plus bookkeeping needed by downstream notebooks.
splits = {
    "X_train": X_train, "X_val": X_val, "X_test": X_test,
    "y_train": y_train, "y_val": y_val, "y_test": y_test,
    "X_train_scaled": X_train_scaled,
    "X_val_scaled": X_val_scaled,
    "X_test_scaled": X_test_scaled,
    "feature_names": list(X_train.columns),
    "class_weights": class_weights,
    # Maps the 0-indexed model labels back to NOVA groups 1-4.
    "id_mapping": {0: 1, 1: 2, 2: 3, 3: 4}
}
joblib.dump(splits, ARTIFACTS_DIR / "data_splits.joblib")
print(f"Saved: {ARTIFACTS_DIR / 'data_splits.joblib'}")

# Confirm the resolved absolute path.
print(f"\nAll artifacts saved to: {ARTIFACTS_DIR.resolve()}")
Saved: results\features\xgb_baseline.json Saved: results\features\standard_scaler.joblib Saved: results\features\feature_engineering_metadata.json Saved: results\features\data_splits.joblib All artifacts saved to: C:\Source\AIML\aai590_5_capstone_project\results\features
AI Use Disclosure¶
AI assistance tools were used in the following capacities during the development of this project:
Research and Planning: AI tools were used to search for code snippets, explore modeling approaches, and identify applicable machine learning techniques (e.g., NOVA classification strategies, anomaly detection methods, SHAP explainability patterns). These suggestions were reviewed, adapted, and validated by the team before implementation.
Copy Editing and Report Refinement: An AI assistant was used to copy edit written documentation and the final report draft, check for redundancy, and provide feedback on areas that could be tightened up or that required additional clarification. The prompt provided to the tool included context about the project purpose, target audience (academic evaluators for the AAI-590 Capstone), and formatting guidelines.
All AI-generated suggestions were critically reviewed by the team. Final decisions regarding methodology, implementation, and written content remain the work of the authors.