Objective¶
The primary objective of this project is to develop an intelligent nutritional assessment system that predicts the level of food processing from nutritional composition, additive information, and NOVA classification data in the Open Food Facts dataset. The project also aims to support more informed dietary decisions by using machine learning methods to identify foods that appear healthy based on their macronutrient profile but may nonetheless be highly processed.
Exploratory Data Analysis¶
%pip install -q -r requirements.txt
1. Import Libraries and Configuration¶
import warnings
warnings.filterwarnings("ignore")
# force-reload src.eda modules so code changes are always picked up
# (without this, "Run All" reuses stale cached imports)
%load_ext autoreload
%autoreload 2
from IPython.display import display
import subprocess
import os
from src.eda import (
COMPARE_COLS,
DEFAULT_LIGHT_DATASET_PATH,
FULL_DATASET_PATH,
GRADE_ORDER,
NOVA_ORDER,
OpenFoodFactsEDADataLoader,
OpenFoodFactsEDAPlotter,
cap_outliers,
compute_high_correlation_pairs,
compute_kruskal_summary,
impute_with_global_median,
print_dataset_overview,
)
loader = OpenFoodFactsEDADataLoader()
plotter = OpenFoodFactsEDAPlotter()
DATASET_PATH = FULL_DATASET_PATH
# create a sampled ("light") dataset if the configured dataset does not exist;
# the final trial can be run on a larger portion of the dataset, or all of it
if not os.path.exists(DATASET_PATH):
    print("Dataset not found. Creating a sampled dataset...")
    subprocess.run(
        [
            "python",
            "./scripts/create_light_dataset.py",
            "--local",
            "--random",
            "--target-rows",
            "500000",
        ],
        check=True,  # fail loudly if the sampling script errors out
    )
    print("Sampled dataset created.")
print(f"Loading dataset from: {DATASET_PATH}")
Loading dataset from: dataset\en.openfoodfacts.org.products.csv
The light dataset was used because it provides a smaller, more manageable subset of the full Open Food Facts data, making data cleaning, exploratory analysis, and model development more efficient. It also reduces computational overhead while preserving the key nutritional and processing-related features needed to identify patterns in NOVA classification.
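The sampling itself is done by `scripts/create_light_dataset.py` (not shown in this notebook). A chunked random-sampling pass over the raw TSV, roughly in the spirit of that script, might look like the sketch below; the function name and details here are illustrative, not the actual script:

```python
import pandas as pd

def sample_light_dataset(src_path, dst_path, target_rows=500_000,
                         chunksize=100_000, sep="\t", seed=42):
    """Randomly sample ~target_rows rows from a large delimited file
    without loading it into memory all at once (illustrative sketch)."""
    # first pass: count data rows so we can derive a sampling fraction
    with open(src_path, encoding="utf-8", errors="replace") as f:
        total = sum(1 for _ in f) - 1  # minus header line
    frac = min(1.0, target_rows / max(total, 1))
    # second pass: sample each chunk at that fraction and append to the output
    first = True
    kept = 0
    for chunk in pd.read_csv(src_path, sep=sep, chunksize=chunksize,
                             low_memory=False, dtype=str):
        sampled = chunk.sample(frac=frac, random_state=seed)
        sampled.to_csv(dst_path, sep=sep, index=False,
                       mode="w" if first else "a", header=first)
        first = False
        kept += len(sampled)
    return kept
```

Per-chunk sampling keeps memory bounded while yielding approximately uniform coverage of the file.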
2. Load and Inspect Dataset¶
dataset = loader.load(DATASET_PATH)
df = dataset.df
NUTRIENT_COLS = dataset.nutrient_cols
META_COLS = dataset.meta_cols
# drop ultra-sparse features
drop_cols = [
"trans-fat_100g",
"monounsaturated-fat_100g",
"polyunsaturated-fat_100g",
"starch_100g"
]
df = df.drop(columns=drop_cols, errors="ignore")
# update nutrient columns FIRST
NUTRIENT_COLS = [c for c in NUTRIENT_COLS if c not in drop_cols]
# remove invalid negative nutritional values
for col in NUTRIENT_COLS:
    df = df[df[col].isna() | (df[col] >= 0)]
print_dataset_overview(dataset)
df.head(3)
Detected delimiter: TAB Dataset path: dataset\en.openfoodfacts.org.products.csv Shape: (718492, 28) Loaded columns (28): ['added-sugars_100g', 'additives_n', 'additives_tags', 'brands', 'carbohydrates_100g', 'categories_en', 'code', 'countries_en', 'energy_100g', 'fat_100g', 'fiber_100g', 'ingredients_analysis_tags', 'ingredients_text', 'monounsaturated-fat_100g', 'nova_group', 'nutriscore_score', 'nutrition_grade_fr', 'pnns_groups_1', 'pnns_groups_2', 'polyunsaturated-fat_100g', 'product_name', 'proteins_100g', 'salt_100g', 'saturated-fat_100g', 'sodium_100g', 'starch_100g', 'sugars_100g', 'trans-fat_100g'] Nutrient columns used in EDA (14): ['energy_100g', 'fat_100g', 'saturated-fat_100g', 'carbohydrates_100g', 'sugars_100g', 'fiber_100g', 'proteins_100g', 'salt_100g', 'sodium_100g', 'trans-fat_100g', 'added-sugars_100g', 'monounsaturated-fat_100g', 'polyunsaturated-fat_100g', 'starch_100g'] Nutri-Score distribution: nutrition_grade_fr a 58473 b 47361 c 95796 d 103079 e 107571 <NA> 306212 Name: count, dtype: Int64 NOVA distribution: nova_group 1 37036 2 13573 3 62473 4 194012 <NA> 411398 Name: count, dtype: Int64
| code | product_name | brands | categories_en | countries_en | ingredients_text | ingredients_analysis_tags | additives_n | additives_tags | nutriscore_score | nutrition_grade_fr | nova_group | pnns_groups_1 | pnns_groups_2 | energy_100g | fat_100g | saturated-fat_100g | carbohydrates_100g | sugars_100g | added-sugars_100g | fiber_100g | proteins_100g | salt_100g | sodium_100g | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 798 | 0000100000724 | Ben's Pure Maple Cream | Ben's Sugar Shack, Jeff de Bruges | NaN | France,World | NaN | NaN | NaN | NaN | NaN | <NA> | <NA> | unknown | unknown | 2221.960 | 30.280 | 19.300 | 59.120 | 37.720 | NaN | NaN | 5.680 | 0.270 | 0.108 |
| 841 | 00001001 | pasta | Grappa | Beverages and beverages preparations,Beverages | France,Germany | NaN | NaN | NaN | NaN | 5.000 | c | <NA> | unknown | unknown | 697.100 | 6.400 | 1.300 | 20.000 | 1.900 | NaN | 0.800 | 6.700 | 0.001 | 0.000 |
| 877 | 0000101019680 | Donut Milka | Milka | Snacks,Sweet snacks,Biscuits and cakes,Cakes,D... | France | NaN | NaN | NaN | NaN | 23.000 | e | <NA> | Sugary snacks | Biscuits and cakes | 1928.500 | 28.000 | 13.800 | 46.500 | 18.000 | NaN | NaN | 6.000 | 0.649 | 0.260 |
# verify that the sparse features dropped in the previous block
# are no longer present
print("Updated nutrient columns:", NUTRIENT_COLS)
Updated nutrient columns: ['energy_100g', 'fat_100g', 'saturated-fat_100g', 'carbohydrates_100g', 'sugars_100g', 'fiber_100g', 'proteins_100g', 'salt_100g', 'sodium_100g', 'added-sugars_100g']
df.info(memory_usage="deep")
<class 'pandas.core.frame.DataFrame'> Index: 718325 entries, 798 to 4436578 Data columns (total 24 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 code 718325 non-null object 1 product_name 708294 non-null object 2 brands 531381 non-null object 3 categories_en 472779 non-null object 4 countries_en 717894 non-null object 5 ingredients_text 337385 non-null object 6 ingredients_analysis_tags 346320 non-null object 7 additives_n 337386 non-null float64 8 additives_tags 183678 non-null object 9 nutriscore_score 412156 non-null float64 10 nutrition_grade_fr 412156 non-null string 11 nova_group 306932 non-null Int64 12 pnns_groups_1 718325 non-null object 13 pnns_groups_2 718325 non-null object 14 energy_100g 695312 non-null float64 15 fat_100g 689555 non-null float64 16 saturated-fat_100g 673682 non-null float64 17 carbohydrates_100g 689364 non-null float64 18 sugars_100g 677249 non-null float64 19 added-sugars_100g 347432 non-null float64 20 fiber_100g 310560 non-null float64 21 proteins_100g 690356 non-null float64 22 salt_100g 664787 non-null float64 23 sodium_100g 664787 non-null float64 dtypes: Int64(1), float64(12), object(10), string(1) memory usage: 664.5 MB
df[NUTRIENT_COLS].describe().T.round(2)
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| energy_100g | 695312.000 | 2510.650 | 1158989.640 | 0.000 | 439.000 | 1053.600 | 1648.000 | 966426848.380 |
| fat_100g | 689555.000 | 38.830 | 21139.360 | 0.000 | 1.000 | 6.800 | 20.900 | 17554003.530 |
| saturated-fat_100g | 673682.000 | 6.050 | 718.550 | 0.000 | 0.200 | 1.900 | 7.000 | 588000.000 |
| carbohydrates_100g | 689364.000 | 48.390 | 17439.640 | 0.000 | 3.300 | 14.100 | 51.800 | 14479774.250 |
| sugars_100g | 677249.000 | 14778.730 | 12151385.900 | 0.000 | 0.720 | 3.800 | 16.670 | 10000000000.000 |
| fiber_100g | 310560.000 | 35.350 | 17944.360 | 0.000 | 0.000 | 1.500 | 3.640 | 10000000.000 |
| proteins_100g | 690356.000 | 1463.630 | 1203558.460 | 0.000 | 2.000 | 6.300 | 12.500 | 1000000000.000 |
| salt_100g | 664787.000 | 332.290 | 269236.980 | 0.000 | 0.070 | 0.490 | 1.280 | 219520780.940 |
| sodium_100g | 664787.000 | 132.920 | 107694.790 | 0.000 | 0.030 | 0.190 | 0.510 | 87808312.380 |
| added-sugars_100g | 347432.000 | 287834.800 | 169654386.010 | 0.000 | 0.000 | 0.000 | 7.400 | 100000000000.000 |
# cap extreme outliers (clip to the 1st–99th percentile);
# as the describe() output above shows, several nutrients have extreme outliers
df_capped, out_df = cap_outliers(df, NUTRIENT_COLS)
df = df_capped.copy()
df[NUTRIENT_COLS].describe().T.round(2)
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| energy_100g | 695312.000 | 1107.910 | 757.790 | 0.000 | 439.000 | 1053.600 | 1648.000 | 3389.180 |
| fat_100g | 689555.000 | 13.210 | 16.650 | 0.000 | 1.000 | 6.800 | 20.900 | 91.500 |
| saturated-fat_100g | 673682.000 | 4.890 | 6.610 | 0.000 | 0.200 | 1.900 | 7.000 | 28.400 |
| carbohydrates_100g | 689364.000 | 27.210 | 27.280 | 0.000 | 3.300 | 14.100 | 51.800 | 93.000 |
| sugars_100g | 677249.000 | 12.980 | 18.870 | 0.000 | 0.720 | 3.800 | 16.670 | 80.000 |
| fiber_100g | 310560.000 | 2.920 | 4.290 | 0.000 | 0.000 | 1.500 | 3.640 | 25.700 |
| proteins_100g | 690356.000 | 8.800 | 9.070 | 0.000 | 2.000 | 6.300 | 12.500 | 50.000 |
| salt_100g | 664787.000 | 0.990 | 1.680 | 0.000 | 0.070 | 0.490 | 1.280 | 12.000 |
| sodium_100g | 664787.000 | 0.400 | 0.670 | 0.000 | 0.030 | 0.190 | 0.510 | 4.800 |
| added-sugars_100g | 347432.000 | 8.420 | 17.260 | 0.000 | 0.000 | 0.000 | 7.400 | 82.750 |
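`cap_outliers` is provided by `src.eda`; as a rough sketch of what percentile capping does (assuming a simple 1st/99th-percentile clip, which matches the comment above — the actual implementation may differ):

```python
import pandas as pd

def cap_outliers_sketch(df, cols, lower=0.01, upper=0.99):
    """Clip each column to its percentile bounds and report how many values moved.
    Illustrative re-implementation, not the src.eda helper."""
    capped = df.copy()
    rows = []
    for col in cols:
        lo, hi = df[col].quantile([lower, upper])
        n_out = int(((df[col] < lo) | (df[col] > hi)).sum())
        capped[col] = df[col].clip(lo, hi)
        rows.append({
            "feature": col,
            "outliers": n_out,
            "outlier_pct": round(100 * n_out / df[col].notna().sum(), 3),
        })
    return capped, pd.DataFrame(rows)
```

Clipping (rather than dropping) keeps every row, which matters here because many rows already have missing values elsewhere.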
3. Data Quality Assessment¶
plotter.plot_missingness_overview(df)
Duplicate rows : 0 Columns >50% missing : 7 Columns >80% missing : 0
plotter.plot_missingno_matrix(df, NUTRIENT_COLS)
The dataset shows moderate overall data quality, with most columns having 1–50% missingness, while a few key fields such as additives_tags and nova_group have much higher missing values and may need imputation or exclusion. Overall, the core nutritional variables appear relatively complete, making the dataset usable for modeling after targeted cleaning and preprocessing.
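The missingness figures above come from `plot_missingness_overview`; the underlying computation is essentially the following (a sketch, not the `src.eda` implementation):

```python
import pandas as pd

def missingness_report(df, threshold=0.5):
    """Per-column missing fraction, plus the columns above a given threshold."""
    miss = df.isna().mean().sort_values(ascending=False)
    flagged = miss[miss > threshold].index.tolist()
    return miss, flagged
```

Columns returned in `flagged` (here, those more than 50% missing) are the candidates for imputation or exclusion discussed above.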
4. Target Variable Analysis: Nutritional Quality and Processing Level¶
plotter.plot_nutriscore_overview(df)
Nutri-Score coverage: 57.4% of loaded rows
plotter.plot_nova_overview(df)
plotter.plot_category_overview(df)
NOVA coverage: 42.7% of loaded rows
plotter.plot_nova_nutriscore_heatmap(df)
plotter.plot_nova_nutriscore_stacked_share(df)
The target variable analysis shows that the dataset captures both nutritional quality and food processing level in a meaningful way. Nutri-Score grades are fairly distributed, with C being the most common, followed by D, while A and E also appear in substantial proportions, indicating a good mix of healthier and less healthy products. In contrast, the NOVA classification is strongly dominated by NOVA 4, showing that most products in the dataset are ultra-processed foods, while minimally processed items are much less common. The category distribution, led by cereals, potatoes, dairy, snacks, and beverages, further reflects the packaged-food nature of the dataset. The heatmap and stacked bar chart reveal an important pattern: NOVA 1 foods are mostly concentrated in Nutri-Score A, whereas NOVA 4 products dominate the lower nutritional grades C, D, and E, although they also appear in some A and B products. This suggests that while better nutritional scores are often associated with lower processing, nutritional quality and processing level are not identical concepts, which makes them valuable complementary target variables for food assessment and machine learning prediction.
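The heatmap and stacked-share data behind these observations can be reproduced with a row-normalised crosstab; this sketch (not the `src.eda` plotting code) shows the share of each Nutri-Score grade within each NOVA group:

```python
import pandas as pd

def nova_nutriscore_share(df):
    """Row-normalised NOVA x Nutri-Score crosstab:
    share of each Nutri-Score grade within each NOVA group."""
    ct = pd.crosstab(df["nova_group"], df["nutrition_grade_fr"],
                     normalize="index")
    return ct.round(3)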
5. Nutritional Feature Distributions¶
plotter.plot_nutrient_distributions(df, NUTRIENT_COLS)
plotter.plot_nutrients_by_group(
df,
NUTRIENT_COLS,
group_col="nutrition_grade_fr",
order=GRADE_ORDER,
palette=plotter.grade_palette,
title="Nutrient Distributions by Nutri-Score Grade",
)
plotter.plot_nutrients_by_group(
df,
NUTRIENT_COLS,
group_col="nova_group",
order=NOVA_ORDER,
palette=plotter.nova_palette,
title="Nutrient Distributions by NOVA Group",
)
The nutritional feature distributions show that most nutrient variables are right-skewed, with many products clustered at lower values and a smaller number exhibiting very high amounts, especially for fat, saturated fat, sugars, salt, and sodium. Across Nutri-Score grades, poorer grades generally correspond to higher median levels of energy, fat, saturated fat, sugars, salt, and sodium, while better grades tend to show relatively higher fiber and more moderate nutrient profiles. A similar trend appears across NOVA groups, where more processed foods, particularly NOVA 3 and NOVA 4, display wider variability and higher concentrations of less desirable nutrients, reinforcing the link between processing intensity and nutritional imbalance.
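The right-skew noted above can also be quantified directly rather than read off the histograms; a small helper (illustrative, not part of `src.eda`) computes Fisher skewness per nutrient, where strongly positive values indicate long right tails:

```python
import pandas as pd

def nutrient_skewness(df, cols):
    """Fisher skewness per nutrient column; values well above 0
    confirm a long right tail (right-skew)."""
    return df[cols].skew(numeric_only=True).sort_values(ascending=False)
```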
6. Correlation and Multicollinearity Analysis¶
Correlation analysis revealed strong relationships between certain features, particularly between salt and sodium, indicating potential multicollinearity. These relationships are important to consider during feature selection to avoid redundant information in the modeling stage.
nutrient_data = df[NUTRIENT_COLS].dropna(thresh=int(0.7 * len(NUTRIENT_COLS)))
pearson, spearman, high_corr_df = compute_high_correlation_pairs(
nutrient_data,
NUTRIENT_COLS,
)
plotter.plot_correlation_matrices(pearson, spearman, NUTRIENT_COLS)
if high_corr_df.empty:
    print("No pairs exceed |r| = 0.85.")
else:
    display(high_corr_df)
| Feature A | Feature B | Pearson r | |
|---|---|---|---|
| 0 | salt_100g | sodium_100g | 0.992 |
The correlation analysis shows a near-perfect positive relationship between salt and sodium (Pearson r ≈ 0.99), the only pair exceeding the |r| = 0.85 threshold; the correlation matrices also reveal weaker but notable associations among sugars, fiber, and proteins. Such strong correlations indicate substantial overlap in the information captured by these variables and hence potential multicollinearity, which can distort coefficient-based models and reduce interpretability. Therefore, feature selection, correlation filtering, or dimensionality-reduction techniques should be considered before model training.
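`compute_high_correlation_pairs` comes from `src.eda`; the pair-extraction step it performs can be sketched as follows (an illustrative re-implementation, not the actual helper):

```python
import pandas as pd

def high_correlation_pairs(df, cols, threshold=0.85):
    """Return the unique feature pairs whose |Pearson r| exceeds the threshold."""
    corr = df[cols].corr(method="pearson")
    pairs = []
    for i, a in enumerate(cols):
        for b in cols[i + 1:]:  # upper triangle only, so each pair appears once
            r = corr.loc[a, b]
            if pd.notna(r) and abs(r) > threshold:
                pairs.append({"Feature A": a, "Feature B": b,
                              "Pearson r": round(r, 3)})
    return pd.DataFrame(pairs)
```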
7. Additive and Ingredient Analysis¶
plotter.plot_additives_overview(df)
plotter.plot_top_additives(df)
The additive analysis shows that most products contain few or no additives, but the distribution is strongly right-skewed: a small subset of products contains a much larger number of additives. The average additive count rises steadily from Nutri-Score A to E and peaks sharply in NOVA 4, indicating that additive use is closely associated with poorer nutritional quality and higher processing intensity. The most frequent additives, such as e330, e322, e500, and e471, appear widely across products, suggesting that certain stabilizers, acidity regulators, and emulsifiers are common markers of industrial food formulation.
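The top-additive counts can be derived from `additives_tags` directly; assuming the tags are stored as comma-separated strings (e.g. `en:e330,en:e322`, the usual Open Food Facts convention), a sketch of the computation is:

```python
import pandas as pd

def top_additives(df, n=10):
    """Count the most frequent additive tags, assuming additives_tags
    holds comma-separated tag strings (illustrative sketch)."""
    tags = (
        df["additives_tags"]
        .dropna()
        .str.split(",")
        .explode()     # one row per individual tag
        .str.strip()
    )
    return tags.value_counts().head(n)
```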
8. Feature Relationships with Nutritional Quality and Processing Level¶
plotter.plot_nutrients_by_group(
df,
NUTRIENT_COLS,
group_col="nutrition_grade_fr",
order=GRADE_ORDER,
palette=plotter.grade_palette,
title="Nutrient Distributions (Violin) by Nutri-Score Grade",
chart="violin",
)
kw_grade_df = compute_kruskal_summary(
df,
NUTRIENT_COLS,
group_col="nutrition_grade_fr",
group_order=GRADE_ORDER,
)
plotter.plot_kruskal_summary(
kw_grade_df,
"Statistical Separation of Nutrients across Nutri-Score Grades\n(red = p < 0.05)",
)
display(kw_grade_df)
| feature | H-statistic | p-value | |
|---|---|---|---|
| 0 | energy_100g | 102520.300 | 0.000 |
| 2 | saturated-fat_100g | 92870.400 | 0.000 |
| 1 | fat_100g | 78510.800 | 0.000 |
| 4 | sugars_100g | 57979.000 | 0.000 |
| 7 | salt_100g | 56948.800 | 0.000 |
| 8 | sodium_100g | 56669.200 | 0.000 |
| 9 | added-sugars_100g | 51389.400 | 0.000 |
| 3 | carbohydrates_100g | 24156.500 | 0.000 |
| 6 | proteins_100g | 17459.400 | 0.000 |
| 5 | fiber_100g | 7495.500 | 0.000 |
kw_nova_df = compute_kruskal_summary(
df,
NUTRIENT_COLS,
group_col="nova_group",
group_order=NOVA_ORDER,
)
plotter.plot_kruskal_summary(
kw_nova_df,
"Statistical Separation of Nutrients across NOVA Groups\n(red = p < 0.05)",
)
display(kw_nova_df)
| feature | H-statistic | p-value | |
|---|---|---|---|
| 9 | added-sugars_100g | 60660.400 | 0.000 |
| 7 | salt_100g | 54803.800 | 0.000 |
| 8 | sodium_100g | 54797.900 | 0.000 |
| 6 | proteins_100g | 28022.400 | 0.000 |
| 4 | sugars_100g | 23004.500 | 0.000 |
| 0 | energy_100g | 17792.000 | 0.000 |
| 1 | fat_100g | 16116.800 | 0.000 |
| 2 | saturated-fat_100g | 14651.600 | 0.000 |
| 3 | carbohydrates_100g | 14500.500 | 0.000 |
| 5 | fiber_100g | 9242.100 | 0.000 |
Energy, fat, saturated fat, sugars, salt, and sodium rise as Nutri-Score worsens, while fiber is relatively higher in better grades. Across NOVA groups, salt, sodium, and sugars show the strongest separation, suggesting processing level is strongly linked to industrial formulation. This indicates that these nutrients are likely to be the most useful predictors for both nutritional quality and processing level. It also reinforces that Nutri-Score and NOVA capture overlapping but not identical aspects of food healthfulness.
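`compute_kruskal_summary` lives in `src.eda`; conceptually it runs a Kruskal–Wallis H test per nutrient across the groups, which can be sketched with `scipy.stats.kruskal` (an illustrative re-implementation, not the actual helper):

```python
import pandas as pd
from scipy.stats import kruskal

def kruskal_summary(df, cols, group_col):
    """Kruskal-Wallis H test of each column across the groups of group_col;
    a larger H means stronger separation between group distributions."""
    rows = []
    for col in cols:
        groups = [
            g.dropna().values
            for _, g in df.groupby(group_col, observed=True)[col]
            if g.notna().sum() > 0
        ]
        if len(groups) < 2:
            continue  # the test needs at least two non-empty groups
        h, p = kruskal(*groups)
        rows.append({"feature": col, "H-statistic": round(h, 1), "p-value": p})
    return pd.DataFrame(rows).sort_values("H-statistic", ascending=False)
```

Because the nutrient distributions are heavily skewed, this rank-based test is a safer choice than ANOVA here.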
9. Outlier Detection¶
df_capped, out_df = cap_outliers(df, NUTRIENT_COLS)
display(out_df)
plotter.plot_outlier_boxplots(df, df_capped, NUTRIENT_COLS)
| feature | outliers | outlier_pct | |
|---|---|---|---|
| 0 | energy_100g | 0 | 0.000 |
| 1 | fat_100g | 0 | 0.000 |
| 2 | saturated-fat_100g | 0 | 0.000 |
| 3 | carbohydrates_100g | 0 | 0.000 |
| 4 | sugars_100g | 0 | 0.000 |
| 5 | fiber_100g | 0 | 0.000 |
| 6 | proteins_100g | 0 | 0.000 |
| 7 | salt_100g | 0 | 0.000 |
| 8 | sodium_100g | 0 | 0.000 |
| 9 | added-sugars_100g | 0 | 0.000 |
Because percentile-based capping was already applied in Section 2, this second pass detects no remaining outliers, confirming that the earlier clipping removed the extreme values. Before capping, the raw data contained implausible extremes across several nutritional features, most visibly in energy, sugars, proteins, salt, sodium, and added sugars. Since such values are relatively rare, percentile-based capping preserves the overall distribution while reducing the influence of anomalous or potentially erroneous observations on model training.
10. Geographic and Category Distribution¶
plotter.plot_geo_category_distribution(df)
The geographic distribution shows that the dataset is heavily concentrated in France and the United States, followed by a smaller contribution from several European countries, indicating some regional imbalance in product representation. At the category level, many products fall into unknown, cereals and potatoes, and milk and dairy products, suggesting that packaged staple foods dominate the dataset and that category completeness may need further cleaning for more precise analysis.
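As the sample rows earlier show (e.g. `France,World`), `countries_en` can hold several comma-separated countries per product, so per-country counts need an explode step; a sketch (not the `src.eda` plotting code):

```python
import pandas as pd

def country_counts(df, n=10):
    """Products per country; countries_en may hold comma-separated lists,
    so a multi-country product counts once toward each listed country."""
    return (
        df["countries_en"]
        .dropna()
        .str.split(",")
        .explode()
        .str.strip()
        .value_counts()
        .head(n)
    )
```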
11. Missing Data Imputation Strategy¶
df_imp = df.copy()
for col in NUTRIENT_COLS:
    df_imp[col] = df_imp.groupby("pnns_groups_1")[col].transform(
        lambda x: x.fillna(x.median())
    )
print("Imputation Summary")
plotter.plot_imputation_comparison(df_capped, df_imp, COMPARE_COLS)
Imputation Summary
Missing numerical values in the core nutritional features were imputed with group-wise medians: each nutrient's median is computed within its pnns_groups_1 food category and used to fill gaps in that category, which respects category-level differences better than a single global median. Imputation was applied after outlier capping so that the imputed values are not influenced by extreme observations, preserving the central tendency of each nutrient's distribution.
12. Data Cleaning¶
To address multicollinearity identified during EDA, redundant features were removed during the data cleaning stage. In particular, sodium_100g was dropped because it is deterministically derived from salt_100g (salt = sodium × 2.5), and retaining both would introduce perfect multicollinearity. Removing such features helps improve model stability and reduces redundancy in the feature space.
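The claimed deterministic relation can be verified empirically before dropping the column; this small check (illustrative, not part of the cleaning pipeline) reports the share of rows where salt is within a tolerance of 2.5 × sodium:

```python
import numpy as np
import pandas as pd

def check_salt_sodium_ratio(df, tol=0.05):
    """Fraction of rows where salt_100g is approximately 2.5 x sodium_100g
    (within tol relative error); values near 1.0 confirm the derivation."""
    both = df[["salt_100g", "sodium_100g"]].dropna()
    both = both[both["sodium_100g"] > 0]  # avoid division by zero
    ratio = both["salt_100g"] / both["sodium_100g"]
    return float(np.isclose(ratio, 2.5, rtol=tol).mean())
```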
print(f"Rows before cleaning : {len(df_imp):,}")
df_clean = df_imp.copy()  # final dataset after cleaning + imputation
# ensure no remaining missing values in features
df_clean = df_clean.dropna(subset=NUTRIENT_COLS)
# drop rows where the target variable (nova_group) is missing.
# rows without a NOVA label cannot contribute to supervised classification.
n_before = len(df_clean)
df_clean = df_clean.dropna(subset=["nova_group"])
n_dropped_nova = n_before - len(df_clean)
print(f"Rows dropped (missing nova_group) : {n_dropped_nova:,}")
# remove exact duplicate rows to prevent data leakage and inflated metrics.
n_before = len(df_clean)
df_clean = df_clean.drop_duplicates()
n_dropped_dupes = n_before - len(df_clean)
print(f"Rows dropped (exact duplicates) : {n_dropped_dupes:,}")
# remove duplicate products by barcode, keeping the first occurrence.
# duplicate barcodes indicate the same product scanned multiple times.
if "code" in df_clean.columns:
    n_before = len(df_clean)
    df_clean = df_clean.drop_duplicates(subset=["code"], keep="first")
    n_dropped_code = n_before - len(df_clean)
    print(f"Rows dropped (duplicate barcodes) : {n_dropped_code:,}")
# drop sodium_100g — it is deterministically derived from salt_100g
# (salt = sodium × 2.5), so retaining both introduces perfect multicollinearity.
if "sodium_100g" in df_clean.columns:
    df_clean = df_clean.drop(columns=["sodium_100g"])
    # update NUTRIENT_COLS as well
    NUTRIENT_COLS = [c for c in NUTRIENT_COLS if c != "sodium_100g"]
    print("Column dropped (redundant): sodium_100g")
print(f"\nRows after cleaning : {len(df_clean):,}")
print(f"Total rows removed : {len(df_imp) - len(df_clean):,}")
print(f"Columns remaining : {df_clean.shape[1]}")
print("\nNOVA class distribution after cleaning:")
display(
df_clean["nova_group"]
.value_counts()
.sort_index()
.rename("count")
.to_frame()
)
Rows before cleaning : 718,325 Rows dropped (missing nova_group) : 411,392 Rows dropped (exact duplicates) : 0 Rows dropped (duplicate barcodes) : 0 Column dropped (redundant): sodium_100g Rows after cleaning : 306,932 Total rows removed : 411,393 Columns remaining : 23 NOVA class distribution after cleaning:
| count | |
|---|---|
| nova_group | |
| 1 | 37036 |
| 2 | 13573 |
| 3 | 62463 |
| 4 | 193860 |
# verify NaN counts to confirm no missing values remain
df_clean[NUTRIENT_COLS].isna().sum()
energy_100g 0 fat_100g 0 saturated-fat_100g 0 carbohydrates_100g 0 sugars_100g 0 fiber_100g 0 proteins_100g 0 salt_100g 0 added-sugars_100g 0 dtype: int64
13. Save and Clean-Up¶
# ensure target is correct type
df_clean["nova_group"] = df_clean["nova_group"].astype(int)
# create (if not already exists) directory for the processed data
os.makedirs("dataset/processed", exist_ok=True)
# verify final dataset shape and columns
print("Final dataset shape:", df_clean.shape)
print("Columns:", sorted(df_clean.columns.tolist()))
# rename hyphenated columns to underscores for compatibility
df_clean = df_clean.rename(columns={
"saturated-fat_100g": "saturated_fat_100g",
"trans-fat_100g": "trans_fat_100g",
"added-sugars_100g": "added_sugars_100g",
"monounsaturated-fat_100g": "monounsaturated_fat_100g",
"polyunsaturated-fat_100g": "polyunsaturated_fat_100g",
})
# barcodes are identifiers, not numbers — store as string to avoid PyArrow overflow
if "code" in df_clean.columns:
    df_clean["code"] = df_clean["code"].astype(str)
# save cleaned dataset
output_path = "dataset/processed/open_food_facts_cleaned.parquet"
df_clean.to_parquet(output_path, index=False)
# confirmation
print(f"\nSaved to: {output_path}")
print(f"Parquet columns ({len(df_clean.columns)}): {sorted(df_clean.columns.tolist())}")
Final dataset shape: (306932, 23) Columns: ['added-sugars_100g', 'additives_n', 'additives_tags', 'brands', 'carbohydrates_100g', 'categories_en', 'code', 'countries_en', 'energy_100g', 'fat_100g', 'fiber_100g', 'ingredients_analysis_tags', 'ingredients_text', 'nova_group', 'nutriscore_score', 'nutrition_grade_fr', 'pnns_groups_1', 'pnns_groups_2', 'product_name', 'proteins_100g', 'salt_100g', 'saturated-fat_100g', 'sugars_100g'] Saved to: dataset/processed/open_food_facts_cleaned.parquet Parquet columns (23): ['added_sugars_100g', 'additives_n', 'additives_tags', 'brands', 'carbohydrates_100g', 'categories_en', 'code', 'countries_en', 'energy_100g', 'fat_100g', 'fiber_100g', 'ingredients_analysis_tags', 'ingredients_text', 'nova_group', 'nutriscore_score', 'nutrition_grade_fr', 'pnns_groups_1', 'pnns_groups_2', 'product_name', 'proteins_100g', 'salt_100g', 'saturated_fat_100g', 'sugars_100g']
Rows missing the target variable nova_group were removed first, as they cannot contribute to supervised model training. Exact duplicate rows and barcode-level duplicate product entries were then eliminated to prevent data leakage and inflated metrics. Finally, sodium_100g was dropped as a redundant feature since it is deterministically derived from salt_100g (salt = sodium × 2.5), and retaining both would introduce perfect multicollinearity into the feature set. The resulting cleaned dataframe df_clean is the final output and will be passed to downstream modeling notebooks.
Exploratory Data Analysis Summary¶
The dataset is usable for modeling after preprocessing, with most variables showing low to moderate missingness, while a few fields such as additives and NOVA-related columns have substantially higher missing values.
Nutri-Score is fairly distributed across classes, with C being the most common grade, providing a reasonable target balance for nutritional quality prediction.
NOVA group is highly imbalanced toward NOVA 4, indicating that the dataset is dominated by ultra-processed foods.
Nutritional variables such as fat, saturated fat, sugars, salt, and sodium are generally right-skewed, with many low-to-moderate values and a small number of extreme observations.
Poorer Nutri-Score grades tend to have higher energy, fat, saturated fat, sugars, salt, and sodium, while better grades are relatively associated with higher fiber.
Across NOVA groups, the strongest separation is observed for salt, sodium, and sugars, showing that processing level is closely tied to industrial formulation.
Correlation analysis reveals strong multicollinearity among some nutrient features, especially salt–sodium and several other highly correlated nutrient pairs, suggesting the need for feature selection or dimensionality reduction.
Outlier detection shows that extreme values are present but relatively rare, and percentile-based capping helps reduce their impact without heavily distorting the data.
Missing nutritional values were handled using global median imputation, which preserved the overall distributions while filling gaps robustly in skewed variables.
Additive analysis shows that most products contain few additives, but additive counts increase with worse Nutri-Score and are highest in NOVA 4, reinforcing the connection between additives and ultra-processing.
The dataset is geographically concentrated in France and the United States, with additional representation from several European countries, which may introduce regional bias.
Product categories are dominated by unknown, cereals and potatoes, and milk and dairy products, indicating both strong packaged-food representation and some category-label incompleteness.
Overall, the EDA suggests that the dataset contains strong predictive signals for both nutritional quality and processing level, but it also requires careful handling of missing data, class imbalance, outliers, and multicollinearity before model development.
AI Use Disclosure¶
AI assistance tools were used in the following capacities during the development of this project:
Research and Planning: AI tools were used to search for code snippets, explore modeling approaches, and identify applicable machine learning techniques (e.g., NOVA classification strategies, anomaly detection methods, SHAP explainability patterns). These suggestions were reviewed, adapted, and validated by the team before implementation.
Copy Editing and Report Refinement: An AI assistant was used to copy edit written documentation and the final report draft, check for redundancy, and provide feedback on areas that could be tightened up or that required additional clarification. The prompt provided to the tool included context about the project purpose, target audience (academic evaluators for the AAI-590 Capstone), and formatting guidelines.
All AI-generated suggestions were critically reviewed by the team. Final decisions regarding methodology, implementation, and written content remain the work of the authors.