Redback-Operations · SHASHANKVAMP · Sep 24, 2025
@@ -0,0 +1,53 @@
+# Childhood Obesity Classification – Lachesis (Shashank Samyal)
+
+This package contains my individual contribution to the Lachesis Capstone project:
+- **Preprocessing + Ethics** (age banding, BMI, sensitive attribute handling)
+- **Model training** (Random Forest baseline, XGBoost benchmark)
+- **Reproducible outputs** (metrics, confusion matrix, feature importances)
+- **Research** (Geospatial tools for public health; ethics/legality audit)
+
+## 📂 Structure
+```text
+final-upload/
+├─ data/       # official + supporting datasets (see privacy note)
+├─ notebooks/  # preprocessing + XGBoost training
+├─ outputs/    # saved metrics & plots (evidence)
+├─ scripts/    # script versions of the pipeline
+└─ reports/    # PDFs: model report + geospatial research
+```
+## 🛠️ How to run
+1) Install deps:
+   ```bash
+   pip install numpy pandas scikit-learn xgboost matplotlib
+   ```
+2) Open notebooks:
+   ```bash
+   jupyter notebook notebooks/pre_processing.ipynb
+   jupyter notebook notebooks/week9_xgboost.ipynb
+   ```
+
+3) Data path: place `Final_combined_dataset.csv` into `data/` (or update `CSV_PATH` at the top of the notebook).
+
+## 📊 Results (Week 9, XGBoost)
+- Test Accuracy: 0.8585
+
+- CV Mean Accuracy: 0.8665 (±0.0073)
+
+- Classes: 7, Features: 25
+
+Artifacts in `outputs/`: `classification_report.csv`, `confusion_matrix.png`, `feature_importances_top.png`, `summary_metrics.csv`, `label_classes.csv`.
+
+## 🧭 Geospatial Research
+See `reports/Geospatial_Tools_Research.pdf` for a comparison of ArcGIS, GeoPandas, Kepler.gl, and QGIS, with recommendations for Lachesis (analysis vs. visualisation vs. cost/integration).
+
+## 🔐 Privacy & Ethics
+- No direct identifiers used.
+
+- Age anonymised into bands; BMI derived from height/weight.
+
+- See `data/legality_ethics_audit_findings.csv` and `reports/Obesity_Model_Report.pdf`.
+
+## 👤 Author
+Shashank Samyal — ML pipeline & ethics preprocessing (Distinction-level submission).
+
+
@@ -0,0 +1,4 @@
+Category,Columns found
+Direct identifiers,(none)
+Quasi-identifiers,age
+Sensitive fields,"height, weight"
@@ -0,0 +1,11 @@
+,precision,recall,f1-score,support
+0,0.9411764705882353,0.9411764705882353,0.9411764705882353,221.0
+1,0.8439024390243902,0.8963730569948186,0.8693467336683417,193.0
+2,0.9148936170212766,0.9148936170212766,0.9148936170212766,94.0
+3,0.0,0.0,0.0,1.0
+4,0.0,0.0,0.0,0.0
+5,0.6065573770491803,0.578125,0.592,64.0
+6,0.7741935483870968,0.6857142857142857,0.7272727272727273,70.0
+accuracy,0.8584758942457231,0.8584758942457231,0.8584758942457231,0.8584758942457231
+macro avg,0.5829604931528828,0.5737546329026594,0.577812792650083,643.0
+weighted avg,0.8551887888801737,0.8584758942457231,0.8562690676626451,643.0
@@ -0,0 +1,7 @@
+208,13,0,0,0,0,0
+10,173,0,0,0,8,2
+1,0,86,0,0,3,4
+0,0,1,0,0,0,0
+0,0,0,0,0,0,0
+2,16,1,0,0,37,8
+0,3,6,0,0,13,48
@@ -0,0 +1,4 @@
+cv_accuracy
+0.8588098016336057
+0.8763127187864644
+0.8644859813084113
@@ -0,0 +1,26 @@
+,0
+bmi,0.26418245
+weight,0.07700983
+scc,0.07038342
+favc,0.06095557
+gender_male,0.055255312
+ncp,0.054393437
+age,0.03890668
+tue,0.03526925
+caec_always,0.03361133
+caec_frequently,0.03208275
+mtrans_public_transportation,0.029494468
+height,0.02882162
+caec_sometimes,0.028653089
+mtrans_walking,0.027856175
+family_history_with_overweight,0.027728962
+calc_frequently,0.025763191
+ch2o,0.02544201
+fcvc,0.024752429
+faf,0.023884656
+calc_sometimes,0.023301115
+calc_no,0.012252272
+smoke,0.0
+mtrans_bike,0.0
+mtrans_motorbike,0.0
+ageband_0-18,0.0
@@ -0,0 +1,8 @@
+class_names
+0
+1
+2
+3
+4
+5
+6
@@ -0,0 +1,6 @@
+,0
+test_accuracy,0.8585
+cv_mean_accuracy,0.8665
+cv_std,0.0073
+num_features,25.0
+num_classes,7.0
@@ -0,0 +1,42 @@
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import accuracy_score, classification_report
+
+# 1. Load the dataset
+df = pd.read_csv("childhood_obesity.csv")
+
+# 2. Check and clean data
+df = df.dropna()
+
+# 3. Define features (X) and target (y)
+# Change 'Obesity' to the actual column name for the label in your dataset
+X = df.drop("Obesity", axis=1)
+y = df["Obesity"]
+
+# 4. Encode categorical features
+X = pd.get_dummies(X, drop_first=True)
+
+# 5. Split into train and test
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.2, random_state=42
+)
+
+# 6. Train model
+model = RandomForestClassifier(n_estimators=200, random_state=42)
+model.fit(X_train, y_train)
+
+# 7. Predict
+y_pred = model.predict(X_test)
+
+# 8. Metrics
+acc = accuracy_score(y_test, y_pred)
+print(f"Accuracy: {acc*100:.2f}%")
+print("\nClassification Report:\n", classification_report(y_test, y_pred))
+
+# 9. Save predictions for sharing
+output = X_test.copy()
+output["Actual"] = y_test
+output["Predicted"] = y_pred
+output.to_csv("obesity_predictions.csv", index=False)
+print("Predictions saved to obesity_predictions.csv")
@@ -0,0 +1,80 @@
+import pandas as pd
+from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
+from sklearn.preprocessing import OneHotEncoder, LabelEncoder
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import Pipeline
+from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
+from sklearn.ensemble import RandomForestClassifier
+
+# --- load ---
+df = pd.read_csv("obesity.csv")
+
+# --- target & features ---
+TARGET = "NObeyesdad"  # e.g., Normal_Weight, Overweight_Level_I, Obesity_Type_I, ...
+assert TARGET in df.columns, f"{TARGET} not found. Columns: {list(df.columns)}"
+
+# drop rows with missing values (dataset is usually clean)
+df = df.dropna().reset_index(drop=True)
+
+# split X/y
+y_text = df[TARGET].astype(str)
+X = df.drop(columns=[TARGET])
+
+# identify categorical vs numeric
+cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
+num_cols = X.select_dtypes(exclude=["object"]).columns.tolist()
+
+# encode target labels (string -> int) for metrics stability
+y_le = LabelEncoder()
+y = y_le.fit_transform(y_text)
+
+# preprocessors
+pre = ColumnTransformer(
+    transformers=[
+        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
+        ("num", "passthrough", num_cols),
+    ],
+    remainder="drop",
+)
+
+# model
+clf = RandomForestClassifier(
+    n_estimators=400,
+    max_depth=None,
+    random_state=42,
+    n_jobs=-1,
+)
+
+pipe = Pipeline(steps=[("prep", pre), ("model", clf)])
+
+# ---- train/valid split (stratified) ----
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.2, random_state=42, stratify=y
+)
+
+# cross-val accuracy (more credible than single split)
+cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
+cv_scores = cross_val_score(pipe, X_train, y_train, cv=cv, scoring="accuracy", n_jobs=-1)
+print(f"CV Accuracy (mean ± std): {cv_scores.mean():.3f} ± {cv_scores.std():.3f}")
+
+# fit and evaluate on held-out test
+pipe.fit(X_train, y_train)
+y_pred = pipe.predict(X_test)
+acc = accuracy_score(y_test, y_pred)
+print(f"Test Accuracy: {acc:.3f}")
+print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=y_le.classes_))
+print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
+
+# save predictions you can show
+out = X_test.copy()
+out["Actual"] = y_le.inverse_transform(y_test)
+out["Predicted"] = y_le.inverse_transform(y_pred)
+out.to_csv("obesity_predictions.csv", index=False)
+print("\nSaved predictions -> obesity_predictions.csv")
+
+# small tip on interpretation
+topline = (
+    f"CV Acc: {cv_scores.mean():.1%} (±{cv_scores.std():.1%}) | "
+    f"Test Acc: {acc:.1%} | n_test={len(y_test)}"
+)
+print("\nSUMMARY:", topline)