From 1c092549458a44a3fc87034a05b543b616eade63 Mon Sep 17 00:00:00 2001
From: 26497571+bit-scientist <26497571+bit-scientist@users.noreply.github.com>
Date: Sun, 3 Aug 2025 18:52:03 +0900
Subject: [PATCH] Fix forecast plot CI display and improve plotting robustness

---
 Chapter01/forecasting/forecasting_example.py | 155 ++++++++++---------
 1 file changed, 84 insertions(+), 71 deletions(-)

diff --git a/Chapter01/forecasting/forecasting_example.py b/Chapter01/forecasting/forecasting_example.py
index 8c64851..538239f 100644
--- a/Chapter01/forecasting/forecasting_example.py
+++ b/Chapter01/forecasting/forecasting_example.py
@@ -35,17 +35,19 @@ def plot_store_data(df: pd.DataFrame) -> None:
 
         
 def train_predict(
-    df: pd.DataFrame, 
-    train_fraction: float, 
+    df: pd.DataFrame,
+    train_fraction: float,
     seasonality: dict
 ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, int]:
+
+    df['ds'] = pd.to_datetime(df['ds'])
     
     # grab split data
     train_index = int(train_fraction*df.shape[0])
     df_train = df.copy().iloc[0:train_index]
     df_test = df.copy().iloc[train_index:]
 
-    #create Prophet model
+    # create Prophet model
     model=Prophet(
         yearly_seasonality=seasonality['yearly'],
         weekly_seasonality=seasonality['weekly'],
@@ -64,86 +66,97 @@ def print_debug_info(df, name):
     print(df.head())
     print(df.dtypes)
     print(df.isna().sum())
-    
+
 def plot_forecast(df_train: pd.DataFrame, df_test: pd.DataFrame, predicted: pd.DataFrame) -> None:
-    # Ensure data types are consistent
-    # df_train = df_train.apply(pd.to_numeric, errors='coerce')
-    # df_test = df_test.apply(pd.to_numeric, errors='coerce')
-    # predicted = predicted.apply(pd.to_numeric, errors='coerce')
-
-    # Drop or handle NaN values
-    df_train = df_train.dropna()
-    df_test = df_test.dropna()
-    predicted = predicted.dropna()
-    
-    # Print debug information
-    # print_debug_info(df_train, "df_train")
-    # print_debug_info(df_test, "df_test")
-    # print_debug_info(predicted, "predicted")
-    
-    fig, ax = plt.subplots(figsize=(20,10))
+    # Ensure correct dtypes on local copies; the caller's frames stay untouched
+    df_train = df_train.assign(
+        ds=pd.to_datetime(df_train['ds']),
+        y=pd.to_numeric(df_train['y'], errors='coerce'),
+    ).dropna(subset=['ds', 'y'])
+
+    df_test = df_test.assign(
+        ds=pd.to_datetime(df_test['ds']),
+        y=pd.to_numeric(df_test['y'], errors='coerce'),
+    ).dropna(subset=['ds', 'y'])
+
+    interval_cols = ['yhat', 'yhat_upper', 'yhat_lower']
+    predicted = predicted.assign(
+        ds=pd.to_datetime(predicted['ds']),
+        **{col: pd.to_numeric(predicted[col], errors='coerce') for col in interval_cols},
+    ).dropna(subset=['ds', *interval_cols])
+
+    fig, ax = plt.subplots(figsize=(20, 10))
+
+    # Plot truth (test)
     df_test.plot(
-        x='ds', 
-        y='y', 
-        ax=ax, 
-        label='Truth', 
-        linewidth=1, 
-        markersize=5, 
+        x='ds',
+        y='y',
+        ax=ax,
+        label='Truth',
+        linewidth=1,
+        markersize=5,
         color='tab:blue',
-        alpha=0.9, 
+        alpha=0.9,
         marker='o'
     )
+
+    # Plot prediction line
     predicted.plot(
-        x='ds', 
-        y='yhat', 
-        ax=ax, 
-        label='Prediction + 95% CI', 
-        linewidth=2, 
-        markersize=5, 
+        x='ds',
+        y='yhat',
+        ax=ax,
+        label='Prediction + 95% CI',
+        linewidth=2,
         color='red'
     )
-    
-    try:
-        # Print lengths and first few values to debug
-        # print(f"Length of predicted['ds']: {len(predicted['ds'])}")
-        # print(f"Length of predicted['yhat_upper']: {len(predicted['yhat_upper'])}")
-        # print(f"Length of predicted['yhat_lower']: {len(predicted['yhat_lower'])}")
-        # print(f"First few values of predicted['ds']: {predicted['ds'].head()}")
-        # print(f"First few values of predicted['yhat_upper']: {predicted['yhat_upper'].head()}")
-        # print(f"First few values of predicted['yhat_lower']: {predicted['yhat_lower'].head()}")
-        ax.fill_between(
-            x=predicted['ds'], 
-            y1=predicted['yhat_upper'], 
-            y2=predicted['yhat_lower'], 
-            alpha=0.15, 
-            color='red',
-        )
-    except Exception as e:
-        print(f"Error in fill_between: {e}")
-        
-    df_train.iloc[train_index-100:].plot(
-        x='ds', 
-        y='y', 
-        ax=ax, 
-        color='tab:blue', 
-        label='_nolegend_', 
-        alpha=0.5, 
+
+    # Shade the prediction interval, skipping rows that cannot be drawn
+    ds = pd.to_datetime(predicted['ds'])
+    y_upper = pd.to_numeric(predicted['yhat_upper'], errors='coerce')
+    y_lower = pd.to_numeric(predicted['yhat_lower'], errors='coerce')
+
+    # Finite-value mask built with pandas alone (no numpy alias needed in scope):
+    # NaT fails notna(), and NaN/inf fail the `abs() < inf` comparison.
+    mask = ds.notna() & y_upper.abs().lt(float('inf')) & y_lower.abs().lt(float('inf'))
+    if not mask.any():
+        raise ValueError("No valid data points for fill_between after cleaning.")
+
+    ax.fill_between(
+        x=ds[mask].values,
+        y1=y_upper[mask].values,
+        y2=y_lower[mask].values,
+        alpha=0.15,
+        color='red'
+    )
+
+    # Plot last 100 training points
+    df_train_tail = df_train.tail(100)
+    df_train_tail.plot(
+        x='ds',
+        y='y',
+        ax=ax,
+        color='tab:blue',
+        label='_nolegend_',
+        alpha=0.5,
         marker='o'
     )
-    current_ytick_values = plt.gca().get_yticks()
-    plt.gca().set_yticklabels(['{:,.0f}'.format(x) for x in current_ytick_values])
+
+    # Format labels
     ax.set_xlabel('Date')
     ax.set_ylabel('Sales')
-    plt.tight_layout()
-    plt.savefig('store_data_forecast.png')
+    # A formatter (not frozen label strings) keeps labels tied to the tick values
+    ax.yaxis.set_major_formatter('{x:,.0f}')
 
+    plt.tight_layout()
+    plt.savefig('store_data_forecast.png', dpi=150)
+    plt.close()
 
 
 
 if __name__ == "__main__":
     import os
-    
-    # If data present, read it in, otherwise, download it 
+
+    # If data present, read it in, otherwise, download it
     file_path = './train.csv'
     if os.path.exists(file_path):
         logging.info('Dataset found, reading into pandas dataframe.')
@@ -152,30 +165,30 @@ def plot_forecast(df_train: pd.DataFrame, df_test: pd.DataFrame, predicted: pd.D
         logging.info('Dataset not found, downloading ...')
         download_kaggle_dataset()
         logging.info('Reading dataset into pandas dataframe.')
-        df = pd.read_csv(file_path)   
-    
+        df = pd.read_csv(file_path)
+
     # Transform dataset in preparation for feeding to Prophet
     df = prep_store_data(df)
-    
+
     # Define main parameters for modelling
     seasonality = {
         'yearly': True,
         'weekly': True,
         'daily': False
     }
-    
+
     # Calculate the relevant dataframes
     predicted, df_train, df_test, train_index = train_predict(
         df = df,
         train_fraction = 0.8,
         seasonality=seasonality
     )
-    
+
     # Debugging
     # print(df_train.dtypes)
     # print(df_test.dtypes)
     # print(predicted.dtypes)
-    
+
     # Plot the forecast
     plot_forecast(df_train, df_test, predicted)