diff --git a/CHANGELOG.md b/CHANGELOG.md index 5e05409..151fdcf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +# [0.5.4] - 2025-01-05 +### Changed +* fixed streamlit crash +* added toml for install +* fixed accounted for nan in monthly aggregates + # [0.5.1] - 2023-02-18 ### Changed * created download_utils diff --git a/noaaplotter/scripts/download_data.py b/noaaplotter/scripts/download_data.py index 2c9b7ea..d0778f3 100644 --- a/noaaplotter/scripts/download_data.py +++ b/noaaplotter/scripts/download_data.py @@ -19,8 +19,8 @@ def main(): dest="output_file", type=str, required=True, - default="data/data.csv", - help="csv file to save results", + default="data/parquet.csv", + help="parquet file to save results", ) parser.add_argument( diff --git a/noaaplotter/scripts/plot_daily.py b/noaaplotter/scripts/plot_daily.py index b4dafd2..4387902 100644 --- a/noaaplotter/scripts/plot_daily.py +++ b/noaaplotter/scripts/plot_daily.py @@ -12,7 +12,7 @@ def main(): parser = argparse.ArgumentParser(description='Parse arguments.') parser.add_argument('-infile', dest='infile', type=str, required=True, - default='data/temp.csv', + default='data/temp.parquet', help='input file with climate data') parser.add_argument('-t', dest='token', type=str, required=False, diff --git a/noaaplotter/utils/dataset.py b/noaaplotter/utils/dataset.py index aea6154..141c80e 100755 --- a/noaaplotter/utils/dataset.py +++ b/noaaplotter/utils/dataset.py @@ -8,20 +8,23 @@ # version: 2020-12-09 ######################## -import numpy as np import os +import time + +import numpy as np +import polars as pl + from .utils import * -numeric_only = True + +NUMERIC_ONLY = True + class NOAAPlotterDailySummariesDataset(object): """ This class/module creates nice plots of observed weather data from NOAA """ - def __init__(self, - input_filepath=None, - location=None, - remove_feb29=False): + def __init__(self, input_filepath=None, location=None, remove_feb29=False): self.input_switch = None self.input_filepath = input_filepath self.location = location @@ -30,9 +33,9 @@ def __init__(self, self.remove_feb29 = remove_feb29 self.data = None self._check_data_loading() - if self.input_switch == 'file': + if self.input_switch == "file": self._load_file() - elif self.input_switch == 'noaa_api': + elif self.input_switch == "noaa_api": self._load_noaa() self._validate_location() self._update_datatypes() @@ -45,7 +48,7 @@ def print_locations(self): """ Print all locations names """ - print(self.data['NAME'].unique()) + print(self.data["NAME"].unique()) def _check_data_loading(self): """ @@ -54,18 +57,23 @@ def _check_data_loading(self): * input_filepath """ if os.path.exists(self.input_filepath): - self.input_switch = 'file' + self.input_switch = "file" elif self.noaa_token and self.noaa_location: - self.input_switch = 'noaa_api' + self.input_switch = "noaa_api" else: - raise ImportError("Please enter either correct file path or noaa station_id and API token") + raise ImportError( + "Please enter either correct file path or noaa station_id and API token" + ) def _load_file(self): """ load csv file into Pandas DataFrame :return: """ - self.data = pd.read_csv(self.input_filepath) + data = pl.read_parquet(self.input_filepath).to_pandas() + if "__index_level_0__" in data.columns: + data = data.drop(columns=["__index_level_0__"]) + self.data = data def _load_noaa(self): """ @@ -83,34 +91,37 @@ def _validate_location(self): raise error and message if location name cannot be found :return: """ - if not self.location and len(pd.unique(self.data['NAME']) == 1): + if not self.location and len(pd.unique(self.data["NAME"]) == 1): pass - elif not self.location and len(pd.unique(self.data['NAME']) > 1): + elif not self.location and len(pd.unique(self.data["NAME"]) > 1): raise ValueError( - 'There is more than one location in the dataset. Please choose a location using the -loc option! ' - 'Valid Location identifiers: {0} ' - .format(self.data['NAME'].unique())) + "There is more than one location in the dataset. Please choose a location using the -loc option! " + "Valid Location identifiers: {0} ".format(self.data["NAME"].unique()) + ) else: - filt = self.data['NAME'].str.lower().str.contains(self.location.lower()) + filt = self.data["NAME"].str.lower().str.contains(self.location.lower()) if filt.sum() == 0: - raise ValueError('Location Name is not valid! Valid Location identifiers: {0}' - .format(self.data['NAME'].unique())) + raise ValueError( + "Location Name is not valid! Valid Location identifiers: {0}".format( + self.data["NAME"].unique() + ) + ) def _update_datatypes(self): """ define 'DATE' as datetime :return: """ - self.data['DATE'] = pd.to_datetime(self.data['DATE']) + self.data["DATE"] = pd.to_datetime(self.data["DATE"]) def _get_datestring(self): """ write specific date formats :return: """ - self.data['DATE_MD'] = self.data['DATE'].dt.strftime('%m-%d') - self.data['DATE_YM'] = self.data['DATE'].dt.strftime('%Y-%m') - self.data['DATE_M'] = self.data['DATE'].dt.strftime('%m') + self.data["DATE_MD"] = self.data["DATE"].dt.strftime("%m-%d") + self.data["DATE_YM"] = self.data["DATE"].dt.strftime("%Y-%m") + self.data["DATE_M"] = self.data["DATE"].dt.strftime("%m") def _get_tmean(self): """ @@ -118,7 +129,7 @@ def _get_tmean(self): :return: """ # TODO: check for cases where TMIN and TMAX are empty (e.g. Schonefeld). There TAVG is the main field - self.data['TMEAN'] = self.data[['TMIN', 'TMAX']].mean(axis=1) + self.data["TMEAN"] = self.data[["TMIN", "TMAX"]].mean(axis=1) def _remove_feb29(self): """ @@ -126,7 +137,7 @@ def _remove_feb29(self): :return: """ if self.remove_feb29: - self.data = self.data[self.data['DATE_MD'] != '02-29'] + self.data = self.data[self.data["DATE_MD"] != "02-29"] def _filter_to_location(self): """ @@ -134,18 +145,20 @@ def _filter_to_location(self): :return: """ if self.location: - filt = self.data['NAME'].str.lower().str.contains(self.location.lower()) + filt = self.data["NAME"].str.lower().str.contains(self.location.lower()) if len(filt) > 0: self.data = self.data.loc[filt] else: - raise ValueError('Location Name is not valid') + raise ValueError("Location Name is not valid") def filter_to_climate(self, climate_start, climate_end): """ Function to create filtered dataset covering the defined climate normal period :return: """ - df_clim = self.data[(self.data['DATE'] >= climate_start) & (self.data['DATE'] <= climate_end)] + df_clim = self.data[ + (self.data["DATE"] >= climate_start) & (self.data["DATE"] <= climate_end) + ] return df_clim @staticmethod @@ -157,15 +170,55 @@ def get_monthly_stats(df): :return: """ df_out = pd.DataFrame() - df_out['tmean_doy_mean'] = df[['DATE', 'TMEAN']].groupby(df['DATE_YM']).mean(numeric_only=numeric_only).TMEAN - df_out['tmean_doy_std'] = df[['DATE', 'TMEAN']].groupby(df['DATE_YM']).std(numeric_only=numeric_only).TMEAN - df_out['tmax_doy_max'] = df[['DATE', 'TMAX']].groupby(df['DATE_YM']).max(numeric_only=numeric_only).TMAX - df_out['tmax_doy_std'] = df[['DATE', 'TMAX']].groupby(df['DATE_YM']).std(numeric_only=numeric_only).TMAX - df_out['tmin_doy_min'] = df[['DATE', 'TMIN']].groupby(df['DATE_YM']).min(numeric_only=numeric_only).TMIN - df_out['tmin_doy_std'] = df[['DATE', 'TMIN']].groupby(df['DATE_YM']).std(numeric_only=numeric_only).TMIN - if 'SNOW' in df.columns: - df_out['snow_doy_mean'] = df[['DATE', 'SNOW']].groupby(df['DATE_YM']).mean(numeric_only=numeric_only).SNOW - df_out['prcp_sum'] = df[['DATE', 'PRCP']].groupby(df['DATE_YM']).sum(numeric_only=numeric_only).PRCP + df_out["tmean_doy_mean"] = ( + df[["DATE", "TMEAN"]] + .groupby(df["DATE_YM"]) + .mean(numeric_only=NUMERIC_ONLY) + .TMEAN + ) + df_out["tmean_doy_std"] = ( + df[["DATE", "TMEAN"]] + .groupby(df["DATE_YM"]) + .std(numeric_only=NUMERIC_ONLY) + .TMEAN + ) + df_out["tmax_doy_max"] = ( + df[["DATE", "TMAX"]] + .groupby(df["DATE_YM"]) + .max(numeric_only=NUMERIC_ONLY) + .TMAX + ) + df_out["tmax_doy_std"] = ( + df[["DATE", "TMAX"]] + .groupby(df["DATE_YM"]) + .std(numeric_only=NUMERIC_ONLY) + .TMAX + ) + df_out["tmin_doy_min"] = ( + df[["DATE", "TMIN"]] + .groupby(df["DATE_YM"]) + .min(numeric_only=NUMERIC_ONLY) + .TMIN + ) + df_out["tmin_doy_std"] = ( + df[["DATE", "TMIN"]] + .groupby(df["DATE_YM"]) + .std(numeric_only=NUMERIC_ONLY) + .TMIN + ) + if "SNOW" in df.columns: + df_out["snow_doy_mean"] = ( + df[["DATE", "SNOW"]] + .groupby(df["DATE_YM"]) + .mean(numeric_only=NUMERIC_ONLY) + .SNOW + ) + df_out["prcp_sum"] = ( + df[["DATE", "PRCP"]] + .groupby(df["DATE_YM"]) + .sum(numeric_only=NUMERIC_ONLY) + .PRCP + ) return df_out @staticmethod @@ -176,23 +229,75 @@ def get_monthy_climate(df): """ df_out = pd.DataFrame() df = df.data - df['Month'] = df.reset_index().apply(lambda x: int(x['DATE_MD'][:2]), axis=1).values - df_out['tmean_mean'] = df[['Month', 'TMEAN']].groupby(df['Month']).mean(numeric_only=numeric_only).TMEAN - df_out['tmean_std'] = df[['Month', 'TMEAN']].groupby(df['Month']).std(numeric_only=numeric_only).TMEAN - df_out['tmax_max'] = df[['Month', 'TMAX']].groupby(df['Month']).max(numeric_only=numeric_only).TMAX - df_out['tmax_std'] = df[['Month', 'TMAX']].groupby(df['Month']).std(numeric_only=numeric_only).TMAX - df_out['tmin_min'] = df[['Month', 'TMIN']].groupby(df['Month']).min(numeric_only=numeric_only).TMIN - df_out['tmin_std'] = df[['Month', 'TMIN']].groupby(df['Month']).std(numeric_only=numeric_only).TMIN - if 'SNOW' in df.columns: - df_out['snow_mean'] = df[['Month', 'SNOW']].groupby(df['Month']).mean(numeric_only=numeric_only).SNOW - unique_years = len(np.unique(df.apply(lambda x: parse_dates_YM(x['DATE_YM']).year, axis=1))) - df_out['prcp_mean'] = df[['Month', 'PRCP']].groupby(df['Month']).mean(numeric_only=numeric_only).PRCP * unique_years + df["Month"] = ( + df.reset_index().apply(lambda x: int(x["DATE_MD"][:2]), axis=1).values + ) + df_out["tmean_mean"] = ( + df[["Month", "TMEAN"]] + .groupby(df["Month"]) + .mean(numeric_only=NUMERIC_ONLY) + .TMEAN + ) + df_out["tmean_std"] = ( + df[["Month", "TMEAN"]] + .groupby(df["Month"]) + .std(numeric_only=NUMERIC_ONLY) + .TMEAN + ) + df_out["tmax_max"] = ( + df[["Month", "TMAX"]] + .groupby(df["Month"]) + .max(numeric_only=NUMERIC_ONLY) + .TMAX + ) + df_out["tmax_std"] = ( + df[["Month", "TMAX"]] + .groupby(df["Month"]) + .std(numeric_only=NUMERIC_ONLY) + .TMAX + ) + df_out["tmin_min"] = ( + df[["Month", "TMIN"]] + .groupby(df["Month"]) + .min(numeric_only=NUMERIC_ONLY) + .TMIN + ) + df_out["tmin_std"] = ( + df[["Month", "TMIN"]] + .groupby(df["Month"]) + .std(numeric_only=NUMERIC_ONLY) + .TMIN + ) + if "SNOW" in df.columns: + df_out["snow_mean"] = ( + df[["Month", "SNOW"]] + .groupby(df["Month"]) + .mean(numeric_only=NUMERIC_ONLY) + .SNOW + ) + unique_years = len( + np.unique(df.apply(lambda x: parse_dates_YM(x["DATE_YM"]).year, axis=1)) + ) + df_out["prcp_mean"] = ( + df[["Month", "PRCP"]] + .groupby(df["Month"]) + .mean(numeric_only=NUMERIC_ONLY) + .PRCP + * unique_years + ) return df_out.reset_index(drop=False) class NOAAPlotterDailyClimateDataset(object): # TODO: make main class sub subclasses for daily/monthly - def __init__(self, daily_dataset, start='1981-01-01', end='2010-12-31', filtersize=7, impute_feb29=True): + def __init__( + self, + daily_dataset, + start="1981-01-01", + end="2010-12-31", + filtersize=7, + impute_feb29=True, + ): """ :param start: :param end: @@ -217,24 +322,30 @@ def __init__(self, daily_dataset, start='1981-01-01', end='2010-12-31', filtersi # mean imputation for 29 February self._impute_feb29() # filter if desired + start_time = time.time() self._run_filter() + # self._run_filter_polars() + end_time = time.time() + print(f"_run_filter took {end_time - start_time:.2f} seconds to run.") # make completeness report def _validate_date_range(self): - if self.daily_dataset.data['DATE'].max() >= self.end: - if self.daily_dataset.data['DATE'].min() <= self.end: + if self.daily_dataset.data["DATE"].max() >= self.end: + if self.daily_dataset.data["DATE"].min() <= self.end: self.date_range_valid = True else: - raise ('Dataset is insufficient to calculate climate normals!') + raise ("Dataset is insufficient to calculate climate normals!") def _filter_to_climate(self): """ calculate climate dataset :return: """ - df_clim = self.daily_dataset.data[(self.daily_dataset.data['DATE'] >= self.start) & - (self.daily_dataset.data['DATE'] <= self.end)] - df_clim = df_clim[(df_clim['DATE_MD'] != '02-29')] + df_clim = self.daily_dataset.data[ + (self.daily_dataset.data["DATE"] >= self.start) + & (self.daily_dataset.data["DATE"] <= self.end) + ] + df_clim = df_clim[(df_clim["DATE_MD"] != "02-29")] self.data_daily = df_clim def _calculate_climate_statistics(self): @@ -245,16 +356,61 @@ def _calculate_climate_statistics(self): :return: """ df_out = pd.DataFrame() - df_out['tmean_doy_mean'] = self.data_daily[['DATE', 'TMEAN']].groupby(self.data_daily['DATE_MD']).mean(numeric_only=numeric_only).TMEAN - df_out['tmean_doy_std'] = self.data_daily[['DATE', 'TMEAN']].groupby(self.data_daily['DATE_MD']).std().TMEAN - df_out['tmean_doy_max'] = self.data_daily[['DATE', 'TMEAN']].groupby(self.data_daily['DATE_MD']).max(numeric_only=numeric_only).TMEAN - df_out['tmean_doy_min'] = self.data_daily[['DATE', 'TMEAN']].groupby(self.data_daily['DATE_MD']).min(numeric_only=numeric_only).TMEAN - df_out['tmax_doy_max'] = self.data_daily[['DATE', 'TMAX']].groupby(self.data_daily['DATE_MD']).max(numeric_only=numeric_only).TMAX - df_out['tmax_doy_std'] = self.data_daily[['DATE', 'TMAX']].groupby(self.data_daily['DATE_MD']).std().TMAX - df_out['tmin_doy_min'] = self.data_daily[['DATE', 'TMIN']].groupby(self.data_daily['DATE_MD']).min(numeric_only=numeric_only).TMIN - df_out['tmin_doy_std'] = self.data_daily[['DATE', 'TMIN']].groupby(self.data_daily['DATE_MD']).std().TMIN - if 'SNOW' in self.data_daily.columns: - df_out['snow_doy_mean'] = self.data_daily[['DATE', 'SNOW']].groupby(self.data_daily['DATE_MD']).mean(numeric_only=numeric_only).SNOW + df_out["tmean_doy_mean"] = ( + self.data_daily[["DATE", "TMEAN"]] + .groupby(self.data_daily["DATE_MD"]) + .mean(numeric_only=NUMERIC_ONLY) + .TMEAN + ) + df_out["tmean_doy_std"] = ( + self.data_daily[["DATE", "TMEAN"]] + .groupby(self.data_daily["DATE_MD"]) + .std() + .TMEAN + ) + df_out["tmean_doy_max"] = ( + self.data_daily[["DATE", "TMEAN"]] + .groupby(self.data_daily["DATE_MD"]) + .max(numeric_only=NUMERIC_ONLY) + .TMEAN + ) + df_out["tmean_doy_min"] = ( + self.data_daily[["DATE", "TMEAN"]] + .groupby(self.data_daily["DATE_MD"]) + .min(numeric_only=NUMERIC_ONLY) + .TMEAN + ) + df_out["tmax_doy_max"] = ( + self.data_daily[["DATE", "TMAX"]] + .groupby(self.data_daily["DATE_MD"]) + .max(numeric_only=NUMERIC_ONLY) + .TMAX + ) + df_out["tmax_doy_std"] = ( + self.data_daily[["DATE", "TMAX"]] + .groupby(self.data_daily["DATE_MD"]) + .std() + .TMAX + ) + df_out["tmin_doy_min"] = ( + self.data_daily[["DATE", "TMIN"]] + .groupby(self.data_daily["DATE_MD"]) + .min(numeric_only=NUMERIC_ONLY) + .TMIN + ) + df_out["tmin_doy_std"] = ( + self.data_daily[["DATE", "TMIN"]] + .groupby(self.data_daily["DATE_MD"]) + .std() + .TMIN + ) + if "SNOW" in self.data_daily.columns: + df_out["snow_doy_mean"] = ( + self.data_daily[["DATE", "SNOW"]] + .groupby(self.data_daily["DATE_MD"]) + .mean(numeric_only=NUMERIC_ONLY) + .SNOW + ) self.data = df_out def _impute_feb29(self): @@ -263,7 +419,7 @@ def _impute_feb29(self): :return: """ if self.impute_feb29: - self.data.loc['02-29'] = self.data.loc['02-28':'03-01'].mean(axis=0) + self.data.loc["02-29"] = self.data.loc["02-28":"03-01"].mean(axis=0) self.data.sort_index(inplace=True) def _run_filter(self): @@ -272,10 +428,47 @@ def _run_filter(self): :return: """ if self.filtersize % 2 != 0: - data_roll = pd.concat([self.data.iloc[-self.filtersize:], - self.data, - self.data[:self.filtersize]]).rolling(self.filtersize).mean() - self.data = data_roll[self.filtersize: -self.filtersize] + data_roll = ( + pd.concat( + [ + self.data.iloc[-self.filtersize :], + self.data, + self.data[: self.filtersize], + ] + ) + .rolling(self.filtersize) + .mean() + ) + self.data = data_roll[self.filtersize : -self.filtersize] + + # TODO: produces different reults than pandas version + def _run_filter_polars(self): + """ + Function to run rolling mean filter on climate series to smooth out short fluctuations using Polars + """ + if self.filtersize % 2 != 0: + # Convert pandas DataFrame to Polars DataFrame + idx = self.data.index + df = pl.from_pandas(self.data) + + # Prepare data for rolling operation + extended_df = pl.concat( + [df.tail(self.filtersize), df, df.head(self.filtersize)] + ) + + # Apply rolling mean + rolled = extended_df.select( + [pl.all().rolling_mean(window_size=self.filtersize)] + ) + + # Slice the result to match original data size + result = rolled.slice(self.filtersize, len(df)).to_pandas() + result.index = idx + + # Convert back to pandas DataFrame and update self.data + self.data = result + else: + raise ValueError("Filter size must be odd") def _make_report(self): """ @@ -287,7 +480,9 @@ def _make_report(self): class NOAAPlotterMonthlyClimateDataset(object): - def __init__(self, daily_dataset, start='1981-01-01', end='2010-12-31', impute_feb29=True): + def __init__( + self, daily_dataset, start="1981-01-01", end="2010-12-31", impute_feb29=True + ): self.daily_dataset = daily_dataset self.monthly_aggregate = None self.start = parse_dates(start) @@ -296,20 +491,22 @@ def __init__(self, daily_dataset, start='1981-01-01', end='2010-12-31', impute_f self._validate_date_range() def _validate_date_range(self): - if self.daily_dataset.data['DATE'].max() >= self.end: - if self.daily_dataset.data['DATE'].min() <= self.end: + if self.daily_dataset.data["DATE"].max() >= self.end: + if self.daily_dataset.data["DATE"].min() <= self.end: self.date_range_valid = True else: - raise ('Dataset is insufficient to calculate climate normals!') + raise ("Dataset is insufficient to calculate climate normals!") def _filter_to_climate(self): """ calculate climate dataset :return: """ - df_clim = self.daily_dataset.data[(self.daily_dataset.data['DATE'] >= self.start) & - (self.daily_dataset.data['DATE'] <= self.end)] - df_clim = df_clim[(df_clim['DATE_MD'] != '02-29')] + df_clim = self.daily_dataset.data[ + (self.daily_dataset.data["DATE"] >= self.start) + & (self.daily_dataset.data["DATE"] <= self.end) + ] + df_clim = df_clim[(df_clim["DATE_MD"] != "02-29")] self.data_daily = df_clim def filter_to_date(self): @@ -317,9 +514,11 @@ def filter_to_date(self): calculate climate dataset :return: """ - df_clim = self.daily_dataset.data[(self.daily_dataset.data['DATE'] >= self.start) & - (self.daily_dataset.data['DATE'] <= self.end)] - df_clim = df_clim[(df_clim['DATE_MD'] != '02-29')] + df_clim = self.daily_dataset.data[ + (self.daily_dataset.data["DATE"] >= self.start) + & (self.daily_dataset.data["DATE"] <= self.end) + ] + df_clim = df_clim[(df_clim["DATE_MD"] != "02-29")] return df_clim def _impute_feb29(self): @@ -337,15 +536,55 @@ def calculate_monthly_statistics(self): df_out = pd.DataFrame() data_filtered = self.filter_to_date() - df_out['tmean_doy_mean'] = data_filtered[['DATE', 'TMEAN']].groupby(data_filtered['DATE_YM']).mean(numeric_only=numeric_only).TMEAN - df_out['tmean_doy_std'] = data_filtered[['DATE', 'TMEAN']].groupby(data_filtered['DATE_YM']).std(numeric_only=numeric_only).TMEAN - df_out['tmax_doy_max'] = data_filtered[['DATE', 'TMAX']].groupby(data_filtered['DATE_YM']).max(numeric_only=numeric_only).TMAX - df_out['tmax_doy_std'] = data_filtered[['DATE', 'TMAX']].groupby(data_filtered['DATE_YM']).std(numeric_only=numeric_only).TMAX - df_out['tmin_doy_min'] = data_filtered[['DATE', 'TMIN']].groupby(data_filtered['DATE_YM']).min(numeric_only=numeric_only).TMIN - df_out['tmin_doy_std'] = data_filtered[['DATE', 'TMIN']].groupby(data_filtered['DATE_YM']).std(numeric_only=numeric_only).TMIN - if 'SNOW' in data_filtered.columns: - df_out['snow_doy_mean'] = data_filtered[['DATE', 'SNOW']].groupby(data_filtered['DATE_YM']).mean(numeric_only=numeric_only).SNOW - df_out['prcp_sum'] = data_filtered[['DATE', 'PRCP']].groupby(data_filtered['DATE_YM']).sum(numeric_only=numeric_only).PRCP + df_out["tmean_doy_mean"] = ( + data_filtered[["TMEAN"]] + .groupby(data_filtered["DATE_YM"]) + .agg(lambda x: x.mean() if x.notna().any() else np.nan) + .TMEAN + ) + df_out["tmean_doy_std"] = ( + data_filtered[["TMEAN"]] + .groupby(data_filtered["DATE_YM"]) + .agg(lambda x: x.std() if x.notna().any() else np.nan) + .TMEAN + ) + df_out["tmax_doy_max"] = ( + data_filtered[["TMAX"]] + .groupby(data_filtered["DATE_YM"]) + .agg(lambda x: x.max() if x.notna().any() else np.nan) + .TMAX + ) + df_out["tmax_doy_std"] = ( + data_filtered[["TMAX"]] + .groupby(data_filtered["DATE_YM"]) + .agg(lambda x: x.std() if x.notna().any() else np.nan) + .TMAX + ) + df_out["tmin_doy_min"] = ( + data_filtered[["TMIN"]] + .groupby(data_filtered["DATE_YM"]) + .agg(lambda x: x.min() if x.notna().any() else np.nan) + .TMIN + ) + df_out["tmin_doy_std"] = ( + data_filtered[["TMIN"]] + .groupby(data_filtered["DATE_YM"]) + .agg(lambda x: x.std() if x.notna().any() else np.nan) + .TMIN + ) + if "SNOW" in data_filtered.columns: + df_out["snow_doy_mean"] = ( + data_filtered[["SNOW"]] + .groupby(data_filtered["DATE_YM"]) + .agg(lambda x: x.mean() if x.notna().any() else np.nan) + .SNOW + ) + df_out["prcp_sum"] = ( + data_filtered[["PRCP"]] + .groupby(data_filtered["DATE_YM"]) + .agg(lambda x: x.sum() if x.notna().any() else np.nan) + .PRCP + ) self.monthly_aggregate = df_out def calculate_monthly_climate(self): @@ -356,19 +595,66 @@ def calculate_monthly_climate(self): df_out = pd.DataFrame() data_filtered = self.filter_to_date() - data_filtered['DATE'] = data_filtered.apply(lambda x: parse_dates_YM(x['DATE_YM']), axis=1) - data_filtered['Month'] = data_filtered.apply(lambda x: parse_dates_YM(x['DATE_YM']).month, axis=1) - data_filtered['Year'] = data_filtered.apply(lambda x: parse_dates_YM(x['DATE_YM']).year, axis=1) - - df_out['tmean_doy_mean'] = data_filtered[['DATE', 'TMEAN']].groupby(data_filtered['Month']).mean(numeric_only=numeric_only).TMEAN - df_out['tmean_doy_std'] = data_filtered[['DATE', 'TMEAN']].groupby(data_filtered['Month']).std(numeric_only=numeric_only).TMEAN - df_out['tmax_doy_max'] = data_filtered[['DATE', 'TMAX']].groupby(data_filtered['Month']).max(numeric_only=numeric_only).TMAX - df_out['tmax_doy_std'] = data_filtered[['DATE', 'TMAX']].groupby(data_filtered['Month']).std(numeric_only=numeric_only).TMAX - df_out['tmin_doy_min'] = data_filtered[['DATE', 'TMIN']].groupby(data_filtered['Month']).min(numeric_only=numeric_only).TMIN - df_out['tmin_doy_std'] = data_filtered[['DATE', 'TMIN']].groupby(data_filtered['Month']).std(numeric_only=numeric_only).TMIN - if 'SNOW' in data_filtered.columns: - df_out['snow_doy_mean'] = data_filtered[['DATE', 'SNOW']].groupby(data_filtered['Month']).mean(numeric_only=numeric_only).SNOW - df_out['prcp_sum'] = data_filtered[['DATE', 'PRCP']].groupby(data_filtered['Month']).mean(numeric_only=numeric_only).PRCP * 30 + data_filtered["DATE"] = data_filtered.apply( + lambda x: parse_dates_YM(x["DATE_YM"]), axis=1 + ) + data_filtered["Month"] = data_filtered.apply( + lambda x: parse_dates_YM(x["DATE_YM"]).month, axis=1 + ) + data_filtered["Year"] = data_filtered.apply( + lambda x: parse_dates_YM(x["DATE_YM"]).year, axis=1 + ) + + df_out["tmean_doy_mean"] = ( + data_filtered[["DATE", "TMEAN"]] + .groupby(data_filtered["Month"]) + .mean(numeric_only=NUMERIC_ONLY) + .TMEAN + ) + df_out["tmean_doy_std"] = ( + data_filtered[["DATE", "TMEAN"]] + .groupby(data_filtered["Month"]) + .std(numeric_only=NUMERIC_ONLY) + .TMEAN + ) + df_out["tmax_doy_max"] = ( + data_filtered[["DATE", "TMAX"]] + .groupby(data_filtered["Month"]) + .max(numeric_only=NUMERIC_ONLY) + .TMAX + ) + df_out["tmax_doy_std"] = ( + data_filtered[["DATE", "TMAX"]] + .groupby(data_filtered["Month"]) + .std(numeric_only=NUMERIC_ONLY) + .TMAX + ) + df_out["tmin_doy_min"] = ( + data_filtered[["DATE", "TMIN"]] + .groupby(data_filtered["Month"]) + .min(numeric_only=NUMERIC_ONLY) + .TMIN + ) + df_out["tmin_doy_std"] = ( + data_filtered[["DATE", "TMIN"]] + .groupby(data_filtered["Month"]) + .std(numeric_only=NUMERIC_ONLY) + .TMIN + ) + if "SNOW" in data_filtered.columns: + df_out["snow_doy_mean"] = ( + data_filtered[["DATE", "SNOW"]] + .groupby(data_filtered["Month"]) + .mean(numeric_only=NUMERIC_ONLY) + .SNOW + ) + df_out["prcp_sum"] = ( + data_filtered[["DATE", "PRCP"]] + .groupby(data_filtered["Month"]) + .mean(numeric_only=NUMERIC_ONLY) + .PRCP + * 30 + ) # df_out = df_out.set_index('DATE_YM', drop=False) self.monthly_climate = df_out @@ -379,4 +665,4 @@ def _make_report(self): """ # input climate series (e.g. 1981-01-01 - 2010-12-31) - pass \ No newline at end of file + pass diff --git a/noaaplotter/utils/download_utils.py b/noaaplotter/utils/download_utils.py index 39f8e98..3591850 100755 --- a/noaaplotter/utils/download_utils.py +++ b/noaaplotter/utils/download_utils.py @@ -1,4 +1,3 @@ -import csv import datetime as dt import json import os @@ -8,6 +7,7 @@ import geemap import numpy as np import pandas as pd +import polars as pl import requests import tqdm from joblib import Parallel, delayed @@ -15,6 +15,7 @@ from noaaplotter.utils.utils import assign_numeric_datatypes +# move some logic outside def download_from_noaa( output_file, start_date, @@ -25,33 +26,61 @@ def download_from_noaa( noaa_api_token, n_jobs=4, ): - # remove file if exists + # Check if file exists and load it if os.path.exists(output_file): - os.remove(output_file) - # Make query string - dtypes_string = "&".join([f"datatypeid={dt}" for dt in datatypes]) - # convert datestring to dt + existing_df = pl.read_parquet(output_file).drop_nulls(subset='STATION') + existing_dates = set(existing_df['DATE'].to_list()) + else: + existing_df = None + existing_dates = set() + + # Convert datestrings to datetime dt_start = datetime.strptime(start_date, "%Y-%m-%d") dt_end = datetime.strptime(end_date, "%Y-%m-%d") - # calculate number of days - n_days = (dt_end - dt_start).days - # calculate number of splits to fit into 1000 lines/rows - split_size = np.floor(1000 / len(datatypes)) - # calculate splits - split_range = np.arange(0, n_days, split_size) + + # Calculate date range + all_dates = set(pd.date_range(start=dt_start, end=dt_end).strftime("%Y-%m-%d")) + missing_dates = sorted(list(all_dates - existing_dates)) + + if not missing_dates: + print("No new data to download.") + return 0 + + # Find contiguous date ranges to download + date_ranges = [] + range_start = missing_dates[0] + prev_date = datetime.strptime(missing_dates[0], "%Y-%m-%d") + + for date_str in missing_dates[1:] + [None]: # Add None to handle the last range + if date_str is None or datetime.strptime(date_str, "%Y-%m-%d") - prev_date > timedelta(days=1): + date_ranges.append((range_start, prev_date.strftime("%Y-%m-%d"))) + if date_str is not None: + range_start = date_str + prev_date = datetime.strptime(date_str, "%Y-%m-%d") if date_str else None + # Data Loading - print("Downloading data through NOAA API") - datasets_list = Parallel(n_jobs=n_jobs)( - delayed(dl_noaa_api)( - i, datatypes, station_id, noaa_api_token, start_date, end_date, split_size + print("Downloading missing data through NOAA API") + all_new_data = [] + + for start, end in date_ranges: + print(f"Downloading data from {start} to {end}") + n_days = (datetime.strptime(end, "%Y-%m-%d") - datetime.strptime(start, "%Y-%m-%d")).days + 1 + split_size = np.floor(1000 / len(datatypes)) + split_range = np.arange(0, n_days, split_size) + + datasets_list = Parallel(n_jobs=n_jobs)( + delayed(dl_noaa_api)( + i, datatypes, station_id, noaa_api_token, start, end, split_size + ) + for i in tqdm.tqdm(split_range[:]) ) - for i in tqdm.tqdm(split_range[:]) - ) - # drop empty/None from datasets_list - datasets_list = [i for i in datasets_list if i is not None] + + # Drop empty/None from datasets_list + datasets_list = [i for i in datasets_list if i is not None] + all_new_data.extend(datasets_list) # Merge subsets and create DataFrame - df = pd.concat(datasets_list) + df = pd.concat(all_new_data) df_pivot = assign_numeric_datatypes(df) df_pivot["DATE"] = df_pivot.apply( @@ -69,16 +98,24 @@ def download_from_noaa( ) df_merged["DATE"] = df_merged.index df_merged["NAME"] = loc_name - df_merged["TAVG"] = None - df_merged["SNWD"] = None + if "TAVG" not in df_merged.columns: + df_merged["TAVG"] = None + if "SNWD" not in df_merged.columns: + df_merged["SNWD"] = None final_cols = ["STATION", "NAME", "DATE", "PRCP", "SNWD", "TAVG", "TMAX", "TMIN"] df_final = df_merged[final_cols] df_final = df_final.replace({np.nan: None}) + + # Merge with existing data if it exists + if existing_df is not None: + df_final = pd.concat([existing_df.to_pandas(), df_final]).drop_duplicates(subset=["DATE"], keep="last") + print(f"Saving data to {output_file}") - df_final.to_csv(output_file, index=False, quoting=csv.QUOTE_ALL) + df_final.to_parquet(output_file) return 0 + def dl_noaa_api(i, dtypes, station_id, Token, date_start, date_end, split_size): """ function to download from NOAA API diff --git a/pyproject.toml b/pyproject.toml index fe5821a..5672168 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,12 +1,12 @@ [build-system] -requires = ["setuptools>=42", "wheel"] +requires = ["setuptools>0.70", "wheel"] build-backend = "setuptools.build_meta" [project] name = "noaaplotter" -version = "0.5.3" +version = "0.5.4" description = "Package to plot fancy climate/weather data of NOAA" -requires-python = ">=3.9" +requires-python = ">=3.11" authors = [ { name = "Ingmar Nitze", email = "ingmar.nitze@awi.de" } ] @@ -22,7 +22,12 @@ dependencies = [ "requests", "joblib>=1.4", "tqdm>=4.67", - "geemap>=0.35" + "geemap>=0.35.1", + "polars>=1.18.0", + "pyarrow>=18.1.0", + "box>=0.1.5", + "setuptools>=75.6.0", + "narwhals>=1.20.1", ] [tool.setuptools.packages.find]