From 330db9484d95934f14253a13e8e7affe9ff87d50 Mon Sep 17 00:00:00 2001 From: initze Date: Wed, 1 Jan 2025 21:30:39 +0100 Subject: [PATCH 1/7] added requirements and changed data to parquet --- noaaplotter/scripts/download_data.py | 4 ++-- noaaplotter/utils/download_utils.py | 17 ++++++++++++++++- pyproject.toml | 10 +++++++--- 3 files changed, 25 insertions(+), 6 deletions(-) diff --git a/noaaplotter/scripts/download_data.py b/noaaplotter/scripts/download_data.py index 2c9b7ea..d0778f3 100644 --- a/noaaplotter/scripts/download_data.py +++ b/noaaplotter/scripts/download_data.py @@ -19,8 +19,8 @@ def main(): dest="output_file", type=str, required=True, - default="data/data.csv", - help="csv file to save results", + default="data/parquet.csv", + help="parquet file to save results", ) parser.add_argument( diff --git a/noaaplotter/utils/download_utils.py b/noaaplotter/utils/download_utils.py index 39f8e98..8308738 100755 --- a/noaaplotter/utils/download_utils.py +++ b/noaaplotter/utils/download_utils.py @@ -15,6 +15,13 @@ from noaaplotter.utils.utils import assign_numeric_datatypes +import polars as pl +from datetime import datetime +import numpy as np +import tqdm +from joblib import Parallel, delayed +import os + def download_from_noaa( output_file, start_date, @@ -28,17 +35,23 @@ def download_from_noaa( # remove file if exists if os.path.exists(output_file): os.remove(output_file) + # Make query string dtypes_string = "&".join([f"datatypeid={dt}" for dt in datatypes]) + # convert datestring to dt dt_start = datetime.strptime(start_date, "%Y-%m-%d") dt_end = datetime.strptime(end_date, "%Y-%m-%d") + # calculate number of days n_days = (dt_end - dt_start).days + # calculate number of splits to fit into 1000 lines/rows split_size = np.floor(1000 / len(datatypes)) + # calculate splits split_range = np.arange(0, n_days, split_size) + # Data Loading print("Downloading data through NOAA API") datasets_list = Parallel(n_jobs=n_jobs)( @@ -47,6 +60,7 @@ def download_from_noaa( ) for i in tqdm.tqdm(split_range[:]) ) + # drop empty/None from datasets_list datasets_list = [i for i in datasets_list if i is not None] @@ -75,10 +89,11 @@ def download_from_noaa( df_final = df_merged[final_cols] df_final = df_final.replace({np.nan: None}) print(f"Saving data to {output_file}") - df_final.to_csv(output_file, index=False, quoting=csv.QUOTE_ALL) + df_final.to_parquet(output_file) return 0 + def dl_noaa_api(i, dtypes, station_id, Token, date_start, date_end, split_size): """ function to download from NOAA API diff --git a/pyproject.toml b/pyproject.toml index fe5821a..c9a3fbf 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,12 +1,12 @@ [build-system] -requires = ["setuptools>=42", "wheel"] +requires = ["setuptools>0.70", "wheel"] build-backend = "setuptools.build_meta" [project] name = "noaaplotter" version = "0.5.3" description = "Package to plot fancy climate/weather data of NOAA" -requires-python = ">=3.9" +requires-python = ">=3.11" authors = [ { name = "Ingmar Nitze", email = "ingmar.nitze@awi.de" } ] @@ -22,7 +22,11 @@ dependencies = [ "requests", "joblib>=1.4", "tqdm>=4.67", - "geemap>=0.35" + "geemap>=0.35.1", + "polars>=1.18.0", + "pyarrow>=18.1.0", + "box>=0.1.5", + "setuptools>=75.6.0", ] [tool.setuptools.packages.find] From e3275ad83a606768f212d9e4c2ba693778cf7799 Mon Sep 17 00:00:00 2001 From: initze Date: Wed, 1 Jan 2025 21:56:12 +0100 Subject: [PATCH 2/7] changed to parquet default and data check --- noaaplotter/scripts/plot_daily.py | 2 +- noaaplotter/utils/download_utils.py | 89 +++++++++++++++++------------ 2 files changed, 55 insertions(+), 36 deletions(-) diff --git a/noaaplotter/scripts/plot_daily.py b/noaaplotter/scripts/plot_daily.py index b4dafd2..4387902 100644 --- a/noaaplotter/scripts/plot_daily.py +++ b/noaaplotter/scripts/plot_daily.py @@ -12,7 +12,7 @@ def main(): parser = argparse.ArgumentParser(description='Parse arguments.') parser.add_argument('-infile', dest='infile', type=str, required=True, - default='data/temp.csv', + default='data/temp.parquet', help='input file with climate data') parser.add_argument('-t', dest='token', type=str, required=False, diff --git a/noaaplotter/utils/download_utils.py b/noaaplotter/utils/download_utils.py index 8308738..4499dac 100755 --- a/noaaplotter/utils/download_utils.py +++ b/noaaplotter/utils/download_utils.py @@ -1,4 +1,3 @@ -import csv import datetime as dt import json import os @@ -8,6 +7,7 @@ import geemap import numpy as np import pandas as pd +import polars as pl import requests import tqdm from joblib import Parallel, delayed @@ -15,13 +15,6 @@ from noaaplotter.utils.utils import assign_numeric_datatypes -import polars as pl -from datetime import datetime -import numpy as np -import tqdm -from joblib import Parallel, delayed -import os - def download_from_noaa( output_file, start_date, @@ -32,40 +25,61 @@ def download_from_noaa( noaa_api_token, n_jobs=4, ): - # remove file if exists + # Check if file exists and load it if os.path.exists(output_file): - os.remove(output_file) - - # Make query string - dtypes_string = "&".join([f"datatypeid={dt}" for dt in datatypes]) - - # convert datestring to dt + existing_df = pl.read_parquet(output_file).drop_nulls(subset='STATION') + existing_dates = set(existing_df['DATE'].to_list()) + else: + existing_df = None + existing_dates = set() + + # Convert datestrings to datetime dt_start = datetime.strptime(start_date, "%Y-%m-%d") dt_end = datetime.strptime(end_date, "%Y-%m-%d") - - # calculate number of days - n_days = (dt_end - dt_start).days - - # calculate number of splits to fit into 1000 lines/rows - split_size = np.floor(1000 / len(datatypes)) - - # calculate splits - split_range = np.arange(0, n_days, split_size) - + + # Calculate date range + all_dates = set(pd.date_range(start=dt_start, end=dt_end).strftime("%Y-%m-%d")) + missing_dates = sorted(list(all_dates - existing_dates)) + + if not missing_dates: + print("No new data to download.") + return 0 + + # Find contiguous date ranges to download + date_ranges = [] + range_start = missing_dates[0] + prev_date = datetime.strptime(missing_dates[0], "%Y-%m-%d") + + for date_str in missing_dates[1:] + [None]: # Add None to handle the last range + if date_str is None or datetime.strptime(date_str, "%Y-%m-%d") - prev_date > timedelta(days=1): + date_ranges.append((range_start, prev_date.strftime("%Y-%m-%d"))) + if date_str is not None: + range_start = date_str + prev_date = datetime.strptime(date_str, "%Y-%m-%d") if date_str else None + # Data Loading - print("Downloading data through NOAA API") - datasets_list = Parallel(n_jobs=n_jobs)( - delayed(dl_noaa_api)( - i, datatypes, station_id, noaa_api_token, start_date, end_date, split_size + print("Downloading missing data through NOAA API") + all_new_data = [] + + for start, end in date_ranges: + print(f"Downloading data from {start} to {end}") + n_days = (datetime.strptime(end, "%Y-%m-%d") - datetime.strptime(start, "%Y-%m-%d")).days + 1 + split_size = np.floor(1000 / len(datatypes)) + split_range = np.arange(0, n_days, split_size) + + datasets_list = Parallel(n_jobs=n_jobs)( + delayed(dl_noaa_api)( + i, datatypes, station_id, noaa_api_token, start, end, split_size + ) + for i in tqdm.tqdm(split_range[:]) ) - for i in tqdm.tqdm(split_range[:]) - ) - - # drop empty/None from datasets_list - datasets_list = [i for i in datasets_list if i is not None] + + # Drop empty/None from datasets_list + datasets_list = [i for i in datasets_list if i is not None] + all_new_data.extend(datasets_list) # Merge subsets and create DataFrame - df = pd.concat(datasets_list) + df = pd.concat(all_new_data) df_pivot = assign_numeric_datatypes(df) df_pivot["DATE"] = df_pivot.apply( @@ -88,6 +102,11 @@ def download_from_noaa( final_cols = ["STATION", "NAME", "DATE", "PRCP", "SNWD", "TAVG", "TMAX", "TMIN"] df_final = df_merged[final_cols] df_final = df_final.replace({np.nan: None}) + + # Merge with existing data if it exists + if existing_df is not None: + df_final = pd.concat([existing_df.to_pandas(), df_final]).drop_duplicates(subset=["DATE"], keep="last") + print(f"Saving data to {output_file}") df_final.to_parquet(output_file) return 0 From eb416441f1de38429332fbbacd9bb2af97586018 Mon Sep 17 00:00:00 2001 From: initze Date: Wed, 1 Jan 2025 22:04:22 +0100 Subject: [PATCH 3/7] test for feature existance --- noaaplotter/utils/download_utils.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/noaaplotter/utils/download_utils.py b/noaaplotter/utils/download_utils.py index 4499dac..e194905 100755 --- a/noaaplotter/utils/download_utils.py +++ b/noaaplotter/utils/download_utils.py @@ -97,8 +97,10 @@ def download_from_noaa( ) df_merged["DATE"] = df_merged.index df_merged["NAME"] = loc_name - df_merged["TAVG"] = None - df_merged["SNWD"] = None + if "TAVG" not in df_merged.columns: + df_merged["TAVG"] = None + if "SNWD" not in df_merged.columns: + df_merged["SNWD"] = None final_cols = ["STATION", "NAME", "DATE", "PRCP", "SNWD", "TAVG", "TMAX", "TMIN"] df_final = df_merged[final_cols] df_final = df_final.replace({np.nan: None}) From a11b81eb5984a87fa17337de88841c20c71e1040 Mon Sep 17 00:00:00 2001 From: initze Date: Sun, 5 Jan 2025 10:22:57 +0100 Subject: [PATCH 4/7] added polars for rolling daily and some refactoring, added polars and narwhals as dependencies --- noaaplotter/utils/dataset.py | 494 ++++++++++++++++++++++------ noaaplotter/utils/download_utils.py | 1 + pyproject.toml | 1 + 3 files changed, 392 insertions(+), 104 deletions(-) diff --git a/noaaplotter/utils/dataset.py b/noaaplotter/utils/dataset.py index aea6154..53095a6 100755 --- a/noaaplotter/utils/dataset.py +++ b/noaaplotter/utils/dataset.py @@ -8,20 +8,23 @@ # version: 2020-12-09 ######################## -import numpy as np import os + +import numpy as np +import polars as pl +import time + from .utils import * + numeric_only = True + class NOAAPlotterDailySummariesDataset(object): """ This class/module creates nice plots of observed weather data from NOAA """ - def __init__(self, - input_filepath=None, - location=None, - remove_feb29=False): + def __init__(self, input_filepath=None, location=None, remove_feb29=False): self.input_switch = None self.input_filepath = input_filepath self.location = location @@ -30,9 +33,9 @@ def __init__(self, self.remove_feb29 = remove_feb29 self.data = None self._check_data_loading() - if self.input_switch == 'file': + if self.input_switch == "file": self._load_file() - elif self.input_switch == 'noaa_api': + elif self.input_switch == "noaa_api": self._load_noaa() self._validate_location() self._update_datatypes() @@ -45,7 +48,7 @@ def print_locations(self): """ Print all locations names """ - print(self.data['NAME'].unique()) + print(self.data["NAME"].unique()) def _check_data_loading(self): """ @@ -54,18 +57,23 @@ def _check_data_loading(self): * input_filepath """ if os.path.exists(self.input_filepath): - self.input_switch = 'file' + self.input_switch = "file" elif self.noaa_token and self.noaa_location: - self.input_switch = 'noaa_api' + self.input_switch = "noaa_api" else: - raise ImportError("Please enter either correct file path or noaa station_id and API token") + raise ImportError( + "Please enter either correct file path or noaa station_id and API token" + ) def _load_file(self): """ load csv file into Pandas DataFrame :return: """ - self.data = pd.read_csv(self.input_filepath) + data = pl.read_parquet(self.input_filepath).to_pandas() + if "__index_level_0__" in data.columns: + data = data.drop(columns=["__index_level_0__"]) + self.data = data def _load_noaa(self): """ @@ -83,34 +91,37 @@ def _validate_location(self): raise error and message if location name cannot be found :return: """ - if not self.location and len(pd.unique(self.data['NAME']) == 1): + if not self.location and len(pd.unique(self.data["NAME"]) == 1): pass - elif not self.location and len(pd.unique(self.data['NAME']) > 1): + elif not self.location and len(pd.unique(self.data["NAME"]) > 1): raise ValueError( - 'There is more than one location in the dataset. Please choose a location using the -loc option! ' - 'Valid Location identifiers: {0} ' - .format(self.data['NAME'].unique())) + "There is more than one location in the dataset. Please choose a location using the -loc option! " + "Valid Location identifiers: {0} ".format(self.data["NAME"].unique()) + ) else: - filt = self.data['NAME'].str.lower().str.contains(self.location.lower()) + filt = self.data["NAME"].str.lower().str.contains(self.location.lower()) if filt.sum() == 0: - raise ValueError('Location Name is not valid! Valid Location identifiers: {0}' - .format(self.data['NAME'].unique())) + raise ValueError( + "Location Name is not valid! Valid Location identifiers: {0}".format( + self.data["NAME"].unique() + ) + ) def _update_datatypes(self): """ define 'DATE' as datetime :return: """ - self.data['DATE'] = pd.to_datetime(self.data['DATE']) + self.data["DATE"] = pd.to_datetime(self.data["DATE"]) def _get_datestring(self): """ write specific date formats :return: """ - self.data['DATE_MD'] = self.data['DATE'].dt.strftime('%m-%d') - self.data['DATE_YM'] = self.data['DATE'].dt.strftime('%Y-%m') - self.data['DATE_M'] = self.data['DATE'].dt.strftime('%m') + self.data["DATE_MD"] = self.data["DATE"].dt.strftime("%m-%d") + self.data["DATE_YM"] = self.data["DATE"].dt.strftime("%Y-%m") + self.data["DATE_M"] = self.data["DATE"].dt.strftime("%m") def _get_tmean(self): """ @@ -118,7 +129,7 @@ def _get_tmean(self): :return: """ # TODO: check for cases where TMIN and TMAX are empty (e.g. Schonefeld). There TAVG is the main field - self.data['TMEAN'] = self.data[['TMIN', 'TMAX']].mean(axis=1) + self.data["TMEAN"] = self.data[["TMIN", "TMAX"]].mean(axis=1) def _remove_feb29(self): """ @@ -126,7 +137,7 @@ def _remove_feb29(self): :return: """ if self.remove_feb29: - self.data = self.data[self.data['DATE_MD'] != '02-29'] + self.data = self.data[self.data["DATE_MD"] != "02-29"] def _filter_to_location(self): """ @@ -134,18 +145,20 @@ def _filter_to_location(self): :return: """ if self.location: - filt = self.data['NAME'].str.lower().str.contains(self.location.lower()) + filt = self.data["NAME"].str.lower().str.contains(self.location.lower()) if len(filt) > 0: self.data = self.data.loc[filt] else: - raise ValueError('Location Name is not valid') + raise ValueError("Location Name is not valid") def filter_to_climate(self, climate_start, climate_end): """ Function to create filtered dataset covering the defined climate normal period :return: """ - df_clim = self.data[(self.data['DATE'] >= climate_start) & (self.data['DATE'] <= climate_end)] + df_clim = self.data[ + (self.data["DATE"] >= climate_start) & (self.data["DATE"] <= climate_end) + ] return df_clim @staticmethod @@ -157,15 +170,55 @@ def get_monthly_stats(df): :return: """ df_out = pd.DataFrame() - df_out['tmean_doy_mean'] = df[['DATE', 'TMEAN']].groupby(df['DATE_YM']).mean(numeric_only=numeric_only).TMEAN - df_out['tmean_doy_std'] = df[['DATE', 'TMEAN']].groupby(df['DATE_YM']).std(numeric_only=numeric_only).TMEAN - df_out['tmax_doy_max'] = df[['DATE', 'TMAX']].groupby(df['DATE_YM']).max(numeric_only=numeric_only).TMAX - df_out['tmax_doy_std'] = df[['DATE', 'TMAX']].groupby(df['DATE_YM']).std(numeric_only=numeric_only).TMAX - df_out['tmin_doy_min'] = df[['DATE', 'TMIN']].groupby(df['DATE_YM']).min(numeric_only=numeric_only).TMIN - df_out['tmin_doy_std'] = df[['DATE', 'TMIN']].groupby(df['DATE_YM']).std(numeric_only=numeric_only).TMIN - if 'SNOW' in df.columns: - df_out['snow_doy_mean'] = df[['DATE', 'SNOW']].groupby(df['DATE_YM']).mean(numeric_only=numeric_only).SNOW - df_out['prcp_sum'] = df[['DATE', 'PRCP']].groupby(df['DATE_YM']).sum(numeric_only=numeric_only).PRCP + df_out["tmean_doy_mean"] = ( + df[["DATE", "TMEAN"]] + .groupby(df["DATE_YM"]) + .mean(numeric_only=numeric_only) + .TMEAN + ) + df_out["tmean_doy_std"] = ( + df[["DATE", "TMEAN"]] + .groupby(df["DATE_YM"]) + .std(numeric_only=numeric_only) + .TMEAN + ) + df_out["tmax_doy_max"] = ( + df[["DATE", "TMAX"]] + .groupby(df["DATE_YM"]) + .max(numeric_only=numeric_only) + .TMAX + ) + df_out["tmax_doy_std"] = ( + df[["DATE", "TMAX"]] + .groupby(df["DATE_YM"]) + .std(numeric_only=numeric_only) + .TMAX + ) + df_out["tmin_doy_min"] = ( + df[["DATE", "TMIN"]] + .groupby(df["DATE_YM"]) + .min(numeric_only=numeric_only) + .TMIN + ) + df_out["tmin_doy_std"] = ( + df[["DATE", "TMIN"]] + .groupby(df["DATE_YM"]) + .std(numeric_only=numeric_only) + .TMIN + ) + if "SNOW" in df.columns: + df_out["snow_doy_mean"] = ( + df[["DATE", "SNOW"]] + .groupby(df["DATE_YM"]) + .mean(numeric_only=numeric_only) + .SNOW + ) + df_out["prcp_sum"] = ( + df[["DATE", "PRCP"]] + .groupby(df["DATE_YM"]) + .sum(numeric_only=numeric_only) + .PRCP + ) return df_out @staticmethod @@ -176,23 +229,75 @@ def get_monthy_climate(df): """ df_out = pd.DataFrame() df = df.data - df['Month'] = df.reset_index().apply(lambda x: int(x['DATE_MD'][:2]), axis=1).values - df_out['tmean_mean'] = df[['Month', 'TMEAN']].groupby(df['Month']).mean(numeric_only=numeric_only).TMEAN - df_out['tmean_std'] = df[['Month', 'TMEAN']].groupby(df['Month']).std(numeric_only=numeric_only).TMEAN - df_out['tmax_max'] = df[['Month', 'TMAX']].groupby(df['Month']).max(numeric_only=numeric_only).TMAX - df_out['tmax_std'] = df[['Month', 'TMAX']].groupby(df['Month']).std(numeric_only=numeric_only).TMAX - df_out['tmin_min'] = df[['Month', 'TMIN']].groupby(df['Month']).min(numeric_only=numeric_only).TMIN - df_out['tmin_std'] = df[['Month', 'TMIN']].groupby(df['Month']).std(numeric_only=numeric_only).TMIN - if 'SNOW' in df.columns: - df_out['snow_mean'] = df[['Month', 'SNOW']].groupby(df['Month']).mean(numeric_only=numeric_only).SNOW - unique_years = len(np.unique(df.apply(lambda x: parse_dates_YM(x['DATE_YM']).year, axis=1))) - df_out['prcp_mean'] = df[['Month', 'PRCP']].groupby(df['Month']).mean(numeric_only=numeric_only).PRCP * unique_years + df["Month"] = ( + df.reset_index().apply(lambda x: int(x["DATE_MD"][:2]), axis=1).values + ) + df_out["tmean_mean"] = ( + df[["Month", "TMEAN"]] + .groupby(df["Month"]) + .mean(numeric_only=numeric_only) + .TMEAN + ) + df_out["tmean_std"] = ( + df[["Month", "TMEAN"]] + .groupby(df["Month"]) + .std(numeric_only=numeric_only) + .TMEAN + ) + df_out["tmax_max"] = ( + df[["Month", "TMAX"]] + .groupby(df["Month"]) + .max(numeric_only=numeric_only) + .TMAX + ) + df_out["tmax_std"] = ( + df[["Month", "TMAX"]] + .groupby(df["Month"]) + .std(numeric_only=numeric_only) + .TMAX + ) + df_out["tmin_min"] = ( + df[["Month", "TMIN"]] + .groupby(df["Month"]) + .min(numeric_only=numeric_only) + .TMIN + ) + df_out["tmin_std"] = ( + df[["Month", "TMIN"]] + .groupby(df["Month"]) + .std(numeric_only=numeric_only) + .TMIN + ) + if "SNOW" in df.columns: + df_out["snow_mean"] = ( + df[["Month", "SNOW"]] + .groupby(df["Month"]) + .mean(numeric_only=numeric_only) + .SNOW + ) + unique_years = len( + np.unique(df.apply(lambda x: parse_dates_YM(x["DATE_YM"]).year, axis=1)) + ) + df_out["prcp_mean"] = ( + df[["Month", "PRCP"]] + .groupby(df["Month"]) + .mean(numeric_only=numeric_only) + .PRCP + * unique_years + ) return df_out.reset_index(drop=False) class NOAAPlotterDailyClimateDataset(object): # TODO: make main class sub subclasses for daily/monthly - def __init__(self, daily_dataset, start='1981-01-01', end='2010-12-31', filtersize=7, impute_feb29=True): + def __init__( + self, + daily_dataset, + start="1981-01-01", + end="2010-12-31", + filtersize=7, + impute_feb29=True, + ): """ :param start: :param end: @@ -217,24 +322,30 @@ def __init__(self, daily_dataset, start='1981-01-01', end='2010-12-31', filtersi # mean imputation for 29 February self._impute_feb29() # filter if desired + start_time = time.time() self._run_filter() + # self._run_filter_polars() + end_time = time.time() + print(f"_run_filter took {end_time - start_time:.2f} seconds to run.") # make completeness report def _validate_date_range(self): - if self.daily_dataset.data['DATE'].max() >= self.end: - if self.daily_dataset.data['DATE'].min() <= self.end: + if self.daily_dataset.data["DATE"].max() >= self.end: + if self.daily_dataset.data["DATE"].min() <= self.end: self.date_range_valid = True else: - raise ('Dataset is insufficient to calculate climate normals!') + raise ("Dataset is insufficient to calculate climate normals!") def _filter_to_climate(self): """ calculate climate dataset :return: """ - df_clim = self.daily_dataset.data[(self.daily_dataset.data['DATE'] >= self.start) & - (self.daily_dataset.data['DATE'] <= self.end)] - df_clim = df_clim[(df_clim['DATE_MD'] != '02-29')] + df_clim = self.daily_dataset.data[ + (self.daily_dataset.data["DATE"] >= self.start) + & (self.daily_dataset.data["DATE"] <= self.end) + ] + df_clim = df_clim[(df_clim["DATE_MD"] != "02-29")] self.data_daily = df_clim def _calculate_climate_statistics(self): @@ -245,16 +356,61 @@ def _calculate_climate_statistics(self): :return: """ df_out = pd.DataFrame() - df_out['tmean_doy_mean'] = self.data_daily[['DATE', 'TMEAN']].groupby(self.data_daily['DATE_MD']).mean(numeric_only=numeric_only).TMEAN - df_out['tmean_doy_std'] = self.data_daily[['DATE', 'TMEAN']].groupby(self.data_daily['DATE_MD']).std().TMEAN - df_out['tmean_doy_max'] = self.data_daily[['DATE', 'TMEAN']].groupby(self.data_daily['DATE_MD']).max(numeric_only=numeric_only).TMEAN - df_out['tmean_doy_min'] = self.data_daily[['DATE', 'TMEAN']].groupby(self.data_daily['DATE_MD']).min(numeric_only=numeric_only).TMEAN - df_out['tmax_doy_max'] = self.data_daily[['DATE', 'TMAX']].groupby(self.data_daily['DATE_MD']).max(numeric_only=numeric_only).TMAX - df_out['tmax_doy_std'] = self.data_daily[['DATE', 'TMAX']].groupby(self.data_daily['DATE_MD']).std().TMAX - df_out['tmin_doy_min'] = self.data_daily[['DATE', 'TMIN']].groupby(self.data_daily['DATE_MD']).min(numeric_only=numeric_only).TMIN - df_out['tmin_doy_std'] = self.data_daily[['DATE', 'TMIN']].groupby(self.data_daily['DATE_MD']).std().TMIN - if 'SNOW' in self.data_daily.columns: - df_out['snow_doy_mean'] = self.data_daily[['DATE', 'SNOW']].groupby(self.data_daily['DATE_MD']).mean(numeric_only=numeric_only).SNOW + df_out["tmean_doy_mean"] = ( + self.data_daily[["DATE", "TMEAN"]] + .groupby(self.data_daily["DATE_MD"]) + .mean(numeric_only=numeric_only) + .TMEAN + ) + df_out["tmean_doy_std"] = ( + self.data_daily[["DATE", "TMEAN"]] + .groupby(self.data_daily["DATE_MD"]) + .std() + .TMEAN + ) + df_out["tmean_doy_max"] = ( + self.data_daily[["DATE", "TMEAN"]] + .groupby(self.data_daily["DATE_MD"]) + .max(numeric_only=numeric_only) + .TMEAN + ) + df_out["tmean_doy_min"] = ( + self.data_daily[["DATE", "TMEAN"]] + .groupby(self.data_daily["DATE_MD"]) + .min(numeric_only=numeric_only) + .TMEAN + ) + df_out["tmax_doy_max"] = ( + self.data_daily[["DATE", "TMAX"]] + .groupby(self.data_daily["DATE_MD"]) + .max(numeric_only=numeric_only) + .TMAX + ) + df_out["tmax_doy_std"] = ( + self.data_daily[["DATE", "TMAX"]] + .groupby(self.data_daily["DATE_MD"]) + .std() + .TMAX + ) + df_out["tmin_doy_min"] = ( + self.data_daily[["DATE", "TMIN"]] + .groupby(self.data_daily["DATE_MD"]) + .min(numeric_only=numeric_only) + .TMIN + ) + df_out["tmin_doy_std"] = ( + self.data_daily[["DATE", "TMIN"]] + .groupby(self.data_daily["DATE_MD"]) + .std() + .TMIN + ) + if "SNOW" in self.data_daily.columns: + df_out["snow_doy_mean"] = ( + self.data_daily[["DATE", "SNOW"]] + .groupby(self.data_daily["DATE_MD"]) + .mean(numeric_only=numeric_only) + .SNOW + ) self.data = df_out def _impute_feb29(self): @@ -263,7 +419,7 @@ def _impute_feb29(self): :return: """ if self.impute_feb29: - self.data.loc['02-29'] = self.data.loc['02-28':'03-01'].mean(axis=0) + self.data.loc["02-29"] = self.data.loc["02-28":"03-01"].mean(axis=0) self.data.sort_index(inplace=True) def _run_filter(self): @@ -272,10 +428,47 @@ def _run_filter(self): :return: """ if self.filtersize % 2 != 0: - data_roll = pd.concat([self.data.iloc[-self.filtersize:], - self.data, - self.data[:self.filtersize]]).rolling(self.filtersize).mean() - self.data = data_roll[self.filtersize: -self.filtersize] + data_roll = ( + pd.concat( + [ + self.data.iloc[-self.filtersize :], + self.data, + self.data[: self.filtersize], + ] + ) + .rolling(self.filtersize) + .mean() + ) + self.data = data_roll[self.filtersize : -self.filtersize] + + # TODO: produces different reults than pandas version + def _run_filter_polars(self): + """ + Function to run rolling mean filter on climate series to smooth out short fluctuations using Polars + """ + if self.filtersize % 2 != 0: + # Convert pandas DataFrame to Polars DataFrame + idx = self.data.index + df = pl.from_pandas(self.data) + + # Prepare data for rolling operation + extended_df = pl.concat( + [df.tail(self.filtersize), df, df.head(self.filtersize)] + ) + + # Apply rolling mean + rolled = extended_df.select( + [pl.all().rolling_mean(window_size=self.filtersize)] + ) + + # Slice the result to match original data size + result = rolled.slice(self.filtersize, len(df)).to_pandas() + result.index = idx + + # Convert back to pandas DataFrame and update self.data + self.data = result + else: + raise ValueError("Filter size must be odd") def _make_report(self): """ @@ -287,7 +480,9 @@ def _make_report(self): class NOAAPlotterMonthlyClimateDataset(object): - def __init__(self, daily_dataset, start='1981-01-01', end='2010-12-31', impute_feb29=True): + def __init__( + self, daily_dataset, start="1981-01-01", end="2010-12-31", impute_feb29=True + ): self.daily_dataset = daily_dataset self.monthly_aggregate = None self.start = parse_dates(start) @@ -296,20 +491,22 @@ def __init__(self, daily_dataset, start='1981-01-01', end='2010-12-31', impute_f self._validate_date_range() def _validate_date_range(self): - if self.daily_dataset.data['DATE'].max() >= self.end: - if self.daily_dataset.data['DATE'].min() <= self.end: + if self.daily_dataset.data["DATE"].max() >= self.end: + if self.daily_dataset.data["DATE"].min() <= self.end: self.date_range_valid = True else: - raise ('Dataset is insufficient to calculate climate normals!') + raise ("Dataset is insufficient to calculate climate normals!") def _filter_to_climate(self): """ calculate climate dataset :return: """ - df_clim = self.daily_dataset.data[(self.daily_dataset.data['DATE'] >= self.start) & - (self.daily_dataset.data['DATE'] <= self.end)] - df_clim = df_clim[(df_clim['DATE_MD'] != '02-29')] + df_clim = self.daily_dataset.data[ + (self.daily_dataset.data["DATE"] >= self.start) + & (self.daily_dataset.data["DATE"] <= self.end) + ] + df_clim = df_clim[(df_clim["DATE_MD"] != "02-29")] self.data_daily = df_clim def filter_to_date(self): @@ -317,9 +514,11 @@ def filter_to_date(self): calculate climate dataset :return: """ - df_clim = self.daily_dataset.data[(self.daily_dataset.data['DATE'] >= self.start) & - (self.daily_dataset.data['DATE'] <= self.end)] - df_clim = df_clim[(df_clim['DATE_MD'] != '02-29')] + df_clim = self.daily_dataset.data[ + (self.daily_dataset.data["DATE"] >= self.start) + & (self.daily_dataset.data["DATE"] <= self.end) + ] + df_clim = df_clim[(df_clim["DATE_MD"] != "02-29")] return df_clim def _impute_feb29(self): @@ -337,15 +536,55 @@ def calculate_monthly_statistics(self): df_out = pd.DataFrame() data_filtered = self.filter_to_date() - df_out['tmean_doy_mean'] = data_filtered[['DATE', 'TMEAN']].groupby(data_filtered['DATE_YM']).mean(numeric_only=numeric_only).TMEAN - df_out['tmean_doy_std'] = data_filtered[['DATE', 'TMEAN']].groupby(data_filtered['DATE_YM']).std(numeric_only=numeric_only).TMEAN - df_out['tmax_doy_max'] = data_filtered[['DATE', 'TMAX']].groupby(data_filtered['DATE_YM']).max(numeric_only=numeric_only).TMAX - df_out['tmax_doy_std'] = data_filtered[['DATE', 'TMAX']].groupby(data_filtered['DATE_YM']).std(numeric_only=numeric_only).TMAX - df_out['tmin_doy_min'] = data_filtered[['DATE', 'TMIN']].groupby(data_filtered['DATE_YM']).min(numeric_only=numeric_only).TMIN - df_out['tmin_doy_std'] = data_filtered[['DATE', 'TMIN']].groupby(data_filtered['DATE_YM']).std(numeric_only=numeric_only).TMIN - if 'SNOW' in data_filtered.columns: - df_out['snow_doy_mean'] = data_filtered[['DATE', 'SNOW']].groupby(data_filtered['DATE_YM']).mean(numeric_only=numeric_only).SNOW - df_out['prcp_sum'] = data_filtered[['DATE', 'PRCP']].groupby(data_filtered['DATE_YM']).sum(numeric_only=numeric_only).PRCP + df_out["tmean_doy_mean"] = ( + data_filtered[["DATE", "TMEAN"]] + .groupby(data_filtered["DATE_YM"]) + .mean(numeric_only=numeric_only) + .TMEAN + ) + df_out["tmean_doy_std"] = ( + data_filtered[["DATE", "TMEAN"]] + .groupby(data_filtered["DATE_YM"]) + .std(numeric_only=numeric_only) + .TMEAN + ) + df_out["tmax_doy_max"] = ( + data_filtered[["DATE", "TMAX"]] + .groupby(data_filtered["DATE_YM"]) + .max(numeric_only=numeric_only) + .TMAX + ) + df_out["tmax_doy_std"] = ( + data_filtered[["DATE", "TMAX"]] + .groupby(data_filtered["DATE_YM"]) + .std(numeric_only=numeric_only) + .TMAX + ) + df_out["tmin_doy_min"] = ( + data_filtered[["DATE", "TMIN"]] + .groupby(data_filtered["DATE_YM"]) + .min(numeric_only=numeric_only) + .TMIN + ) + df_out["tmin_doy_std"] = ( + data_filtered[["DATE", "TMIN"]] + .groupby(data_filtered["DATE_YM"]) + .std(numeric_only=numeric_only) + .TMIN + ) + if "SNOW" in data_filtered.columns: + df_out["snow_doy_mean"] = ( + data_filtered[["DATE", "SNOW"]] + .groupby(data_filtered["DATE_YM"]) + .mean(numeric_only=numeric_only) + .SNOW + ) + df_out["prcp_sum"] = ( + data_filtered[["DATE", "PRCP"]] + .groupby(data_filtered["DATE_YM"]) + .sum(numeric_only=numeric_only) + .PRCP + ) self.monthly_aggregate = df_out def calculate_monthly_climate(self): @@ -356,19 +595,66 @@ def calculate_monthly_climate(self): df_out = pd.DataFrame() data_filtered = self.filter_to_date() - data_filtered['DATE'] = data_filtered.apply(lambda x: parse_dates_YM(x['DATE_YM']), axis=1) - data_filtered['Month'] = data_filtered.apply(lambda x: parse_dates_YM(x['DATE_YM']).month, axis=1) - data_filtered['Year'] = data_filtered.apply(lambda x: parse_dates_YM(x['DATE_YM']).year, axis=1) - - df_out['tmean_doy_mean'] = data_filtered[['DATE', 'TMEAN']].groupby(data_filtered['Month']).mean(numeric_only=numeric_only).TMEAN - df_out['tmean_doy_std'] = data_filtered[['DATE', 'TMEAN']].groupby(data_filtered['Month']).std(numeric_only=numeric_only).TMEAN - df_out['tmax_doy_max'] = data_filtered[['DATE', 'TMAX']].groupby(data_filtered['Month']).max(numeric_only=numeric_only).TMAX - df_out['tmax_doy_std'] = data_filtered[['DATE', 'TMAX']].groupby(data_filtered['Month']).std(numeric_only=numeric_only).TMAX - df_out['tmin_doy_min'] = data_filtered[['DATE', 'TMIN']].groupby(data_filtered['Month']).min(numeric_only=numeric_only).TMIN - df_out['tmin_doy_std'] = data_filtered[['DATE', 'TMIN']].groupby(data_filtered['Month']).std(numeric_only=numeric_only).TMIN - if 'SNOW' in data_filtered.columns: - df_out['snow_doy_mean'] = data_filtered[['DATE', 'SNOW']].groupby(data_filtered['Month']).mean(numeric_only=numeric_only).SNOW - df_out['prcp_sum'] = data_filtered[['DATE', 'PRCP']].groupby(data_filtered['Month']).mean(numeric_only=numeric_only).PRCP * 30 + data_filtered["DATE"] = data_filtered.apply( + lambda x: parse_dates_YM(x["DATE_YM"]), axis=1 + ) + data_filtered["Month"] = data_filtered.apply( + lambda x: parse_dates_YM(x["DATE_YM"]).month, axis=1 + ) + data_filtered["Year"] = data_filtered.apply( + lambda x: parse_dates_YM(x["DATE_YM"]).year, axis=1 + ) + + df_out["tmean_doy_mean"] = ( + data_filtered[["DATE", "TMEAN"]] + .groupby(data_filtered["Month"]) + .mean(numeric_only=numeric_only) + .TMEAN + ) + df_out["tmean_doy_std"] = ( + data_filtered[["DATE", "TMEAN"]] + .groupby(data_filtered["Month"]) + .std(numeric_only=numeric_only) + .TMEAN + ) + df_out["tmax_doy_max"] = ( + data_filtered[["DATE", "TMAX"]] + .groupby(data_filtered["Month"]) + .max(numeric_only=numeric_only) + .TMAX + ) + df_out["tmax_doy_std"] = ( + data_filtered[["DATE", "TMAX"]] + .groupby(data_filtered["Month"]) + .std(numeric_only=numeric_only) + .TMAX + ) + df_out["tmin_doy_min"] = ( + data_filtered[["DATE", "TMIN"]] + .groupby(data_filtered["Month"]) + .min(numeric_only=numeric_only) + .TMIN + ) + df_out["tmin_doy_std"] = ( + data_filtered[["DATE", "TMIN"]] + .groupby(data_filtered["Month"]) + .std(numeric_only=numeric_only) + .TMIN + ) + if "SNOW" in data_filtered.columns: + df_out["snow_doy_mean"] = ( + data_filtered[["DATE", "SNOW"]] + .groupby(data_filtered["Month"]) + .mean(numeric_only=numeric_only) + .SNOW + ) + df_out["prcp_sum"] = ( + data_filtered[["DATE", "PRCP"]] + .groupby(data_filtered["Month"]) + .mean(numeric_only=numeric_only) + .PRCP + * 30 + ) # df_out = df_out.set_index('DATE_YM', drop=False) self.monthly_climate = df_out @@ -379,4 +665,4 @@ def _make_report(self): """ # input climate series (e.g. 1981-01-01 - 2010-12-31) - pass \ No newline at end of file + pass diff --git a/noaaplotter/utils/download_utils.py b/noaaplotter/utils/download_utils.py index e194905..3591850 100755 --- a/noaaplotter/utils/download_utils.py +++ b/noaaplotter/utils/download_utils.py @@ -15,6 +15,7 @@ from noaaplotter.utils.utils import assign_numeric_datatypes +# move some logic outside def download_from_noaa( output_file, start_date, diff --git a/pyproject.toml b/pyproject.toml index c9a3fbf..d64384b 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,7 @@ dependencies = [ "pyarrow>=18.1.0", "box>=0.1.5", "setuptools>=75.6.0", + "narwhals>=1.20.1", ] [tool.setuptools.packages.find] From 2045828252913a9a574082aef2e0484a6db52fbe Mon Sep 17 00:00:00 2001 From: initze Date: Sun, 5 Jan 2025 14:51:51 +0100 Subject: [PATCH 5/7] fixed no data perecipitation issues --- noaaplotter/utils/dataset.py | 80 ++++++++++++++++++------------------ 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/noaaplotter/utils/dataset.py b/noaaplotter/utils/dataset.py index 53095a6..b14e980 100755 --- a/noaaplotter/utils/dataset.py +++ b/noaaplotter/utils/dataset.py @@ -16,7 +16,7 @@ from .utils import * -numeric_only = True +NUMERIC_ONLY = True class NOAAPlotterDailySummariesDataset(object): @@ -173,50 +173,50 @@ def get_monthly_stats(df): df_out["tmean_doy_mean"] = ( df[["DATE", "TMEAN"]] .groupby(df["DATE_YM"]) - .mean(numeric_only=numeric_only) + .mean(numeric_only=NUMERIC_ONLY) .TMEAN ) df_out["tmean_doy_std"] = ( df[["DATE", "TMEAN"]] .groupby(df["DATE_YM"]) - .std(numeric_only=numeric_only) + .std(numeric_only=NUMERIC_ONLY) .TMEAN ) df_out["tmax_doy_max"] = ( df[["DATE", "TMAX"]] .groupby(df["DATE_YM"]) - .max(numeric_only=numeric_only) + .max(numeric_only=NUMERIC_ONLY) .TMAX ) df_out["tmax_doy_std"] = ( df[["DATE", "TMAX"]] .groupby(df["DATE_YM"]) - .std(numeric_only=numeric_only) + .std(numeric_only=NUMERIC_ONLY) .TMAX ) df_out["tmin_doy_min"] = ( df[["DATE", "TMIN"]] .groupby(df["DATE_YM"]) - .min(numeric_only=numeric_only) + .min(numeric_only=NUMERIC_ONLY) .TMIN ) df_out["tmin_doy_std"] = ( df[["DATE", "TMIN"]] .groupby(df["DATE_YM"]) - .std(numeric_only=numeric_only) + .std(numeric_only=NUMERIC_ONLY) .TMIN ) if "SNOW" in df.columns: df_out["snow_doy_mean"] = ( df[["DATE", "SNOW"]] .groupby(df["DATE_YM"]) - .mean(numeric_only=numeric_only) + .mean(numeric_only=NUMERIC_ONLY) .SNOW ) df_out["prcp_sum"] = ( df[["DATE", "PRCP"]] .groupby(df["DATE_YM"]) - .sum(numeric_only=numeric_only) + .sum(numeric_only=NUMERIC_ONLY) .PRCP ) return df_out @@ -235,44 +235,44 @@ def get_monthy_climate(df): df_out["tmean_mean"] = ( df[["Month", "TMEAN"]] .groupby(df["Month"]) - .mean(numeric_only=numeric_only) + .mean(numeric_only=NUMERIC_ONLY) .TMEAN ) df_out["tmean_std"] = ( df[["Month", "TMEAN"]] .groupby(df["Month"]) - .std(numeric_only=numeric_only) + .std(numeric_only=NUMERIC_ONLY) .TMEAN ) df_out["tmax_max"] = ( df[["Month", "TMAX"]] .groupby(df["Month"]) - .max(numeric_only=numeric_only) + .max(numeric_only=NUMERIC_ONLY) .TMAX ) df_out["tmax_std"] = ( df[["Month", "TMAX"]] .groupby(df["Month"]) - .std(numeric_only=numeric_only) + .std(numeric_only=NUMERIC_ONLY) .TMAX ) df_out["tmin_min"] = ( df[["Month", "TMIN"]] .groupby(df["Month"]) - .min(numeric_only=numeric_only) + .min(numeric_only=NUMERIC_ONLY) .TMIN ) df_out["tmin_std"] = ( df[["Month", "TMIN"]] .groupby(df["Month"]) - .std(numeric_only=numeric_only) + .std(numeric_only=NUMERIC_ONLY) .TMIN ) if "SNOW" in df.columns: df_out["snow_mean"] = ( df[["Month", "SNOW"]] .groupby(df["Month"]) - .mean(numeric_only=numeric_only) + .mean(numeric_only=NUMERIC_ONLY) .SNOW ) unique_years = len( @@ -281,7 +281,7 @@ def get_monthy_climate(df): df_out["prcp_mean"] = ( df[["Month", "PRCP"]] .groupby(df["Month"]) - .mean(numeric_only=numeric_only) + .mean(numeric_only=NUMERIC_ONLY) .PRCP * unique_years ) @@ -359,7 +359,7 @@ def _calculate_climate_statistics(self): df_out["tmean_doy_mean"] = ( self.data_daily[["DATE", "TMEAN"]] .groupby(self.data_daily["DATE_MD"]) - .mean(numeric_only=numeric_only) + .mean(numeric_only=NUMERIC_ONLY) .TMEAN ) df_out["tmean_doy_std"] = ( @@ -371,19 +371,19 @@ def _calculate_climate_statistics(self): df_out["tmean_doy_max"] = ( self.data_daily[["DATE", "TMEAN"]] .groupby(self.data_daily["DATE_MD"]) - .max(numeric_only=numeric_only) + .max(numeric_only=NUMERIC_ONLY) .TMEAN ) df_out["tmean_doy_min"] = ( self.data_daily[["DATE", "TMEAN"]] .groupby(self.data_daily["DATE_MD"]) - .min(numeric_only=numeric_only) + .min(numeric_only=NUMERIC_ONLY) .TMEAN ) df_out["tmax_doy_max"] = ( self.data_daily[["DATE", "TMAX"]] .groupby(self.data_daily["DATE_MD"]) - .max(numeric_only=numeric_only) + .max(numeric_only=NUMERIC_ONLY) .TMAX ) df_out["tmax_doy_std"] = ( @@ -395,7 +395,7 @@ def _calculate_climate_statistics(self): df_out["tmin_doy_min"] = ( self.data_daily[["DATE", "TMIN"]] .groupby(self.data_daily["DATE_MD"]) - .min(numeric_only=numeric_only) + .min(numeric_only=NUMERIC_ONLY) .TMIN ) df_out["tmin_doy_std"] = ( @@ -408,7 +408,7 @@ def _calculate_climate_statistics(self): df_out["snow_doy_mean"] = ( self.data_daily[["DATE", "SNOW"]] .groupby(self.data_daily["DATE_MD"]) - .mean(numeric_only=numeric_only) + .mean(numeric_only=NUMERIC_ONLY) .SNOW ) self.data = df_out @@ -539,50 +539,50 @@ def calculate_monthly_statistics(self): df_out["tmean_doy_mean"] = ( data_filtered[["DATE", "TMEAN"]] .groupby(data_filtered["DATE_YM"]) - .mean(numeric_only=numeric_only) + .mean(numeric_only=NUMERIC_ONLY) .TMEAN ) df_out["tmean_doy_std"] = ( data_filtered[["DATE", "TMEAN"]] .groupby(data_filtered["DATE_YM"]) - .std(numeric_only=numeric_only) + .std(numeric_only=NUMERIC_ONLY) .TMEAN ) df_out["tmax_doy_max"] = ( data_filtered[["DATE", "TMAX"]] .groupby(data_filtered["DATE_YM"]) - .max(numeric_only=numeric_only) + .max(numeric_only=NUMERIC_ONLY) .TMAX ) df_out["tmax_doy_std"] = ( data_filtered[["DATE", "TMAX"]] .groupby(data_filtered["DATE_YM"]) - .std(numeric_only=numeric_only) + .std(numeric_only=NUMERIC_ONLY) .TMAX ) df_out["tmin_doy_min"] = ( data_filtered[["DATE", "TMIN"]] .groupby(data_filtered["DATE_YM"]) - .min(numeric_only=numeric_only) + .min(numeric_only=NUMERIC_ONLY) .TMIN ) df_out["tmin_doy_std"] = ( data_filtered[["DATE", "TMIN"]] .groupby(data_filtered["DATE_YM"]) - .std(numeric_only=numeric_only) + .std(numeric_only=NUMERIC_ONLY) .TMIN ) if "SNOW" in data_filtered.columns: df_out["snow_doy_mean"] = ( data_filtered[["DATE", "SNOW"]] .groupby(data_filtered["DATE_YM"]) - .mean(numeric_only=numeric_only) + .mean(numeric_only=NUMERIC_ONLY) .SNOW ) df_out["prcp_sum"] = ( - data_filtered[["DATE", "PRCP"]] + data_filtered[["PRCP"]] .groupby(data_filtered["DATE_YM"]) - .sum(numeric_only=numeric_only) + .agg(lambda x: x.sum() if x.notna().any() else np.nan) .PRCP ) self.monthly_aggregate = df_out @@ -608,50 +608,50 @@ def calculate_monthly_climate(self): df_out["tmean_doy_mean"] = ( data_filtered[["DATE", "TMEAN"]] .groupby(data_filtered["Month"]) - .mean(numeric_only=numeric_only) + .mean(numeric_only=NUMERIC_ONLY) .TMEAN ) df_out["tmean_doy_std"] = ( data_filtered[["DATE", "TMEAN"]] .groupby(data_filtered["Month"]) - .std(numeric_only=numeric_only) + .std(numeric_only=NUMERIC_ONLY) .TMEAN ) df_out["tmax_doy_max"] = ( data_filtered[["DATE", "TMAX"]] .groupby(data_filtered["Month"]) - .max(numeric_only=numeric_only) + .max(numeric_only=NUMERIC_ONLY) .TMAX ) df_out["tmax_doy_std"] = ( data_filtered[["DATE", "TMAX"]] .groupby(data_filtered["Month"]) - .std(numeric_only=numeric_only) + .std(numeric_only=NUMERIC_ONLY) .TMAX ) df_out["tmin_doy_min"] = ( data_filtered[["DATE", "TMIN"]] .groupby(data_filtered["Month"]) - .min(numeric_only=numeric_only) + .min(numeric_only=NUMERIC_ONLY) .TMIN ) df_out["tmin_doy_std"] = ( data_filtered[["DATE", "TMIN"]] .groupby(data_filtered["Month"]) - .std(numeric_only=numeric_only) + .std(numeric_only=NUMERIC_ONLY) .TMIN ) if "SNOW" in data_filtered.columns: df_out["snow_doy_mean"] = ( data_filtered[["DATE", "SNOW"]] .groupby(data_filtered["Month"]) - .mean(numeric_only=numeric_only) + .mean(numeric_only=NUMERIC_ONLY) .SNOW ) df_out["prcp_sum"] = ( data_filtered[["DATE", "PRCP"]] .groupby(data_filtered["Month"]) - .mean(numeric_only=numeric_only) + .mean(numeric_only=NUMERIC_ONLY) .PRCP * 30 ) From 434aea6fb326a931453bedbf802164a3ee0c2b60 Mon Sep 17 00:00:00 2001 From: initze Date: Sun, 5 Jan 2025 15:02:13 +0100 Subject: [PATCH 6/7] updated other features as well --- noaaplotter/utils/dataset.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/noaaplotter/utils/dataset.py b/noaaplotter/utils/dataset.py index b14e980..141c80e 100755 --- a/noaaplotter/utils/dataset.py +++ b/noaaplotter/utils/dataset.py @@ -9,10 +9,10 @@ ######################## import os +import time import numpy as np import polars as pl -import time from .utils import * @@ -537,46 +537,46 @@ def calculate_monthly_statistics(self): df_out = pd.DataFrame() data_filtered = self.filter_to_date() df_out["tmean_doy_mean"] = ( - data_filtered[["DATE", "TMEAN"]] + data_filtered[["TMEAN"]] .groupby(data_filtered["DATE_YM"]) - .mean(numeric_only=NUMERIC_ONLY) + .agg(lambda x: x.mean() if x.notna().any() else np.nan) .TMEAN ) df_out["tmean_doy_std"] = ( - data_filtered[["DATE", "TMEAN"]] + data_filtered[["TMEAN"]] .groupby(data_filtered["DATE_YM"]) - .std(numeric_only=NUMERIC_ONLY) + .agg(lambda x: x.std() if x.notna().any() else np.nan) .TMEAN ) df_out["tmax_doy_max"] = ( - data_filtered[["DATE", "TMAX"]] + data_filtered[["TMAX"]] .groupby(data_filtered["DATE_YM"]) - .max(numeric_only=NUMERIC_ONLY) + .agg(lambda x: x.max() if x.notna().any() else np.nan) .TMAX ) df_out["tmax_doy_std"] = ( - data_filtered[["DATE", "TMAX"]] + data_filtered[["TMAX"]] .groupby(data_filtered["DATE_YM"]) - .std(numeric_only=NUMERIC_ONLY) + .agg(lambda x: x.std() if x.notna().any() else np.nan) .TMAX ) df_out["tmin_doy_min"] = ( - data_filtered[["DATE", "TMIN"]] + data_filtered[["TMIN"]] .groupby(data_filtered["DATE_YM"]) - .min(numeric_only=NUMERIC_ONLY) + .agg(lambda x: x.min() if x.notna().any() else np.nan) .TMIN ) df_out["tmin_doy_std"] = ( - data_filtered[["DATE", "TMIN"]] + data_filtered[["TMIN"]] .groupby(data_filtered["DATE_YM"]) - .std(numeric_only=NUMERIC_ONLY) + .agg(lambda x: x.std() if x.notna().any() else np.nan) .TMIN ) if "SNOW" in data_filtered.columns: df_out["snow_doy_mean"] = ( - data_filtered[["DATE", "SNOW"]] + data_filtered[["SNOW"]] .groupby(data_filtered["DATE_YM"]) - .mean(numeric_only=NUMERIC_ONLY) + .agg(lambda x: x.mean() if x.notna().any() else np.nan) .SNOW ) df_out["prcp_sum"] = ( From 1bfe255b7a4c87b0730b129a05b06d20d9347ef5 Mon Sep 17 00:00:00 2001 From: initze Date: Sun, 5 Jan 2025 15:05:31 +0100 Subject: [PATCH 7/7] version bump --- CHANGELOG.md | 6 ++++++ pyproject.toml | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5e05409..151fdcf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +# [0.5.4] - 2025-01-05 +### Changed +* fixed streamlit crash +* added toml for install +* fixed accounted for nan in monthly aggregates + # [0.5.1] - 2023-02-18 ### Changed * created download_utils diff --git a/pyproject.toml b/pyproject.toml index d64384b..5672168 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "noaaplotter" -version = "0.5.3" +version = "0.5.4" description = "Package to plot fancy climate/weather data of NOAA" requires-python = ">=3.11" authors = [