Automatic Encoding for Categorical Variables

To encode calendar features as categorical variables, we can use any suitable encoding method from the sklearn.preprocessing module, such as OneHotEncoder, TargetEncoder, or OrdinalEncoder.

In the first example below, we will use OneHotEncoder to encode the categorical variables in our dataset. This method creates a new binary column for each category of the original variable.

from sklearn.preprocessing import OneHotEncoder
import numpy as np
import pandas as pd
from peshbeen.models import ml_forecaster
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# Daily index covering roughly two years of observations.
date_range = pd.date_range(start='2020-01-01', periods=720, freq='D')

# Synthetic, non-stationary flower-sales series: linear upward trend,
# weekly and yearly sinusoidal seasonality, plus Gaussian noise.
np.random.seed(42)
t = np.arange(720)
trend = 30 + 0.07 * t
weekly = 10 * np.sin(2 * np.pi * date_range.dayofyear / 7)
yearly = 10 * np.sin(2 * np.pi * date_range.dayofyear / 365)
noise = np.random.normal(0, 5, 720)
data = trend + weekly + yearly + noise

sales_data = pd.DataFrame(data, index=date_range, columns=['sales'])
# Calendar features to be treated as categorical variables downstream.
sales_data["week_day"] = sales_data.index.dayofweek
sales_data["month"] = sales_data.index.month
cat_vars = ["week_day", "month"]

# Hold out the final 30 days for evaluation.
train = sales_data.iloc[:-30]
test = sales_data.iloc[-30:]

# Example: one-hot encoding the calendar features for XGBoost.
# drop="first" avoids the dummy-variable trap; unknown categories at
# forecast time are encoded as all-zeros rather than raising.
ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore", drop="first")

xgb = ml_forecaster(
    target_col="sales",
    model=XGBRegressor(n_estimators=100, random_state=42),
    lags=6,
    cat_variables=cat_vars,
    categorical_encoder=ohe,
)
xgb.fit(train)
xgb_forecasts = xgb.forecast(30, exog=test[cat_vars])
# Inspect the transformed feature matrix used by XGBoost
xgb.X.head()
week_day_1 week_day_2 week_day_3 week_day_4 week_day_5 week_day_6 month_2 month_3 month_4 month_5 ... month_9 month_10 month_11 month_12 sales_lag_1 sales_lag_2 sales_lag_3 sales_lag_4 sales_lag_5 sales_lag_6
2020-01-07 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 22.392017 20.219602 34.174336 38.233477 39.472174 40.474019
2020-01-08 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 39.518145 22.392017 20.219602 34.174336 38.233477 39.472174
2020-01-09 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 43.518276 39.518145 22.392017 20.219602 34.174336 38.233477
2020-01-10 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 39.504995 43.518276 39.518145 22.392017 20.219602 34.174336
2020-01-11 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 39.394569 39.504995 43.518276 39.518145 22.392017 20.219602

5 rows × 23 columns

# Example of using TargetEncoder for LightGBM.
# TargetEncoder replaces each category with a (cross-fitted) mean of the
# target, so no extra columns are created.
from sklearn.preprocessing import TargetEncoder

te = TargetEncoder(cv=5)
lgb = ml_forecaster(
    target_col="sales",
    model=LGBMRegressor(n_estimators=100, random_state=42, verbose=-1),
    lags=6,
    cat_variables=cat_vars,
    categorical_encoder=te,
)
lgb.fit(train)
lgb_forecasts = lgb.forecast(30, exog=test[cat_vars])
# Inspect the transformed feature matrix used by LightGBM
lgb.X.head()
week_day month sales_lag_1 sales_lag_2 sales_lag_3 sales_lag_4 sales_lag_5 sales_lag_6
2020-01-07 50.118039 46.945004 22.392017 20.219602 34.174336 38.233477 39.472174 40.474019
2020-01-08 55.195593 47.235865 39.518145 22.392017 20.219602 34.174336 38.233477 39.472174
2020-01-09 59.591927 46.210753 43.518276 39.518145 22.392017 20.219602 34.174336 38.233477
2020-01-10 59.436795 46.478067 39.504995 43.518276 39.518145 22.392017 20.219602 34.174336
2020-01-11 57.761881 49.035528 39.394569 39.504995 43.518276 39.518145 22.392017 20.219602

Automatic Transformations for Rolling Window Features

peshbeen supports user-specified rolling window features — such as rolling means and standard deviations — which can be particularly useful for ML regressors as they capture recent dynamics in the series. Beyond feature engineering, peshbeen can automatically apply a Box-Cox transformation to the target variable when the data exhibits heteroscedasticity, stabilising variance before model fitting and improving forecast reliability.

import matplotlib.pyplot as plt
from peshbeen.transformations import rolling_mean, rolling_quantile, rolling_std, expanding_mean
from peshbeen.models import ml_forecaster
from sklearn.linear_model import LinearRegression

# Rolling-window and expanding features derived from the target series.
# NOTE(review): shift appears to offset the window relative to the forecast
# origin (e.g. shift=1 excludes the current observation) — confirm against
# the peshbeen transformations documentation.
transformations = [
    rolling_std(window_size=30, shift=1),
    rolling_mean(window_size=30, shift=7),
    rolling_quantile(window_size=30, shift=1, quantile=0.25),
    rolling_quantile(window_size=30, shift=1, quantile=0.75),
    expanding_mean(shift=1),
]
# box_cox=0.5 applies a Box-Cox transform (lambda = 0.5) to the target
# before fitting, stabilising variance in the heteroscedastic series.
linear_model = ml_forecaster(
    model=LinearRegression(),
    target_col='sales',
    lags=7,
    box_cox=0.5,
    lag_transform=transformations,
    cat_variables=cat_vars,
    categorical_encoder=ohe,
)
linear_model.fit(train)
forecasts = linear_model.forecast(H=30, exog=test[cat_vars])

# Plot the last 120 training points, the held-out test period, and the forecast.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(train.index[-120:], train['sales'][-120:], label='Train')
ax.plot(test.index, test['sales'], label='Test')
ax.plot(test.index, forecasts, label='Linear Regression Forecast')
ax.set_title('Linear Regression Forecast')
ax.set_xlabel('Date')
ax.set_ylabel('Sales')
ax.legend()
plt.show()

# How the transformed features look like for Linear Regression
linear_model.X.head()
week_day_1 week_day_2 week_day_3 week_day_4 week_day_5 week_day_6 month_2 month_3 month_4 month_5 ... sales_lag_3 sales_lag_4 sales_lag_5 sales_lag_6 sales_lag_7 rolling_std_30_shift_1 rolling_mean_30_shift_7 rolling_quantile_30_shift_1_q0.25 rolling_quantile_30_shift_1_q0.75 expanding_mean_shift_1
2020-01-08 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 6.993242 9.691764 10.366645 10.565377 10.723839 1.581041 10.723839 8.577902 10.569034 9.482514
2020-01-09 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 7.464041 6.993242 9.691764 10.366645 10.565377 1.583857 10.644608 9.134833 10.610479 9.696410
2020-01-10 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 10.572692 7.464041 6.993242 9.691764 10.366645 1.509947 10.551954 9.691764 10.572692 9.793542
2020-01-11 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 ... 11.193677 10.572692 7.464041 6.993242 9.691764 1.443708 10.336906 9.860484 10.572169 9.869489
2020-01-12 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 ... 10.570600 11.193677 10.572692 7.464041 6.993242 1.460908 9.668173 8.937674 10.571646 9.716225

5 rows × 29 columns

Fourier Terms for Seasonal Patterns

For series with strong seasonal patterns, peshbeen can automatically generate Fourier terms as a DataFrame indexed to match the original series — making them ready to merge as exogenous variables in a single line. Calendar features such as month or day of week can be added directly to the same DataFrame, and peshbeen will automatically encode them as categorical variables. This covers a wide range of calendar effects, from weekend sales spikes to holiday demand shifts.

from peshbeen.transformations import fourier_terms

# Build Fourier terms for yearly seasonality (period 365, two harmonics)
# to be supplied as exogenous variables to the model.
sales_exog = sales_data.copy()  # work on a copy; keep the original intact
# Drop the month column: the Fourier terms now capture yearly seasonality,
# so keeping month as a categorical feature would be redundant.
sales_exog.drop(columns=["month"], inplace=True)
fourier_trms = fourier_terms(index=sales_exog.index, period=365, num_terms=2)
# Index-aligned merge attaches the Fourier columns to the sales data.
sales_exog = sales_exog.merge(fourier_trms, left_index=True, right_index=True)
sales_exog.head()
sales week_day sin_1_365 sin_2_365 cos_1_365 cos_2_365
2020-01-01 40.474019 2 0.000000 0.000000 1.000000 1.000000
2020-01-02 39.472174 3 0.017213 0.034422 0.999852 0.999407
2020-01-03 38.233477 4 0.034422 0.068802 0.999407 0.997630
2020-01-04 34.174336 5 0.051620 0.103102 0.998667 0.994671
2020-01-05 20.219602 6 0.068802 0.137279 0.997630 0.990532
# Split the data into train and test sets.
from sklearn.linear_model import Lasso

train_exog = sales_exog.iloc[:-30]
# Exogenous variables for the forecast horizon: everything except the target.
test_exog = sales_exog.iloc[-30:].drop(columns=['sales'])
# ML forecast using Lasso (L1-regularised linear regression) with Fourier
# terms as exogenous variables.  (The original comment said "Linear
# Regression", but the model below is Lasso.)
lr_model = ml_forecaster(
    model=Lasso(alpha=0.1),
    target_col='sales',
    lags=7,
    lag_transform=transformations,
    cat_variables=["week_day"],
    categorical_encoder=ohe,
)
lr_model.fit(train_exog)
lr_forecast = lr_model.forecast(H=30, exog=test_exog)
# Plot the forecast against the actual values.
# Labels corrected: the fitted model is Lasso, not plain linear regression.
plt.figure(figsize=(12, 6))
plt.plot(train.index[-120:], train['sales'][-120:], label='Train')
plt.plot(test.index, test['sales'], label='Test')
plt.plot(test.index, lr_forecast, label='Lasso Forecast with Fourier Terms')
plt.title('Lasso Forecast with Fourier Terms')
plt.xlabel('Date')
plt.ylabel('Sales')
plt.legend()
plt.show()