Mohit Garg
28 min read · Nov 18, 2024
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# ===========================
# 1️⃣ Data Preprocessing
# ===========================

# Example data: (ICD code sequence, [total spend, length of stay], label)
data = [
    (["E11", "I10", "J45"], [2000.0, 3.5], 1),
    (["I50", "E78"], [3000.0, 4.0], 1),
    (["Z00", "J44"], [1500.0, 2.0], 0),
    (["E11", "I10", "J45", "N18"], [2500.0, 5.0], 1),
    (["E78"], [1000.0, 1.5], 0),
]

# Set a maximum sequence length for padding/truncation.
max_len = 50 # You can adjust this as needed (e.g., 100 or 200) for your data.

# Get all unique ICD codes.
# We later add +1 so that index 0 is reserved for padding.
all_icd_codes = sorted({code for seq, _, _ in data for code in seq})
icd_encoder = LabelEncoder()
icd_encoder.fit(all_icd_codes)

def encode_sequence(seq, max_len=max_len):
    # Transform the codes and add +1 to each so that valid codes start from 1.
    encoded = [code + 1 for code in icd_encoder.transform(seq).tolist()]
    # Truncate if needed and pad with 0s (padding index).
    seq_len = min(len(encoded), max_len)
    padded = encoded[:seq_len] + [0] * (max_len - seq_len)
    return padded, seq_len

# Prepare arrays for ICD sequences, additional features, sequence lengths, and labels.
X_icd, X_lengths, X_features, y = [], [], [], []
for seq, features, label in data:
    enc_seq, seq_len = encode_sequence(seq, max_len=max_len)
    X_icd.append(enc_seq)
    X_lengths.append(seq_len)
    X_features.append(features)
    y.append(label)

X_icd = np.array(X_icd)
X_lengths = np.array(X_lengths)
X_features = np.array(X_features, dtype=np.float32)
y = np.array(y, dtype=np.float32)

# Split the data into training and test sets.
(X_icd_train, X_icd_test,
X_feat_train, X_feat_test,
lengths_train, lengths_test,
y_train, y_test) = train_test_split(X_icd, X_features, X_lengths, y, test_size=0.2, random_state=42)

# Convert to PyTorch tensors.
X_icd_train = torch.tensor(X_icd_train, dtype=torch.long)
X_icd_test = torch.tensor(X_icd_test, dtype=torch.long)
X_feat_train = torch.tensor(X_feat_train, dtype=torch.float32)
X_feat_test = torch.tensor(X_feat_test, dtype=torch.float32)
lengths_train = torch.tensor(lengths_train, dtype=torch.long)
lengths_test = torch.tensor(lengths_test, dtype=torch.long)
y_train = torch.tensor(y_train, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)

# ===========================
# 2️⃣ RETAIN Model Definition with Dynamic Packing
# ===========================

class RETAIN(nn.Module):
    def __init__(self, vocab_size, embed_dim=16, hidden_dim=32, additional_feature_dim=2):
        super(RETAIN, self).__init__()

        # Embedding layer for ICD codes.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        # GRUs for visit-level attention (alpha) and feature-level attention (beta).
        self.rnn_alpha = nn.GRU(embed_dim, hidden_dim, batch_first=True)
        self.rnn_beta = nn.GRU(embed_dim, hidden_dim, batch_first=True)

        # Linear layers to produce attention weights.
        self.alpha_fc = nn.Linear(hidden_dim, 1)         # Produces a scalar per time step.
        self.beta_fc = nn.Linear(hidden_dim, embed_dim)  # Produces a vector per time step.

        # Final fully connected layer for binary classification.
        # Input is the concatenation of the RETAIN context vector and additional features.
        self.final_fc = nn.Linear(embed_dim + additional_feature_dim, 1)

    def reverse_sequences(self, x, lengths):
        # x: (batch, seq_len, embed_dim)
        # Reverse each sequence individually up to its valid length.
        x_reversed = x.clone()
        batch_size, seq_len, _ = x.size()
        for i in range(batch_size):
            l = lengths[i]
            if l > 0:
                x_reversed[i, :l] = x[i, :l].flip(0)
        return x_reversed

    def forward(self, x, additional_features, lengths):
        # x: (batch, seq_len) ICD code indices.
        # additional_features: (batch, additional_feature_dim).
        # lengths: (batch,) actual sequence lengths.

        # 1. Get ICD embeddings: (batch, seq_len, embed_dim).
        x_embed = self.embedding(x)

        # 2. Reverse each sequence individually (only the valid tokens).
        x_embed_reversed = self.reverse_sequences(x_embed, lengths)

        # 3. Pack the reversed sequences.
        # Note: enforce_sorted=False allows an unsorted batch.
        packed_input = nn.utils.rnn.pack_padded_sequence(x_embed_reversed, lengths.cpu(),
                                                         batch_first=True, enforce_sorted=False)
        # Process through the GRUs.
        packed_alpha_out, _ = self.rnn_alpha(packed_input)
        packed_beta_out, _ = self.rnn_beta(packed_input)

        # Unpack the sequences back to padded tensors.
        # total_length=x.size(1) ensures we get the full padded sequence length.
        alpha_out, _ = nn.utils.rnn.pad_packed_sequence(packed_alpha_out, batch_first=True,
                                                        total_length=x.size(1))
        beta_out, _ = nn.utils.rnn.pad_packed_sequence(packed_beta_out, batch_first=True,
                                                       total_length=x.size(1))

        # 4. Compute attention weights.
        # Visit-level attention: produce a scalar per time step and apply softmax.
        # (Simplification: the softmax also runs over padded positions; their
        # contribution to the context vector is still zero because the padded
        # embeddings are zero vectors.)
        alpha = torch.softmax(self.alpha_fc(alpha_out).squeeze(-1), dim=1)  # (batch, seq_len)
        # Feature-level attention: produce a vector per time step and apply tanh.
        beta = torch.tanh(self.beta_fc(beta_out))  # (batch, seq_len, embed_dim)

        # 5. Compute the context vector.
        # Multiply element-wise: alpha (expanded), beta, and the reversed embeddings.
        context = (alpha.unsqueeze(-1) * beta * x_embed_reversed).sum(dim=1)  # (batch, embed_dim)

        # 6. Concatenate with additional features and pass through the final layer.
        combined = torch.cat([context, additional_features], dim=1)
        output = torch.sigmoid(self.final_fc(combined)).squeeze(-1)
        return output

# Initialize the model.
vocab_size = len(icd_encoder.classes_) + 1 # +1 because index 0 is reserved for padding.
additional_feature_dim = X_features.shape[1] # e.g., total spend and length of stay.
model = RETAIN(vocab_size, embed_dim=16, hidden_dim=32, additional_feature_dim=additional_feature_dim)

# ===========================
# 3️⃣ Model Training
# ===========================

criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

epochs = 20
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()

    # Forward pass with ICD sequence, additional features, and sequence lengths.
    outputs = model(X_icd_train, X_feat_train, lengths_train)
    loss = criterion(outputs, y_train)

    loss.backward()
    optimizer.step()

    if epoch % 5 == 0:
        print(f"Epoch [{epoch}/{epochs}], Loss: {loss.item():.4f}")

# ===========================
# 4️⃣ Model Evaluation
# ===========================

model.eval()
with torch.no_grad():
    y_pred = model(X_icd_test, X_feat_test, lengths_test)
    y_pred_class = (y_pred > 0.5).float()
    accuracy = (y_pred_class == y_test).float().mean().item()
    print(f"Test Accuracy: {accuracy:.4f}")


You can access detailed descriptive statistical information for features related to clinical code buckets, product information, and chronic conditions via the following link: <AddLink>.

The goal is to leverage historical, month-on-month data to develop predictive classification models for various healthcare and risk assessment use cases, such as admission risk, cancer risk, and diabetes risk. Each model will predict the likelihood of specific events or conditions occurring within a defined one-year response period, based on features derived from a preceding one-year period. This enables proactive identification of high-risk individuals and informed decision-making to improve healthcare outcomes.
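To make the windowing concrete, here is a minimal sketch of constructing the one-year lookback features and one-year response label around a cutoff date. It assumes a hypothetical monthly member-level table with columns 'member_id', 'month', and 'event_flag'; the names and aggregates are illustrative, not the actual schema.

import pandas as pd

def build_windows(df, cutoff):
    # Split the monthly history into a one-year lookback window (features)
    # and a one-year response window (label), relative to the cutoff date.
    cutoff = pd.Timestamp(cutoff)
    lookback = df[(df["month"] >= cutoff - pd.DateOffset(years=1)) & (df["month"] < cutoff)]
    response = df[(df["month"] >= cutoff) & (df["month"] < cutoff + pd.DateOffset(years=1))]

    # Features: aggregates over the preceding one-year period (illustrative).
    X = lookback.groupby("member_id").agg(
        n_months=("month", "nunique"),
        any_event_past=("event_flag", "max"),
    )
    # Label: did the event occur within the one-year response period?
    y = response.groupby("member_id")["event_flag"].max().rename("label")
    return X.join(y, how="left").fillna({"label": 0})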

The ASO Flag identifies senior executives or high-ranking individuals within the company who are entitled to full reimbursement of their insurance claims. Under this arrangement, the insurance provider reimburses these claims, and the company directly funds the reimbursement to the provider.
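For illustration, if the member table carried a hypothetical 0/1 'aso_flag' column, excluding these fully reimbursed members from cost-based features could look like the following (the frame and column name are assumptions, not the actual schema):

import pandas as pd

# Hypothetical illustration only: drop ASO members before computing cost features.
members_df = pd.DataFrame({"member_id": [1, 2, 3], "aso_flag": [0, 1, 0], "spend": [1200.0, 9000.0, 300.0]})
non_aso = members_df[members_df["aso_flag"] == 0]
print(non_aso)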

Code For CLF Pipeline

import pandas as pd
from sklearn.model_selection import train_test_split


class DataIngestion:
    def __init__(self, file_path=None, db_connection=None, query=None):
        """
        Initializes the DataIngestion class.

        :param file_path: Path to the CSV file (if loading data from a file).
        :param db_connection: Database connection object (if loading data from a database).
        :param query: SQL query to execute (if loading data from a database).
        """
        self.file_path = file_path
        self.db_connection = db_connection
        self.query = query
        self.data = None

    def load_data(self):
        """
        Loads data from the specified source.
        """
        if self.file_path:
            self.data = pd.read_csv(self.file_path)
        elif self.db_connection and self.query:
            self.data = pd.read_sql(self.query, self.db_connection)
        else:
            raise ValueError("Provide either a file_path or a db_connection with a query.")
        print("Data loaded successfully.")
        return self.data

    def apply_filters(self, filters=None):
        """
        Applies user-defined filters to the data.

        :param filters: A dictionary where keys are column names and values are filter conditions (callable functions).
                        Example: {"column_name": lambda x: x > 10}
        """
        if filters:
            for column, condition in filters.items():
                if column in self.data.columns:
                    self.data = self.data[self.data[column].apply(condition)]
                else:
                    raise ValueError(f"Column '{column}' not found in the data.")
        print("Filters applied successfully.")
        return self.data

    def split_data(self, target_column, test_size=0.2, random_state=42):
        """
        Splits the data into train and test sets.

        :param target_column: The name of the target column.
        :param test_size: Proportion of the data to include in the test set.
        :param random_state: Random state for reproducibility.
        :return: X_train, X_test, y_train, y_test
        """
        if target_column not in self.data.columns:
            raise ValueError(f"Target column '{target_column}' not found in the data.")

        X = self.data.drop(columns=[target_column])
        y = self.data[target_column]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )
        print(f"Data split successfully: Train size = {len(X_train)}, Test size = {len(X_test)}")
        return X_train, X_test, y_train, y_test


# Example Usage
if __name__ == "__main__":
    # Initialize the DataIngestion class with a file path.
    data_ingestion = DataIngestion(file_path="data.csv")

    # Load the data.
    data = data_ingestion.load_data()

    # Apply custom filters.
    filters = {
        "age": lambda x: x > 18,        # Keep rows where 'age' > 18
        "salary": lambda x: x < 100000  # Keep rows where 'salary' < 100,000
    }
    filtered_data = data_ingestion.apply_filters(filters=filters)

    # Split data into train and test sets.
    X_train, X_test, y_train, y_test = data_ingestion.split_data(target_column="target")

Code for LSTM

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from statsmodels.tsa.arima.model import ARIMA

# Load data
data = pd.read_csv('monthly_sales.csv', parse_dates=['Month'], index_col='Month')
data.plot(figsize=(10, 5), title='Monthly Sales Data')
plt.show()

# Check trend and seasonality
result = seasonal_decompose(data['Sales'], model='additive', period=12)
result.plot()
plt.show()

# Scale data
scaler = MinMaxScaler(feature_range=(0, 1))
data_scaled = scaler.fit_transform(data['Sales'].values.reshape(-1, 1))

# Create sequences
def create_sequences(data, seq_length):
    sequences, labels = [], []
    for i in range(len(data) - seq_length):
        seq = data[i:i+seq_length]
        label = data[i+seq_length]
        sequences.append(seq)
        labels.append(label)
    return np.array(sequences), np.array(labels)

seq_length = 12 # Use 12 months for prediction
X, y = create_sequences(data_scaled, seq_length)

# Convert to PyTorch tensors
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

# Define Dataset class
class TimeSeriesDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        return self.X[index], self.y[index]

# Split data into train and test sets
train_size = int(len(X_tensor) * 0.8)
X_train, X_test = X_tensor[:train_size], X_tensor[train_size:]
y_train, y_test = y_tensor[:train_size], y_tensor[train_size:]

train_dataset = TimeSeriesDataset(X_train, y_train)
test_dataset = TimeSeriesDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Define LSTM model
class LSTM(nn.Module):
    def __init__(self, input_size=1, hidden_size=50, num_layers=2, output_size=1):
        super(LSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])  # Use only the last time step's output.
        return out

model = LSTM()

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 50
for epoch in range(num_epochs):
    model.train()
    train_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {train_loss/len(train_loader):.4f}')

# Evaluate the model
model.eval()
predictions, actuals = [], []
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        preds = model(X_batch).numpy()
        predictions.extend(preds)
        actuals.extend(y_batch.numpy())

# Reverse scaling
predictions = scaler.inverse_transform(np.array(predictions).reshape(-1, 1))
actuals = scaler.inverse_transform(np.array(actuals).reshape(-1, 1))

# Plot results
plt.figure(figsize=(10, 5))
plt.plot(actuals, label='Actual Sales')
plt.plot(predictions, label='Predicted Sales')
plt.legend()
plt.title('Actual vs Predicted Sales')
plt.show()

# Forecast for next 12 months
last_sequence = X_tensor[-1].unsqueeze(0) # Add batch dimension

future_predictions = []
model.eval()
with torch.no_grad():
    for _ in range(12):  # Forecast for 12 months.
        prediction = model(last_sequence)             # Model output is 2D: [batch_size, output_size]
        future_predictions.append(prediction.item())  # Save prediction.

        # Update last_sequence: keep it 3D (batch_size, seq_length, feature_size).
        prediction = prediction.unsqueeze(2)  # Add feature dimension: [batch_size, 1, 1]
        last_sequence = torch.cat((last_sequence[:, 1:, :], prediction), dim=1)  # Shift and append new prediction.

# Reverse scaling
future_predictions = scaler.inverse_transform(np.array(future_predictions).reshape(-1, 1))

# Plot future forecast
future_dates = pd.date_range(data.index[-1], periods=13, freq='M')[1:]
plt.plot(future_dates, future_predictions, label='Forecasted Sales')
plt.legend()
plt.show()

# Compare with ARIMA
arima_model = ARIMA(data['Sales'], order=(5, 1, 0))
arima_result = arima_model.fit()

# Forecast
arima_forecast = arima_result.forecast(steps=12)
plt.figure(figsize=(10, 5))
plt.plot(future_dates, arima_forecast, label='ARIMA Forecast')
plt.legend()
plt.title('ARIMA vs LSTM Forecast')
plt.show()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# Load dataset
data = pd.read_csv('your_data.csv', parse_dates=['Date'], index_col='Date')

# Separate features (X) and target (y)
X = data[['feature1', 'feature2', 'feature3', 'feature4', 'feature5']].values
y = data['y'].values

# Scale the data
scaler_x = MinMaxScaler()
scaler_y = MinMaxScaler()

X_scaled = scaler_x.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1))

# Create sequences
def create_sequences(X, y, n_steps):
    X_seq, y_seq = [], []
    for i in range(len(X) - n_steps):
        X_seq.append(X[i:i + n_steps])
        y_seq.append(y[i + n_steps])
    return np.array(X_seq), np.array(y_seq)

n_steps = 12
X_seq, y_seq = create_sequences(X_scaled, y_scaled, n_steps)

# Convert to PyTorch tensors
X_tensor = torch.tensor(X_seq, dtype=torch.float32)
y_tensor = torch.tensor(y_seq, dtype=torch.float32)

# Train-test split
train_size = int(0.8 * len(X_tensor))
X_train, X_test = X_tensor[:train_size], X_tensor[train_size:]
y_train, y_test = y_tensor[:train_size], y_tensor[train_size:]

# DataLoader
batch_size = 32
train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=batch_size, shuffle=False)

class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.dropout(out[:, -1, :])  # Get the output of the last time step
        out = self.fc(out)
        return out

# Model parameters
input_size = X_train.shape[2]
hidden_size = 50
num_layers = 2
dropout = 0.2

# Initialize model
model = LSTMModel(input_size, hidden_size, num_layers, dropout)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Use GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

num_epochs = 50
model.train()

for epoch in range(num_epochs):
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        y_pred = model(X_batch)
        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()

    # Print loss every 10 epochs.
    if (epoch + 1) % 10 == 0:
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}')


model.eval()
with torch.no_grad():
    y_pred = model(X_test.to(device)).cpu().numpy()
    y_test_actual = scaler_y.inverse_transform(y_test.numpy())
    y_pred_actual = scaler_y.inverse_transform(y_pred)

# Plot predictions vs actual
plt.figure(figsize=(10, 6))
plt.plot(y_test_actual, label='Actual')
plt.plot(y_pred_actual, label='Predicted')
plt.legend()
plt.title("Test Set Predictions vs Actual")
plt.show()

def monte_carlo_forecast(model, input_seq, n_steps, n_simulations=100):
    model.train()  # Enable dropout during inference (MC dropout).
    input_seq = input_seq.to(device)
    forecast_mean = []
    forecast_lower = []
    forecast_upper = []

    for _ in range(n_steps):
        preds = []
        for _ in range(n_simulations):
            pred = model(input_seq).cpu().item()
            preds.append(pred)

        pred_mean = np.mean(preds)
        pred_std = np.std(preds)

        forecast_mean.append(pred_mean)
        forecast_lower.append(pred_mean - 1.96 * pred_std)  # 95% CI lower bound
        forecast_upper.append(pred_mean + 1.96 * pred_std)  # 95% CI upper bound

        # Update the input sequence with the new prediction.
        # NOTE: this roll-forward assumes a univariate input (feature_size = 1);
        # with exogenous features you would also need their future values.
        new_step = torch.tensor([[[pred_mean]]], dtype=torch.float32, device=device)  # (1, 1, 1)
        input_seq = torch.cat((input_seq[:, 1:, :], new_step), dim=1)

    # Inverse-transform the results back to the original scale.
    forecast_mean = scaler_y.inverse_transform(np.array(forecast_mean).reshape(-1, 1))
    forecast_lower = scaler_y.inverse_transform(np.array(forecast_lower).reshape(-1, 1))
    forecast_upper = scaler_y.inverse_transform(np.array(forecast_upper).reshape(-1, 1))

    return forecast_mean, forecast_lower, forecast_upper

# Forecast for next 12 steps
input_seq = X_tensor[-1].unsqueeze(0)
forecast_mean, forecast_lower, forecast_upper = monte_carlo_forecast(model, input_seq, n_steps=12)

# Plot forecast with confidence intervals
plt.figure(figsize=(10, 6))
plt.plot(range(1, 13), forecast_mean, marker='o', label='Forecast (Mean)', color='blue')
plt.fill_between(range(1, 13), forecast_lower.flatten(), forecast_upper.flatten(), color='blue', alpha=0.2, label='95% Confidence Interval')
plt.legend()
plt.title("12-Step Forecast with Confidence Intervals")
plt.xlabel("Future Steps")
plt.ylabel("Predicted y")
plt.show()
# Track losses
train_losses = []
test_losses = []

num_epochs = 50
model.train()

for epoch in range(num_epochs):
    # Training phase
    model.train()
    train_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        y_pred = model(X_batch)
        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    train_loss /= len(train_loader)
    train_losses.append(train_loss)

    # Testing phase
    model.eval()
    test_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            y_pred = model(X_batch)
            loss = criterion(y_pred, y_batch)
            test_loss += loss.item()

    test_loss /= len(test_loader)
    test_losses.append(test_loss)

    # Print progress
    print(f'Epoch {epoch + 1}/{num_epochs}, Train MSE: {train_loss:.4f}, Test MSE: {test_loss:.4f}')

# Plot training and testing loss
plt.figure(figsize=(10, 6))
plt.plot(range(1, num_epochs + 1), train_losses, label='Train MSE', marker='o')
plt.plot(range(1, num_epochs + 1), test_losses, label='Test MSE', marker='o')
plt.xlabel("Epoch")
plt.ylabel("Mean Squared Error (MSE)")
plt.title("Train and Test MSE Over Epochs")
plt.legend()
plt.grid(True)
plt.show()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import optuna

# Load dataset
data = pd.read_csv('your_data.csv', parse_dates=['Date'], index_col='Date')
y = data['y'].values

# Scale the data
scaler_y = MinMaxScaler()
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1))

# Create sequences
def create_sequences(y, n_steps):
    y_seq, y_target = [], []
    for i in range(len(y) - n_steps):
        y_seq.append(y[i:i + n_steps])
        y_target.append(y[i + n_steps])
    return np.array(y_seq), np.array(y_target)

n_steps = 12
X_seq, y_seq = create_sequences(y_scaled, n_steps)

X_tensor = torch.tensor(X_seq, dtype=torch.float32).unsqueeze(-1) # Add feature dimension
y_tensor = torch.tensor(y_seq, dtype=torch.float32).unsqueeze(-1)

train_size = int(0.8 * len(X_tensor))
X_train, X_test = X_tensor[:train_size], X_tensor[train_size:]
y_train, y_test = y_tensor[:train_size], y_tensor[train_size:]

batch_size = 32
train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=batch_size, shuffle=False)

# Define LSTM model
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.dropout(out[:, -1, :])
        return self.fc(out)

# Hyperparameter tuning with Optuna
def objective(trial):
    hidden_size = trial.suggest_int("hidden_size", 32, 128)
    num_layers = trial.suggest_int("num_layers", 1, 3)
    dropout = trial.suggest_float("dropout", 0.1, 0.5)
    lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)

    model = LSTMModel(input_size=1, hidden_size=hidden_size, num_layers=num_layers, dropout=dropout)
    model.to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(10):  # Fewer epochs for tuning
        model.train()
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            y_pred = model(X_batch)
            loss = criterion(y_pred, y_batch)
            loss.backward()
            optimizer.step()

    model.eval()
    val_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            val_loss += criterion(model(X_batch), y_batch).item()
    return val_loss / len(test_loader)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=30)
best_params = study.best_params

# Train final model
best_model = LSTMModel(
    input_size=1,
    hidden_size=best_params['hidden_size'],
    num_layers=best_params['num_layers'],
    dropout=best_params['dropout']
)
best_model.to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(best_model.parameters(), lr=best_params['lr'])

num_epochs = 50
for epoch in range(num_epochs):
    best_model.train()
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        y_pred = best_model(X_batch)
        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()

# Forecast with confidence intervals
def forecast_with_bootstrap(model, input_seq, n_steps, n_bootstraps=1000):
    model.eval()
    input_seq = input_seq.unsqueeze(0).to(device)  # (1, n_steps, 1)

    forecast_values = []
    for _ in range(n_steps):
        preds = []
        for _ in range(n_bootstraps):
            # Perturb the input with small Gaussian noise to approximate uncertainty.
            noise = torch.normal(0, 0.01, input_seq.shape).to(device)
            input_with_noise = input_seq + noise
            pred = model(input_with_noise).item()
            preds.append(pred)

        pred_mean = np.mean(preds)
        pred_std = np.std(preds)
        forecast_values.append((pred_mean, pred_mean - 1.96 * pred_std, pred_mean + 1.96 * pred_std))

        # Shift the window and append the new prediction.
        input_seq = torch.cat((input_seq[:, 1:, :], torch.tensor([[[pred_mean]]], device=device)), dim=1)

    forecast_values = np.array(forecast_values)  # (n_steps, 3): mean, lower, upper
    # Inverse-transform back to the original scale; the scaler expects a
    # single column, so flatten and reshape.
    forecast_values = scaler_y.inverse_transform(forecast_values.reshape(-1, 1)).reshape(-1, 3)
    return forecast_values

input_seq = X_tensor[-1]  # (n_steps, 1); the function adds the batch dimension.
forecast_results = forecast_with_bootstrap(best_model, input_seq, n_steps=12)

forecast_mean = forecast_results[:, 0]
forecast_lower = forecast_results[:, 1]
forecast_upper = forecast_results[:, 2]

plt.figure(figsize=(10, 6))
plt.plot(range(1, 13), forecast_mean, label="Forecast Mean", color="blue", marker="o")
plt.fill_between(range(1, 13), forecast_lower, forecast_upper, color="blue", alpha=0.2, label="95% Confidence Interval")
plt.xlabel("Future Steps")
plt.ylabel("Predicted y")
plt.title("12-Step Forecast with Confidence Intervals")
plt.legend()
plt.grid()
plt.show()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import optuna

# Load dataset
data = pd.read_csv('your_data.csv', parse_dates=['Date'], index_col='Date')
y = data['y'].values

# Scale the data
scaler_y = MinMaxScaler()
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1))

# Create sequences
def create_sequences(y, n_steps, n_future):
    X, y_target = [], []
    for i in range(len(y) - n_steps - n_future + 1):
        X.append(y[i:i + n_steps])
        y_target.append(y[i + n_steps:i + n_steps + n_future])
    return np.array(X), np.array(y_target)

n_steps = 24 # Sequence length (past values)
n_future = 12 # Number of future predictions
X_seq, y_seq = create_sequences(y_scaled, n_steps, n_future)

X_tensor = torch.tensor(X_seq, dtype=torch.float32).unsqueeze(-1) # Add feature dimension
y_tensor = torch.tensor(y_seq, dtype=torch.float32)

train_size = int(0.8 * len(X_tensor))
X_train, X_test = X_tensor[:train_size], X_tensor[train_size:]
y_train, y_test = y_tensor[:train_size], y_tensor[train_size:]

batch_size = 32
train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=batch_size, shuffle=False)

# Define LSTM model
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout, n_future):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, n_future)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.dropout(out[:, -1, :])  # Use the output of the last time step
        return self.fc(out)

# Hyperparameter tuning with Optuna
def objective(trial):
    hidden_size = trial.suggest_int("hidden_size", 32, 128)
    num_layers = trial.suggest_int("num_layers", 1, 3)
    dropout = trial.suggest_float("dropout", 0.1, 0.5)
    lr = trial.suggest_float("lr", 1e-4, 1e-2, log=True)

    model = LSTMModel(input_size=1, hidden_size=hidden_size, num_layers=num_layers, dropout=dropout, n_future=n_future)
    model.to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(10):  # Fewer epochs for tuning
        model.train()
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            y_pred = model(X_batch)
            loss = criterion(y_pred, y_batch)
            loss.backward()
            optimizer.step()

    model.eval()
    val_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            val_loss += criterion(model(X_batch), y_batch).item()
    return val_loss / len(test_loader)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=30)
best_params = study.best_params

# Train final model
best_model = LSTMModel(
    input_size=1,
    hidden_size=best_params['hidden_size'],
    num_layers=best_params['num_layers'],
    dropout=best_params['dropout'],
    n_future=n_future
)
best_model.to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(best_model.parameters(), lr=best_params['lr'])

num_epochs = 50
train_losses = []
test_losses = []
for epoch in range(num_epochs):
    best_model.train()
    train_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        y_pred = best_model(X_batch)
        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    train_losses.append(train_loss / len(train_loader))

    # Evaluate on test set
    best_model.eval()
    test_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            test_loss += criterion(best_model(X_batch), y_batch).item()
    test_losses.append(test_loss / len(test_loader))

# Plot training and testing loss
plt.figure(figsize=(10, 6))
plt.plot(range(num_epochs), train_losses, label="Train Loss")
plt.plot(range(num_epochs), test_losses, label="Test Loss")
plt.xlabel("Epoch")
plt.ylabel("MSE Loss")
plt.title("Training and Testing Loss Over Epochs")
plt.legend()
plt.grid()
plt.show()

# Forecast with confidence intervals
def forecast_with_bootstrap(model, input_seq, n_bootstraps=1000):
    model.eval()
    input_seq = input_seq.unsqueeze(0).to(device)  # (1, n_steps, 1)
    preds = []
    for _ in range(n_bootstraps):
        # Perturb the input with small Gaussian noise to approximate uncertainty.
        noise = torch.normal(0, 0.01, input_seq.shape).to(device)
        input_with_noise = input_seq + noise
        pred = model(input_with_noise).squeeze(0).cpu().numpy()  # (n_future,)
        preds.append(pred)
    preds = np.array(preds)  # (n_bootstraps, n_future)
    forecast_mean = preds.mean(axis=0)
    forecast_std = preds.std(axis=0)
    lower_bound = forecast_mean - 1.96 * forecast_std
    upper_bound = forecast_mean + 1.96 * forecast_std
    return forecast_mean, lower_bound, upper_bound

input_seq = X_tensor[-1]
forecast_mean, forecast_lower, forecast_upper = forecast_with_bootstrap(best_model, input_seq)

# Rescale predictions
forecast_mean = scaler_y.inverse_transform(forecast_mean.reshape(-1, 1)).flatten()
forecast_lower = scaler_y.inverse_transform(forecast_lower.reshape(-1, 1)).flatten()
forecast_upper = scaler_y.inverse_transform(forecast_upper.reshape(-1, 1)).flatten()

# Plot the forecast
plt.figure(figsize=(10, 6))
plt.plot(range(1, n_future + 1), forecast_mean, label="Forecast Mean", color="blue", marker="o")
plt.fill_between(range(1, n_future + 1), forecast_lower, forecast_upper, color="blue", alpha=0.2, label="95% Confidence Interval")
plt.xlabel("Future Steps")
plt.ylabel("Predicted y")
plt.title("12-Step Forecast with Confidence Intervals")
plt.legend()
plt.grid()
plt.show()

Code for StatsModels

#Import necessary modules
import pandas as pd
import numpy as np
%matplotlib inline
!pip install pmdarima
from pmdarima import auto_arima
from statsmodels.tsa.arima.model import ARIMA, ARIMAResults
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf  # for determining (p,q) orders
import warnings
warnings.filterwarnings("ignore")
#Read the data : assuming the format of data is as follows -
#Month PMPM
#2023-02-01 1.4
#Note that the above date has the day fixed on the first of every month - last field is 01
df = pd.read_csv('path to file', index_col='Month', parse_dates=True)
df.index.freq = 'MS'  #Choose - MS for month start, D for day etc ...

#Plot
import matplotlib.ticker as ticker
formatter = ticker.StrMethodFormatter('{x:,.0f}')
title = 'Monthly PMPM plot'
ylabel='PMPM'
xlabel='' # we don't really need a label here
ax = df['PMPM'].plot(figsize=(12,5),title=title)
ax.autoscale(axis='x',tight=True)
ax.set(xlabel=xlabel, ylabel=ylabel)
ax.yaxis.set_major_formatter(formatter);
#Run ETS decomposition
from statsmodels.tsa.seasonal import seasonal_decompose
result = seasonal_decompose(df['PMPM'], model='additive') # model='mult' -> check
result.plot();

#Automate the augmented Dickey-Fuller test to determine if your dataset is stationary or not
from statsmodels.tsa.stattools import adfuller

def adf_test(series, title=''):
    """Pass in a time series and an optional title, returns an ADF report"""
    print(f'Augmented Dickey-Fuller Test: {title}')
    result = adfuller(series.dropna(), autolag='AIC')  # .dropna() handles differenced data
    labels = ['ADF test statistic', 'p-value', '# lags used', '# observations']
    out = pd.Series(result[0:4], index=labels)
    for key, val in result[4].items():
        out[f'critical value ({key})'] = val
    print(out.to_string())  # .to_string() removes the line "dtype: float64"

    if result[1] <= 0.05:
        print("Strong evidence against the null hypothesis")
        print("Reject the null hypothesis")
        print("Data has no unit root and is stationary")
    else:
        print("Weak evidence against the null hypothesis")
        print("Fail to reject the null hypothesis")
        print("Data has a unit root and is non-stationary")
#Optional - if you want to know more about the auto_arima module
help(auto_arima)
#Fit the model
model_fit = auto_arima(df['PMPM'], seasonal=False, trace=True)
#seasonal=False - we do SARIMA later
model_fit.summary()

#If the fit summary returns the order of the best model as (p,1,q) -> d=1 -> differencing
#Check
from statsmodels.tsa.statespace.tools import diff
df['d1'] = diff(df['PMPM'],k_diff=1)
adf_test(df['d1'],'Monthly PMPM')
#After one differencing you should find the series is stationary, confirming d=1
#Model Fit
model_fit = auto_arima(df['PMPM'], start_p=0, start_q=0,
                       max_p=2, max_q=2, m=12,
                       seasonal=False,
                       d=None, trace=True,
                       error_action='ignore',   # we don't want to know if an order does not work
                       suppress_warnings=True,  # we don't want convergence warnings
                       stepwise=True)           # set to stepwise

model_fit.summary()
#We are specifying seasonal=False because we are dealing with an ARIMA model here
#train test split : hold out the last 12 months as the test set
train = df.iloc[:len(df) - 12]
test = df.iloc[len(df) - 12:]
#fit the model
model = ARIMA(train['PMPM'], order=())  #p,d,q as obtained above from auto_arima
results = model.fit()
results.summary()
#Obtain predicted values
start = len(train)
end = len(train) + len(test) - 1
predictions = results.predict(start=start, end=end, dynamic=False, typ='levels').rename('ARIMA(p,d,q) Predictions')
#if d != 0 i.e. we have done differencing, typ='levels' returns predictions of the original endogenous variable
#Default is 'linear', which would return predictions from the differenced series
# Plot predictions against known values
title = 'PMPM'
ylabel = 'PMPM'
xlabel = ''  # we don't really need a label here

ax = test['PMPM'].plot(legend=True, figsize=(12, 6), title=title)
predictions.plot(legend=True)
ax.autoscale(axis='x', tight=True)
ax.set(xlabel=xlabel, ylabel=ylabel)
ax.yaxis.set_major_formatter(formatter);
#Evaluate
from sklearn.metrics import mean_squared_error
error = mean_squared_error(test['PMPM'], predictions)
print(f'ARIMA(1,1,1) MSE Error: {error:11.10}')
from statsmodels.tools.eval_measures import rmse
error = rmse(test['PMPM'], predictions)
print(f'ARIMA(1,1,1) RMSE Error: {error:11.10}')
#Retrain the model on the entire dataset : replace p,d,q here
model = ARIMA(df['PMPM'], order=(1,1,1))
results = model.fit()
fcast = results.predict(len(df), len(df)+11, typ='levels').rename('ARIMA(p,d,q) Forecast')

#SARIMA
#When there is seasonality
#Classic ARIMA accepts the parameters - p,d,q
#SARIMA accepts an additional seasonal set - P,D,Q
# Load specific forecasting tools
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
from statsmodels.tsa.seasonal import seasonal_decompose
# Load dataset
df = pd.read_csv('data.csv')
title = 'Monthly PMPM'
ylabel='PMPM'
xlabel='' # we don't really need a label here
ax = df['PMPM'].plot(figsize=(12,6),title=title)
ax.autoscale(axis='x',tight=True)
ax.set(xlabel=xlabel, ylabel=ylabel);

#Recall the first step is to check for seasonality using ETS decomposition, the code has already been presented earlier
# For SARIMA Orders we set seasonal=True and pass in an m value
auto_arima(df['PMPM'], seasonal=True, m=12).summary()
#Check the model details from the summary
#Do the train test split as earlier … code provided earlier
#Fit the model
model = SARIMAX(train['PMPM'], order=(0,1,3), seasonal_order=(1,0,1,12))
results = model.fit()
results.summary()
#Obtain the predicted values – change order in the rename part as needed
start=len(train)
end=len(train)+len(test)-1
predictions = results.predict(start=start, end=end, dynamic=False, typ='levels').rename('SARIMA(0,1,3)(1,0,1,12) Predictions')
#Next Steps
#Plot predictions against known values – code as in earlier
#Evaluate the model on the test set – RMSE and MSE – code as in earlier
#Retrain the model on the full data and forecast into the future – replace orders as found in the fit
model = SARIMAX(df['PMPM'],order=(0,1,3),seasonal_order=(1,0,1,12))
results = model.fit()
fcast = results.predict(len(df),len(df)+11,typ='levels').rename('SARIMA(0,1,3)(1,0,1,12) Forecast')
# Plot predictions against known values
title = 'Monthly PMPM'
ylabel='PMPM'
xlabel=' '
ax = df['PMPM'].plot(legend=True,figsize=(12,6),title=title)
fcast.plot(legend=True)
ax.autoscale(axis='x',tight=True)
ax.set(xlabel=xlabel, ylabel=ylabel);
#SARIMAX
#Load the dataset as before
#The dataset should have an exogenous column, e.g. named "X"
#Plot the source data
title=' PMPM Data'
ylabel='PMPM'
xlabel='' # we don't really need a label here

ax = df['PMPM'].plot(figsize=(16,5),title=title)
ax.autoscale(axis='x',tight=True)
ax.set(xlabel=xlabel, ylabel=ylabel);
#Run ETS decomposition
result = seasonal_decompose(df['PMPM'])
result.plot();
#Test for stationarity – use the function for the adf_test as provided earlier
# For SARIMA Orders we set seasonal=True and pass in an m value
auto_arima(df['PMPM'],seasonal=True,m=7).summary() #Get the model details, set m to be the seasonality eg. 7 for weekly
#Train Test Split as earlier …
model = SARIMAX(train['PMPM'],order=(1,0,0),seasonal_order=(2,0,0,7),enforce_invertibility=False)
results = model.fit()
results.summary()
#Obtain predicted values and plot – insert correct orders
start=len(train)
end=len(train)+len(test)-1
predictions = results.predict(start=start, end=end, dynamic=False).rename('SARIMA(1,0,0)(2,0,0,7) Predictions')
# Plot predictions against known values
title='PMPM'
ylabel='PMPM'
xlabel=''
ax = test['PMPM'].plot(legend=True,figsize=(12,6),title=title)
predictions.plot(legend=True)
ax.autoscale(axis='x',tight=True)
ax.set(xlabel=xlabel, ylabel=ylabel)
#Evaluate the model – RMSE, MSE – code provided earlier
#Now add the exogenous variable
model = SARIMAX(train['PMPM'], exog=train['exog variable'], order=(1,0,0), seasonal_order=(2,0,0,7), enforce_invertibility=False)
results = model.fit()
results.summary()
Now follow the steps (code for these steps has been provided earlier):
  • Obtain predicted values for the test set
  • Plot predictions against known values of the test set
  • Evaluate the model
  • Retrain the model on the entire dataset and forecast for the future (a hedged sketch of this step with an exogenous variable follows this list)
  • Plot
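To make the exogenous forecast step concrete, here is a minimal sketch of retraining on the full dataset and forecasting 12 steps ahead. It assumes the exogenous column is named 'exog variable' as above, and the future exogenous values are naively held at the last observed value purely for illustration; real forecasts need genuine future values.

# Minimal sketch: retrain SARIMAX with an exogenous regressor on the full
# dataset and forecast 12 steps ahead. 'exog variable' and the naive future
# exog values are assumptions for illustration only.
model = SARIMAX(df['PMPM'], exog=df[['exog variable']],
                order=(1, 0, 0), seasonal_order=(2, 0, 0, 7),
                enforce_invertibility=False)
results = model.fit()

future_exog = pd.DataFrame({'exog variable': [df['exog variable'].iloc[-1]] * 12})
fcast = results.forecast(steps=12, exog=future_exog).rename('SARIMAX Forecast')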



ChatGPT code for time series

import numpy as np
import pandas as pd
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.metrics import mean_squared_error
import itertools
import warnings
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

warnings.filterwarnings("ignore")


class TimeSeriesForecaster:
    def __init__(self, data, column, frequency):
        """
        Initialize the TimeSeriesForecaster class.

        :param data: Input time series data (Pandas DataFrame).
        :param column: The column containing the time series data.
        :param frequency: The frequency of the time series (e.g., 'M', 'D').
        """
        self.data = data
        self.column = column
        self.frequency = frequency
        self.ts = data[column]
        self.ts.index = pd.to_datetime(data.index)
        self.ts = self.ts.asfreq(frequency)

    def test_stationarity(self):
        """
        Perform the Augmented Dickey-Fuller test for stationarity.
        """
        result = adfuller(self.ts.dropna())
        print("ADF Statistic:", result[0])
        print("p-value:", result[1])
        for key, value in result[4].items():
            print(f"Critical Value ({key}): {value}")
        if result[1] < 0.05:
            print("The series is stationary.")
        else:
            print("The series is not stationary.")

    def plot_acf_pacf(self, lags=20):
        """
        Plot the ACF and PACF of the series.
        """
        plt.figure(figsize=(12, 6))
        plt.subplot(121)
        plot_acf(self.ts.dropna(), lags=lags, ax=plt.gca())
        plt.subplot(122)
        plot_pacf(self.ts.dropna(), lags=lags, ax=plt.gca())
        plt.show()

    def grid_search_arima(self, p_range, d_range, q_range):
        """
        Perform grid search for ARIMA hyperparameters.
        """
        best_aic = float("inf")
        best_order = None
        best_model = None
        pdq = list(itertools.product(p_range, d_range, q_range))

        for order in pdq:
            try:
                model = ARIMA(self.ts, order=order)
                model_fit = model.fit()
                if model_fit.aic < best_aic:
                    best_aic = model_fit.aic
                    best_order = order
                    best_model = model_fit
            except Exception:
                # Skip parameter combinations that fail to converge.
                continue

        print(f"Best ARIMA Order: {best_order}, AIC: {best_aic}")
        return best_model

    def grid_search_sarima(self, p_range, d_range, q_range, P_range, D_range, Q_range, m):
        """
        Perform grid search for SARIMA hyperparameters.
        """
        best_aic = float("inf")
        best_order = None
        best_seasonal_order = None
        best_model = None
        pdq = list(itertools.product(p_range, d_range, q_range))
        seasonal_pdq = list(itertools.product(P_range, D_range, Q_range))

        for order in pdq:
            for seasonal_order in seasonal_pdq:
                try:
                    model = SARIMAX(
                        self.ts,
                        order=order,
                        seasonal_order=(seasonal_order[0], seasonal_order[1], seasonal_order[2], m),
                        enforce_stationarity=False,
                        enforce_invertibility=False,
                    )
                    model_fit = model.fit(disp=False)
                    if model_fit.aic < best_aic:
                        best_aic = model_fit.aic
                        best_order = order
                        best_seasonal_order = seasonal_order
                        best_model = model_fit
                except Exception:
                    # Skip parameter combinations that fail to converge.
                    continue

        print(f"Best SARIMA Order: {best_order}, Seasonal Order: {best_seasonal_order}, AIC: {best_aic}")
        return best_model

    def fit_holt_winters(self, seasonal="add", seasonal_periods=12):
        """
        Fit a Holt-Winters model and forecast.
        """
        model = ExponentialSmoothing(
            self.ts, seasonal=seasonal, seasonal_periods=seasonal_periods
        )
        model_fit = model.fit()
        return model_fit

    def fit_sarimax(self, exog, order, seasonal_order, m):
        """
        Fit a SARIMAX model with exogenous variables.
        """
        model = SARIMAX(
            self.ts,
            exog=exog,
            order=order,
            seasonal_order=(seasonal_order[0], seasonal_order[1], seasonal_order[2], m),
            enforce_stationarity=False,
            enforce_invertibility=False,
        )
        model_fit = model.fit(disp=False)
        return model_fit

    def evaluate_model(self, actual, predicted):
        """
        Evaluate the model using RMSE.
        """
        rmse = np.sqrt(mean_squared_error(actual, predicted))
        print(f"RMSE: {rmse}")
        return rmse

    def run_full_analysis(self, params, steps=12, exog=None, seasonal_periods=12, m=12):
        """
        Run end-to-end time series analysis and forecasting.
        """
        print("Testing for Stationarity:")
        self.test_stationarity()

        print("\nPlotting ACF and PACF:")
        self.plot_acf_pacf()

        print("\nFitting ARIMA Model:")
        arima_model = self.grid_search_arima(
            p_range=params["p"], d_range=params["d"], q_range=params["q"]
        )
        arima_forecast = arima_model.forecast(steps=steps)
        print(f"ARIMA Forecast: {arima_forecast}")

        print("\nFitting SARIMA Model:")
        sarima_model = self.grid_search_sarima(
            p_range=params["p"],
            d_range=params["d"],
            q_range=params["q"],
            P_range=params["P"],
            D_range=params["D"],
            Q_range=params["Q"],
            m=m,
        )
        sarima_forecast = sarima_model.forecast(steps=steps)
        print(f"SARIMA Forecast: {sarima_forecast}")

        print("\nFitting Holt-Winters Model:")
        hw_model = self.fit_holt_winters(seasonal_periods=seasonal_periods)
        hw_forecast = hw_model.forecast(steps)
        print(f"Holt-Winters Forecast: {hw_forecast}")

        if exog is not None:
            print("\nFitting SARIMAX Model:")
            sarimax_model = self.fit_sarimax(
                exog=exog,
                order=(params["p"][0], params["d"][0], params["q"][0]),
                seasonal_order=(params["P"][0], params["D"][0], params["Q"][0], m),
                m=m,
            )
            # NOTE: reusing the last `steps` rows of the historical exog as
            # "future" exog is a simplification; real forecasts need genuine
            # future exogenous values.
            sarimax_forecast = sarimax_model.forecast(steps=steps, exog=exog[-steps:])
            print(f"SARIMAX Forecast: {sarimax_forecast}")
            return arima_forecast, sarima_forecast, hw_forecast, sarimax_forecast

        return arima_forecast, sarima_forecast, hw_forecast

Write Up for Confluence —

Development of a New Classification Pipeline

This document outlines the objectives, architecture, and detailed plan for the development of a robust classification pipeline. The aim is to ensure a seamless process for model training, scoring, and retraining while maintaining optimal performance metrics.

Objective

The primary objective of this pipeline is to create a reliable and efficient framework to manage the lifecycle of classification models. The process involves:

  1. Model Training:
  • Initial training of classification models using a defined dataset.
  • Ensuring robust training performance based on key metrics such as F1-score, accuracy, precision, and recall.
  2. Model Scoring:
  • Regular scoring of the trained models on incoming data.
  • Comparing scoring performance metrics with the original training performance to assess model degradation over time.
  3. Model Re-Training:
  • If the scoring performance falls below a defined threshold compared to the training metrics, a retraining process will be triggered (a minimal sketch of this check follows the list).
  • The retraining ensures the model adapts to evolving data patterns and maintains high performance.
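As a minimal sketch of the retraining trigger described in point 3 above (the function name, metric choice, and threshold are illustrative, not the pipeline's actual API):

# Minimal sketch of the retraining trigger: compare scoring F1 against
# training F1 and flag retraining when the relative drop exceeds a threshold.
# Names (needs_retraining, max_relative_drop) are illustrative only.
def needs_retraining(training_f1: float, scoring_f1: float,
                     max_relative_drop: float = 0.10) -> bool:
    """Return True when scoring F1 has dropped more than max_relative_drop
    relative to the F1 recorded at training time."""
    if training_f1 <= 0:
        return True  # No usable baseline; retrain defensively.
    relative_drop = (training_f1 - scoring_f1) / training_f1
    return relative_drop > max_relative_drop

# Example: trained at F1 = 0.85, scored at F1 = 0.75 -> ~11.8% drop -> retrain.
print(needs_retraining(0.85, 0.75))  # True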

Architecture Overview

The architecture of the pipeline is designed in two development phases to ensure a structured and modular approach.

Phase 1: Core Module Development

In this phase, the foundational components of the pipeline will be developed:

  1. Data Ingestion:
  • Implementation of a robust mechanism to fetch and preprocess data for model training and scoring.
  • Handling diverse data sources and formats with proper validation and error handling.
  2. Model Training:
  • Development of a training module capable of handling classification tasks with configurable hyperparameters.
  • Logging of key training metrics and model artifacts for future reference.
  3. Model Scoring:
  • A scoring mechanism that evaluates the trained model on new data.
  • Generation of performance metrics, enabling real-time insights into model effectiveness.
  4. Logging and Monitoring:
  • Creation of a log table to record the summary of each iteration, including key metrics, timestamps, and model versions.
  • Ensuring traceability of all model-related activities.

Phase 2: Advanced Capabilities

Building upon the core module, this phase will add advanced features to complete the pipeline:

  1. Model Monitoring:
  • Implementation of continuous monitoring to track model performance over time.
  • Integration of alerts and notifications in case of performance degradation.
  2. Performance Analysis:
  • Analytical tools to compare model metrics from training and scoring phases.
  • Dashboards to visualize trends and identify patterns in model performance.
  3. Model Re-Training:
  • Automating the retraining process triggered by predefined performance thresholds.
  • Incorporating mechanisms to update and deploy the retrained model seamlessly.
  4. Process Automation:
  • Full automation of the end-to-end pipeline, reducing manual intervention.
  • Enhancing efficiency and scalability for handling large datasets and frequent updates.

Implementation Strategy

  • Technology Stack: The pipeline will leverage scalable and robust technologies for data processing, model building, and monitoring (e.g., Python, PySpark, TensorFlow/PyTorch, Airflow for orchestration).
  • Version Control: Versioning will be applied to datasets, models, and pipeline code to ensure reproducibility and traceability.
  • Testing and Validation: Comprehensive testing at each stage to validate the functionality and reliability of components.

Expected Outcomes

  1. Efficiency: A streamlined process for managing classification models from training to retraining.
  2. Scalability: Capability to handle increasing data volumes and model complexities.
  3. Adaptability: Ensuring models remain relevant and accurate through periodic retraining.
  4. Traceability: Transparent logging and monitoring for enhanced accountability and insights.

By implementing this pipeline, we aim to establish a robust foundation for classification tasks, ensuring consistent and high-performing models over time.

— — — — — — — —

Process Flow for Onboarding and Managing Clients/Models

New Client/Model (e.g., Admission Risk, Diabetes Risk, High Cost Risk, etc.)

When onboarding a new entity to the pipeline, the following process flow will be executed:

  1. Trigger Model Training Pipeline:
  • The process begins by initiating the model training pipeline.
  • Refer to the Model Training Module for detailed training steps.
  2. Log Model Training Details:
  • Upon successful training, an entry is created in the log table.
  • The entry includes information on model artifacts, performance metrics, and other relevant details.
  3. Iteration Check:
  • The program control moves to the next iteration.
  • If subsequent datasets are available, the data scoring pipeline is triggered.
  • If no new dataset is available, the process ends.
  4. Data Scoring Pipeline Execution:
  • The scoring pipeline evaluates the model's performance on the new dataset.
  • Once scoring is complete, the scoring performance is compared against the threshold.
  • Above threshold: an entry for data scoring is added to the log table, and the program moves to the next iteration.
  • Below threshold: the model training pipeline is triggered to retrain the model on the current dataset; the log table is updated to set the newly trained model as active and mark the previous model as inactive.
  5. Repeat Process:
  • The above steps are repeated for each available dataset in the input table for the upcoming months.
  6. End Process:
  • When the pipeline processes the final month's dataset, program control exits, and the process ends.
A minimal driver-loop sketch of this flow is shown after this list.
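The sketch below condenses the six steps into a driver loop. train_model, score_model, and log_entry are stand-ins for the pipeline's real modules, stubbed here so the sketch runs; needs_retraining is the threshold check sketched earlier. None of these names are the pipeline's actual API.

# Illustrative driver loop for the onboarding flow above; all helpers are
# stubs standing in for the real training/scoring/logging modules.
def train_model(dataset):
    return "model", 0.85          # (model artifact, training F1) - stub

def score_model(model, dataset):
    return 0.80                   # scoring F1 - stub

def log_entry(model, f1, active):
    return {"model": model, "f1": f1, "active": active}

def run_pipeline(monthly_datasets):
    log_table = []
    model, train_f1 = train_model(monthly_datasets[0])          # Steps 1-2
    log_table.append(log_entry(model, train_f1, active=True))

    for dataset in monthly_datasets[1:]:                        # Steps 3-5
        score_f1 = score_model(model, dataset)                  # Step 4
        if needs_retraining(train_f1, score_f1):
            log_table[-1]["active"] = False                     # Retire old model.
            model, train_f1 = train_model(dataset)              # Retrain on current data.
            log_table.append(log_entry(model, train_f1, active=True))
        else:
            log_table.append(log_entry(model, score_f1, active=True))
    return log_table                                            # Step 6: loop exits.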

Existing Client/Model

For clients or models already present in the log table, the following process is followed:

  1. Skip Initial Model Training:
  • The initial model training step is bypassed.
  2. Direct to Data Scoring Pipeline:
  • The program control moves directly to the data scoring step.
  3. Follow Remaining Steps:
  • The process continues as outlined in the Data Scoring Pipeline Execution and Repeat Process steps for new clients/models.

This structured flow ensures seamless onboarding, performance monitoring, and retraining of models to maintain high performance across all iterations.

— — — — — — — — —

Illustration: Onboarding a New Model — “Admission_Risk” for AIA_MY

Overview

For the onboarding of the new model “Admission_Risk,” the input data is provided as a feature-combined DataFrame, containing 54 months of combined data. Below is the step-by-step process:

Step 1: Data Ingestion

Step 1.1: Input Data Preprocessing and Ingestion

  • The model ingests the input data through the pipeline.
  • Refer to the shared screenshot (SS) for an example of how the input data is preprocessed and prepared for further pipeline processing.

Step 1.2: Condition Check for New Model Onboarding

  • Since “Admission_Risk” is a new model, the pipeline identifies it as a new onboarding case.
  • The program control moves to the Model Training phase.

Step 2: Model Training

Step 2.1: Model Training in Progress

  • The training process begins, using the preprocessed data to train the “Admission_Risk” model.

Step 2.2: Log Table Entry for Model Training

  • Once the model training is successfully completed:
  • A new entry is created in the log table with details such as model artifacts and training performance metrics.
  • Refer to the attached screenshot for an example of the log table entry.

Step 3: Iterative Process and Model Scoring

Step 3.1: Next Iteration and Model Scoring

  • The program control moves to the next iteration, checking for the availability of the next dataset.
  • Since additional data is available, the pipeline triggers the Model Scoring process.
  • Refer to the attached screenshot for an illustration of this step.

Step 3.2: Performance Check and Logging

  • During scoring, the model’s performance is checked against predefined thresholds:
  • Condition Met: As the performance meets or exceeds the thresholds, a scoring entry is logged in the log table.
  • The program control proceeds to the next iteration.

This step-by-step illustration highlights the pipeline’s ability to manage new model onboarding, training, scoring, and iterative processing seamlessly. The attached screenshots further clarify the flow and expected outputs at each stage.

— — — — — — — -

Log table

Log Table: Overview and Optimal Format

Purpose of the Log Table

The log table serves as the central repository to store metadata and performance metrics for the classification pipeline. It ensures transparency, traceability, and facilitates monitoring of the entire pipeline. Key functionalities include:

  • Recording details of trained models and scoring outputs.
  • Maintaining a history of model performance and data scoring.
  • Tracking the relationship between models, datasets, and use cases.
  • Providing the current status of data processing and associated activities.

Proposed Log Table Structure

Below is the optimal format for the log table, with suggested columns and their descriptions:

  • Log_ID: Unique identifier for each log entry.
  • Market: The market associated with the data (e.g., AIA_MY, AIA_SG).
  • Use_Case: The specific use case being handled (e.g., Admission Risk, Diabetes Risk).
  • Data_Timestamp: Timestamp of the dataset used in the process.
  • Model_Version: Version of the model used for training or scoring.
  • Model_Artifact_Path: File path or location of the trained model artifacts.
  • Training_Performance: Key training performance metrics (e.g., F1-score, accuracy).
  • Scoring_Data_Path: File path or location of the scoring dataset.
  • Scoring_Performance: Performance metrics on the scoring dataset (e.g., F1-score, accuracy).
  • Performance_Status: Indicator if scoring performance met the threshold (e.g., "Pass" or "Fail").
  • Scoring_Status: Current status of scoring (e.g., "Completed," "Pending," "Failed").
  • Retraining_Flag: Indicator if retraining was triggered for the model (e.g., Yes/No).
  • Active_Model_Status: Indicates whether the model is currently active (e.g., "Active" or "Inactive").
  • Iteration_Count: The iteration number in the process (helps track multiple runs).
  • Log_Timestamp: Timestamp when the log entry was created or updated.
  • Remarks: Any additional notes or observations related to the entry.
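For reference, here is a minimal sketch of this row structure as a Python dataclass. It is a sketch only: the production table would live in a database, and the field types and defaults are assumptions.

from dataclasses import dataclass
from typing import Optional

# Minimal sketch of one log-table row; types and defaults are assumptions.
@dataclass
class LogEntry:
    log_id: str
    market: str                     # e.g., "AIA_MY"
    use_case: str                   # e.g., "Admission Risk"
    data_timestamp: str
    model_version: str
    model_artifact_path: str
    training_performance: str       # e.g., "F1: 0.85, Acc: 0.92"
    scoring_data_path: Optional[str] = None
    scoring_performance: Optional[str] = None
    performance_status: Optional[str] = None  # "Pass" / "Fail"
    scoring_status: str = "Pending"            # "Completed" / "Pending" / "Failed"
    retraining_flag: str = "No"
    active_model_status: str = "Active"
    iteration_count: int = 1
    log_timestamp: Optional[str] = None
    remarks: str = ""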

Populating the Log Table

Log Table Workflow

  1. Model Training Phase:
  • Upon completing model training:
  • Insert an entry with details such as the model artifact path, training performance, and active model status.
  • Mark the Scoring_Status as "Pending."
  2. Model Scoring Phase:
  • After scoring the data:
  • Update the Scoring_Data_Path and Scoring_Performance columns.
  • Set Scoring_Status as "Completed."
  • Evaluate performance metrics:
  • If performance passes thresholds, mark Performance_Status as "Pass."
  • If performance fails thresholds, set Performance_Status as "Fail" and flag the Retraining_Flag as "Yes."
  3. Retraining Phase (if applicable):
  • For retraining cases:
  • Insert a new log entry for the retrained model.
  • Update the Active_Model_Status for the older model to "Inactive."
  • Log the performance details of the retrained model and mark it as "Active."
A short sketch of these updates using the LogEntry dataclass above follows this list.
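The following sketch walks the training and scoring phases using the LogEntry dataclass above; in production these would be SQL inserts and updates against the log table, and the paths and metric strings are taken from the example entries below.

# Illustrative sketch of the workflow using the LogEntry dataclass above.
entry = LogEntry(
    log_id="001", market="AIA_MY", use_case="Admission Risk",
    data_timestamp="2025-01-01", model_version="v1.0",
    model_artifact_path="/models/admission/v1.0",
    training_performance="F1: 0.85, Acc: 0.92",
)  # Training phase: scoring_status defaults to "Pending".

# Scoring phase: record the scoring run and evaluate against the threshold.
entry.scoring_data_path = "/data/scoring/01-2025"
entry.scoring_performance = "F1: 0.83, Acc: 0.90"
entry.scoring_status = "Completed"
entry.performance_status = "Pass"  # A "Fail" here would also set retraining_flag = "Yes".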

Example of a Populated Log Table

  • Log_ID 001: Market AIA_MY; Use_Case Admission Risk; Data_Timestamp 2025-01-01; Model_Version v1.0; Model_Artifact_Path /models/admission/v1.0; Training_Performance F1: 0.85, Acc: 0.92; Scoring_Data_Path /data/scoring/01-2025; Scoring_Performance F1: 0.83, Acc: 0.90; Performance_Status Pass; Scoring_Status Completed; Retraining_Flag No; Active_Model_Status Active; Iteration_Count 1; Log_Timestamp 2025-01-02; Remarks: Initial training entry.
  • Log_ID 002: Market AIA_MY; Use_Case Admission Risk; Data_Timestamp 2025-02-01; Model_Version v1.0; Model_Artifact_Path /models/admission/v1.0; Training_Performance F1: 0.85, Acc: 0.92; Scoring_Data_Path /data/scoring/02-2025; Scoring_Performance F1: 0.75, Acc: 0.82; Performance_Status Fail; Scoring_Status Completed; Retraining_Flag Yes; Active_Model_Status Active; Iteration_Count 2; Log_Timestamp 2025-02-02; Remarks: Retraining triggered.

Conclusion

The log table is a critical component of the pipeline, providing comprehensive tracking and visibility into the lifecycle of models and data scoring. By maintaining detailed and structured entries, the pipeline ensures efficient monitoring, debugging, and decision-making.
