import numpy as np import pandas as pd import matplotlib.pyplot as plt import os import json from datetime import datetime from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score from sklearn.model_selection import GridSearchCV import xgboost as xgb import joblib import scipy.sparse from statsmodels.tsa.arima.model import ARIMA from tensorflow.keras.models import Sequential from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization, InputLayer, Bidirectional from tensorflow.keras.callbacks import EarlyStopping from tensorflow.keras.optimizers import Adam class ARIMAPredictor: """Custom ARIMA predictor that can be properly pickled""" def __init__(self, arima_model, base_prediction): self.arima_model = arima_model self.base_prediction = base_prediction def predict(self, X_new): # For new predictions, use the base prediction from ARIMA # You could add feature-based adjustments here return self.base_prediction def forecast(self, steps=1, exog=None): # Return the base prediction for any forecast request return np.array([self.base_prediction] * steps) class SimpleFallback: """Simple fallback predictor for when ARIMA fails""" def __init__(self, prediction): self.prediction = prediction def forecast(self, steps=1, exog=None): return np.array([self.prediction] * steps) def predict(self, X): return self.prediction class ModelTrainer: """ ModelTrainer without MLflow dependency - saves models directly to files """ def __init__(self, model_dir="models"): self.model_dir = model_dir self.best_model = None self.best_model_name = None self.best_score = float('inf') # Create models directory os.makedirs(self.model_dir, exist_ok=True) # Create metrics tracking self.metrics_log = [] def calculate_mape(self, y_true, y_pred): """Calculate Mean Absolute Percentage Error (MAPE)""" mask = y_true != 0 return 100 * np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) def evaluate_model(self, y_test, y_pred): """Calculate and return common regression metrics""" mse = mean_squared_error(y_test, y_pred) rmse = np.sqrt(mse) mae = mean_absolute_error(y_test, y_pred) mape = self.calculate_mape(y_test, y_pred) r2 = r2_score(y_test, y_pred) return { "mse": mse, "rmse": rmse, "mae": mae, "mape": mape, "r2": r2 } def save_metrics(self, model_type, metrics, params, feature_count): """Save metrics to a JSON file""" metrics_entry = { "model_type": model_type, "timestamp": datetime.now().isoformat(), "params": params, "feature_count": feature_count, "metrics": metrics } self.metrics_log.append(metrics_entry) # Save to file metrics_file = os.path.join(self.model_dir, "metrics_log.json") with open(metrics_file, 'w') as f: json.dump(self.metrics_log, f, indent=2) def train_xgboost(self, processed_data, params=None, tune_hyperparams=False): """ Train XGBoost using non-temporal features Args: processed_data: Dictionary containing 'xgboost' and 'temporal' data """ # Use XGBoost-specific data (without temporal features) X_train, X_test, y_train, y_test = processed_data['xgboost'] print("Training XGBoost model...") if params is None: params = { 'objective': 'reg:squarederror', 'learning_rate': 0.1, 'max_depth': 6, 'min_child_weight': 1, 'subsample': 0.8, 'colsample_bytree': 0.8, 'n_estimators': 200, 'random_state': 42 } print(f"Feature count: {X_train.shape[1]} (temporal features excluded)") if tune_hyperparams: param_grid = { 'max_depth': [4, 6, 8], 'learning_rate': [0.05, 0.1, 0.15], 'n_estimators': [200, 300, 400], 'subsample': [0.7, 0.8, 0.9], 'colsample_bytree': [0.7, 0.8, 0.9] } xgb_model = xgb.XGBRegressor(objective='reg:squarederror') grid_search = GridSearchCV( estimator=xgb_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error' ) grid_search.fit(X_train, y_train) params.update(grid_search.best_params_) print(f"Best parameters from grid search: {grid_search.best_params_}") # Train model model = xgb.XGBRegressor(**params) model.fit(X_train, y_train) # Make predictions and evaluate y_pred = model.predict(X_test) metrics = self.evaluate_model(y_test, y_pred) # Print metrics print(f"XGBoost (No Temporal) RMSE: {metrics['rmse']:.2f}") print(f"XGBoost (No Temporal) MAE: {metrics['mae']:.2f}") print(f"XGBoost (No Temporal) MAPE: {metrics['mape']:.2f}%") print(f"XGBoost (No Temporal) R²: {metrics['r2']:.2f}") # Save metrics self.save_metrics("xgboost", metrics, params, X_train.shape[1]) # Plot feature importance feature_importance = model.feature_importances_ sorted_idx = np.argsort(feature_importance)[::-1] plt.figure(figsize=(12, 8)) # Show top 20 features top_n = min(20, len(feature_importance)) plt.bar(range(top_n), feature_importance[sorted_idx][:top_n]) plt.xticks(range(top_n), [f'Feature {sorted_idx[i]}' for i in range(top_n)], rotation=45) plt.xlabel('Feature Index') plt.ylabel('Importance') plt.title('XGBoost Feature Importance (Top 20, No Temporal Features)') plt.tight_layout() importance_fig_path = os.path.join(self.model_dir, "xgboost_feature_importance.png") plt.savefig(importance_fig_path) plt.close() # Save model model_path = os.path.join(self.model_dir, "xgboost_model.json") model.save_model(model_path) # Save model info model_info = { "model_type": "xgboost", "params": params, "metrics": metrics, "model_path": model_path, "feature_count": X_train.shape[1], "created_at": datetime.now().isoformat() } info_path = os.path.join(self.model_dir, "xgboost_info.json") with open(info_path, 'w') as f: json.dump(model_info, f, indent=2) # Update best model if this one is better if metrics['rmse'] < self.best_score: self.best_score = metrics['rmse'] self.best_model = model self.best_model_name = "xgboost" return model def train_lstm(self, processed_data, params=None): """ Train LSTM using temporal features with GPU acceleration Args: processed_data: Dictionary containing 'xgboost' and 'temporal' data """ import tensorflow as tf # Check for GPU availability gpus = tf.config.list_physical_devices('GPU') if gpus: print(f"Training on GPU: {len(gpus)} GPU(s) available") # Memory growth needs to be set before GPUs have been initialized for gpu in gpus: try: tf.config.experimental.set_memory_growth(gpu, True) print(f"Memory growth enabled for {gpu}") except RuntimeError as e: print(f"Memory growth setting failed: {e}") else: print("No GPU found. Training on CPU instead.") # Enable mixed precision (if supported by GPU) try: policy = tf.keras.mixed_precision.Policy('mixed_float16') tf.keras.mixed_precision.set_global_policy(policy) print("Mixed precision training enabled") except Exception as e: print(f"Could not enable mixed precision: {e}") # Use temporal data (with temporal features) X_train, X_test, y_train, y_test = processed_data['temporal'] print("Training LSTM model...") if params is None: params = { 'lstm_units': 512, 'dense_units': 256, 'dropout_rate': 0.3, 'learning_rate': 0.001, 'batch_size': 512, 'epochs': 30 } print(f"Feature count: {X_train.shape[1]} (temporal features included)") # Convert sparse matrices to dense arrays if scipy.sparse.issparse(X_train): print("Converting sparse training data to dense array...") X_train = X_train.toarray() if scipy.sparse.issparse(X_test): print("Converting sparse test data to dense array...") X_test = X_test.toarray() # Handle NaN values X_train = np.nan_to_num(X_train) y_train = np.nan_to_num(y_train) X_test = np.nan_to_num(X_test) y_test = np.nan_to_num(y_test) # Reshape for LSTM X_train_reshaped = X_train.reshape((X_train.shape[0], 1, X_train.shape[1])) X_test_reshaped = X_test.reshape((X_test.shape[0], 1, X_test.shape[1])) # Build LSTM model model = Sequential() # Input layer - Bidirectional LSTM model.add(Bidirectional( LSTM( units=params['lstm_units'] // 2, # Halve the units since Bidirectional doubles the output input_shape=(1, X_train.shape[1]), return_sequences=True, activation='relu', kernel_initializer='he_normal' ) )) model.add(BatchNormalization()) model.add(Dropout(params['dropout_rate'])) # Second LSTM layer - also Bidirectional model.add(Bidirectional( LSTM( units=params['lstm_units'] // 4, # Halve units again for the second layer return_sequences=False, activation='relu' ) )) model.add(BatchNormalization()) model.add(Dropout(params['dropout_rate'])) # Dense hidden layers model.add(Dense( units=params['dense_units'], activation='relu', kernel_initializer='he_normal' )) model.add(BatchNormalization()) model.add(Dropout(params['dropout_rate'])) # Output layer model.add(Dense(1)) # Compile model optimizer = Adam( learning_rate=params['learning_rate'], clipnorm=1.0, clipvalue=0.5 ) model.compile( optimizer=optimizer, loss='huber_loss', metrics=['mae'] ) # Callbacks early_stopping = EarlyStopping( monitor='val_loss', patience=10, verbose=1, restore_best_weights=True ) from tensorflow.keras.callbacks import TerminateOnNaN nan_terminator = TerminateOnNaN() # Train model history = model.fit( X_train_reshaped, y_train, validation_data=(X_test_reshaped, y_test), epochs=params['epochs'], batch_size=params['batch_size'], callbacks=[early_stopping, nan_terminator], verbose=1 ) # Plot training history plt.figure(figsize=(12, 4)) plt.subplot(1, 2, 1) plt.plot(history.history['loss']) plt.plot(history.history['val_loss']) plt.title('LSTM Model Loss (With Temporal)') plt.ylabel('Loss') plt.xlabel('Epoch') plt.legend(['Train', 'Validation'], loc='upper right') plt.subplot(1, 2, 2) plt.plot(history.history['mae']) plt.plot(history.history['val_mae']) plt.title('LSTM Model MAE (With Temporal)') plt.ylabel('MAE') plt.xlabel('Epoch') plt.legend(['Train', 'Validation'], loc='upper right') history_fig_path = os.path.join(self.model_dir, "lstm_training_history.png") plt.savefig(history_fig_path) plt.close() # Make predictions y_pred = model.predict(X_test_reshaped).flatten() # Calculate metrics metrics = self.evaluate_model(y_test, y_pred) # Print metrics print(f"LSTM (With Temporal) RMSE: {metrics['rmse']:.2f}") print(f"LSTM (With Temporal) MAE: {metrics['mae']:.2f}") print(f"LSTM (With Temporal) MAPE: {metrics['mape']:.2f}%") print(f"LSTM (With Temporal) R²: {metrics['r2']:.2f}") # Save metrics self.save_metrics("lstm", metrics, params, X_train.shape[1]) # Save model with explicit format model_path = os.path.join(self.model_dir, "lstm_model.keras") model.save(model_path, save_format='tf') # Save in TensorFlow format explicitly # Save model info model_info = { "model_type": "lstm", "params": params, "metrics": metrics, "model_path": model_path, "feature_count": X_train.shape[1], "created_at": datetime.now().isoformat() } info_path = os.path.join(self.model_dir, "lstm_info.json") with open(info_path, 'w') as f: json.dump(model_info, f, indent=2) # Update best model if this one is better if metrics['rmse'] < self.best_score: self.best_score = metrics['rmse'] self.best_model = model self.best_model_name = "lstm" return model def train_arima(self, processed_data, params=None): """ Train ARIMA model using temporal features Args: processed_data: Dictionary containing 'xgboost' and 'temporal' data """ # Use temporal data X_train, X_test, y_train, y_test = processed_data['temporal'] print("Training ARIMA model...") if params is None: params = { 'p': 1, 'd': 0, 'q': 0 } print(f"Feature count: {X_train.shape[1]} (temporal features included)") try: # Convert to numpy array if needed if hasattr(y_train, 'values'): y_train_array = y_train.values else: y_train_array = np.array(y_train) # Create and fit ARIMA model model = ARIMA(y_train_array, order=(params['p'], params['d'], params['q'])) fitted_model = model.fit() # Calculate base prediction base_prediction = np.mean(y_train_array) # Create custom predictor predictor = ARIMAPredictor(fitted_model, base_prediction) # Make predictions y_pred = np.full(len(y_test), base_prediction) # Calculate metrics metrics = self.evaluate_model(y_test, y_pred) # Print metrics print(f"ARIMA (With Temporal) RMSE: {metrics['rmse']:.2f}") print(f"ARIMA (With Temporal) MAE: {metrics['mae']:.2f}") print(f"ARIMA (With Temporal) MAPE: {metrics['mape']:.2f}%") print(f"ARIMA (With Temporal) R²: {metrics['r2']:.2f}") # Save metrics self.save_metrics("arima", metrics, params, X_train.shape[1]) # Plot predictions plt.figure(figsize=(10, 6)) plt.plot(y_test[:100], label='Actual', alpha=0.7) plt.plot(y_pred[:100], label='Predicted', alpha=0.7) plt.title('ARIMA: Actual vs Predicted Values (With Temporal)') plt.xlabel('Sample Index') plt.ylabel('Rental Price (SGD)') plt.legend() plt.tight_layout() pred_fig_path = os.path.join(self.model_dir, "arima_predictions.png") plt.savefig(pred_fig_path) plt.close() # Save model summary summary_path = os.path.join(self.model_dir, "arima_summary.txt") with open(summary_path, 'w') as f: f.write(str(fitted_model.summary())) # Create model dictionary arima_model_dict = { 'model': predictor, 'params': params, 'base_prediction': base_prediction, 'metrics': metrics } # Save using joblib model_path = os.path.join(self.model_dir, "arima_model.pkl") joblib.dump(arima_model_dict, model_path) # Save model info model_info = { "model_type": "arima", "params": params, "metrics": metrics, "model_path": model_path, "feature_count": X_train.shape[1], "created_at": datetime.now().isoformat() } info_path = os.path.join(self.model_dir, "arima_info.json") with open(info_path, 'w') as f: json.dump(model_info, f, indent=2) # Update best model if this one is better if metrics['rmse'] < self.best_score: self.best_score = metrics['rmse'] self.best_model = arima_model_dict self.best_model_name = "arima" return arima_model_dict except Exception as e: print(f"Error training ARIMA model: {e}") # Create fallback model base_prediction = np.mean(y_train) fallback_model = { 'model': SimpleFallback(base_prediction), 'params': params, 'base_prediction': base_prediction } # Calculate metrics using the fallback y_pred = np.full(len(y_test), base_prediction) metrics = self.evaluate_model(y_test, y_pred) fallback_model['metrics'] = metrics # Save fallback model_path = os.path.join(self.model_dir, "arima_fallback.pkl") joblib.dump(fallback_model, model_path) return fallback_model def load_model(self, model_type): """ Load a trained model from disk Args: model_type (str): Type of model ('xgboost', 'lstm', 'arima') Returns: Trained model """ if model_type == "xgboost": model_path = os.path.join(self.model_dir, "xgboost_model.json") model = xgb.XGBRegressor() model.load_model(model_path) return model elif model_type == "lstm": from tensorflow.keras.models import load_model # Try different possible paths possible_paths = [ os.path.join(self.model_dir, "lstm_model.keras"), os.path.join(self.model_dir, "lstm_model"), os.path.join(self.model_dir, "lstm_model.h5") ] for model_path in possible_paths: if os.path.exists(model_path): try: return load_model(model_path) except Exception as e: print(f"Failed to load from {model_path}: {e}") continue # If all attempts fail raise FileNotFoundError(f"Could not find LSTM model in any of: {possible_paths}") elif model_type == "arima": model_path = os.path.join(self.model_dir, "arima_model.pkl") return joblib.load(model_path) else: model_path = os.path.join(self.model_dir, f"{model_type}_model.pkl") return joblib.load(model_path) def get_available_models(self): """Get list of available saved models""" models = [] model_files = { 'xgboost': 'xgboost_model.json', 'lstm': ['lstm_model.keras', 'lstm_model', 'lstm_model.h5'], 'arima': 'arima_model.pkl' } for model_type, file_paths in model_files.items(): if not isinstance(file_paths, list): file_paths = [file_paths] for file_path in file_paths: full_path = os.path.join(self.model_dir, file_path) if os.path.exists(full_path): try: # Load model info info_path = os.path.join(self.model_dir, f"{model_type}_info.json") if os.path.exists(info_path): with open(info_path, 'r') as f: info = json.load(f) models.append({ 'type': model_type, 'info': info }) else: models.append({ 'type': model_type, 'info': {'model_type': model_type} }) break # Found a valid file for this model type except: continue return models def get_best_model(self): """ Get the best performing model Returns: tuple: (best_model, best_model_name) """ return self.best_model, self.best_model_name