# Provenance: Hugging Face Space upload (commit ac3d16c, user nightey3s).
import streamlit as st
import pandas as pd
import numpy as np
import os
import json
import plotly.express as px
import plotly.graph_objects as go
import time
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from src.data_processor import DataProcessor
from src.predictor import RentalPredictor
# Set page configuration (must be the first Streamlit call in the script)
st.set_page_config(
    page_title="Singapore Rental Prediction",  # browser tab title
    page_icon="🏢",
    layout="wide"  # use the full browser width
)
# Function to load a model with enhanced error handling
def load_model():
    """Load LSTM model from disk and return the predictor with improved error handling"""
    model_type = "lstm"  # Always load LSTM
    try:
        st.info(f"Attempting to load {model_type.upper()} model...")

        # Resolve both artifact paths up front so the existence checks read cleanly.
        model_file = os.path.join("models", f"{model_type}_model.keras")
        preprocessor_path = os.path.join("models", "data_processor.pkl")

        # Bail out early when either required artifact is missing on disk.
        if not os.path.exists(model_file):
            st.error(f"Model file not found: {model_file}")
            return None
        if not os.path.exists(preprocessor_path):
            st.error(f"Preprocessor not found: {preprocessor_path}")
            return None

        predictor = RentalPredictor()

        # Preprocessor must be loaded before the model so inputs can be transformed.
        st.info("Loading preprocessor...")
        if not predictor.load_preprocessor(preprocessor_path):
            st.error(f"Failed to load preprocessor from {preprocessor_path}")
            return None

        # NOTE(review): load_from_file receives the model *type*, not model_file —
        # presumably the predictor derives the path itself; confirm against RentalPredictor.
        st.info(f"Loading model from {model_file}...")
        if not predictor.load_from_file(model_type):
            st.error(f"Failed to load {model_type} model")
            return None

        return predictor
    except Exception as e:
        # Surface the full traceback in the UI so failures are diagnosable.
        st.error(f"Error loading {model_type} model: {str(e)}")
        import traceback
        st.code(traceback.format_exc())
        return None
# Function to load data
@st.cache_data
def load_data(file_path):
    """Read a CSV file into a DataFrame (cached); return None and show an error on failure."""
    try:
        return pd.read_csv(file_path)
    except Exception as e:
        st.error(f"Error loading data: {e}")
        return None
# Function to make prediction for multiple combinations
def predict_rent_multiple(towns, flat_types):
    """Make rental predictions for multiple town and flat_type combinations"""
    results = []
    # Without a loaded LSTM predictor there is nothing to do.
    if 'lstm' not in st.session_state.predictors:
        return results
    predictor = st.session_state.predictors['lstm']

    # Fixed placeholder address parts — the model expects them but the UI does not collect them.
    block = "123"
    street_name = "SAMPLE STREET"

    # One prediction per (town, flat_type) pair; failures are reported but do not abort the rest.
    for town in towns:
        for flat_type in flat_types:
            features = {
                'town': town,
                'block': block,
                'street_name': street_name,
                'flat_type': flat_type
            }
            try:
                results.append({
                    'town': town,
                    'flat_type': flat_type,
                    'prediction': predictor.predict(features)
                })
            except Exception as e:
                st.error(f"Error predicting for {town}, {flat_type}: {e}")
    return results
# Prediction Page
def predict_rent_time_series_multiple(towns, flat_types, months_range=12):
    """
    Make predictions for multiple town/flat_type combinations over time.

    Args:
        towns: List of towns to predict for
        flat_types: List of flat types to predict for
        months_range: Number of months before and after the current month (default 12)

    Returns:
        Dict with 'dates' (list of 'YYYY-MM' labels) and 'predictions'
        (mapping "TOWN - FLAT_TYPE" -> list of rents aligned to 'dates';
        None marks a failed prediction).
    """
    from datetime import datetime

    # Anchor on the current month and step by exact calendar months.
    # The previous implementation approximated a month as timedelta(days=30),
    # which drifts over a +/-12 month window and can skip or duplicate
    # month labels; divmod on a flat month index is exact.
    now = datetime.now()
    base_index = now.year * 12 + (now.month - 1)

    dates = []
    for offset in range(-months_range, months_range + 1):
        year, month_zero = divmod(base_index + offset, 12)
        dates.append({
            'year': year,
            'month': month_zero + 1,
            'label': f"{year:04d}-{month_zero + 1:02d}"
        })

    # Initialize results
    time_series_results = {
        'dates': [d['label'] for d in dates],
        'predictions': {}
    }

    # Use default values for block and street_name (UI does not collect them)
    block = "123"
    street_name = "SAMPLE STREET"

    # Only make predictions if LSTM is loaded
    if 'lstm' in st.session_state.predictors:
        predictor = st.session_state.predictors['lstm']
        # For each combination, predict across the whole date window.
        for town in towns:
            for flat_type in flat_types:
                combo_key = f"{town} - {flat_type}"
                predictions = []
                for date in dates:
                    # Copy input features and add temporal information
                    date_input = {
                        'town': town,
                        'block': block,
                        'street_name': street_name,
                        'flat_type': flat_type,
                        'year': date['year'],
                        'month': date['month']
                    }
                    try:
                        predictions.append(predictor.predict(date_input))
                    except Exception as e:
                        # Keep the series aligned with 'dates' by recording None.
                        st.error(f"Error with LSTM prediction for {combo_key}, {date['label']}: {e}")
                        predictions.append(None)
                time_series_results['predictions'][combo_key] = predictions
    return time_series_results
# Function to calculate PICP (Prediction Interval Coverage Probability)
def calculate_picp(y_true, y_pred_lower, y_pred_upper):
    """Calculate percentage of true values that fall within prediction interval"""
    covered = (y_true >= y_pred_lower) & (y_true <= y_pred_upper)
    return np.mean(covered) * 100
# Function to get model metrics
def get_model_metrics():
    """Get metrics for the LSTM model from metrics_log.json and lstm_info.json"""
    model_type = "lstm"
    metrics_sources = {}

    def _read_json(path):
        # Helper: parse a JSON file, or return None when it does not exist.
        if not os.path.exists(path):
            return None
        with open(path, 'r') as fh:
            return json.load(fh)

    try:
        # Newest source first: lstm_cleaned_data_info.json (latest training with cleaned data).
        cleaned_info = _read_json(os.path.join("models", "lstm_cleaned_data_info.json"))
        if cleaned_info is not None:
            metrics_sources['lstm_cleaned_data_info'] = cleaned_info.get('metrics', {})
            print(f"Loaded metrics from lstm_cleaned_data_info.json: {metrics_sources['lstm_cleaned_data_info']}")

        # metrics_log.json holds one entry per training run; keep the most recent LSTM one.
        metrics_log = _read_json(os.path.join("models", "metrics_log.json"))
        if metrics_log is not None:
            lstm_entries = [entry for entry in metrics_log if entry['model_type'] == model_type]
            if lstm_entries:
                latest_entry = max(lstm_entries, key=lambda x: x.get('timestamp', ''))
                metrics_sources['metrics_log'] = latest_entry.get('metrics', {})
                print(f"Loaded metrics from metrics_log.json: {metrics_sources['metrics_log']}")

        # lstm_info.json holds the standard evaluation metrics.
        model_info = _read_json(os.path.join("models", f"{model_type}_info.json"))
        if model_info is not None:
            metrics_sources['lstm_info'] = model_info.get('metrics', {})
            print(f"Loaded metrics from lstm_info.json: {metrics_sources['lstm_info']}")

        print(f"All metrics sources: {list(metrics_sources.keys())}")
        return metrics_sources
    except Exception as e:
        st.error(f"Error loading metrics: {e}")
        import traceback
        st.code(traceback.format_exc())
        return {}
# Function to get top features from model
def get_top_features(data, n_features=3):
    """Get top n features that influence the LSTM predictions using a surrogate model.

    Args:
        data: DataFrame with a 'monthly_rent' target column plus feature columns.
        n_features: Number of top features to return (default 3).

    Returns:
        List of (feature_name, importance) tuples in descending importance order,
        or [] if the surrogate model cannot be fitted.
    """
    try:
        # Train a simple Random Forest as a surrogate model for the LSTM.
        X = data.drop('monthly_rent', axis=1)
        # Drop temporal features to rank only base features. Drop each one only
        # if it is actually present — the old code dropped 'year' whenever
        # 'month' existed, raising KeyError on frames without a 'year' column.
        temporal = [col for col in ('month', 'year') if col in X.columns]
        if temporal:
            X = X.drop(temporal, axis=1)
        # One-hot encode categorical features so the forest can consume them.
        X_encoded = pd.get_dummies(X)
        # Fixed random_state keeps the importance ranking reproducible.
        surrogate_model = RandomForestRegressor(n_estimators=100, random_state=42)
        surrogate_model.fit(X_encoded, data['monthly_rent'])
        # Rank encoded features by the surrogate's impurity-based importances.
        importances = surrogate_model.feature_importances_
        feature_names = X_encoded.columns
        indices = np.argsort(importances)[::-1]
        top_indices = indices[:n_features]
        return [(feature_names[i], importances[i]) for i in top_indices]
    except Exception as e:
        st.error(f"Error getting top features: {e}")
        return []
# Initialize session state variables if they don't exist
if 'predictors' not in st.session_state:
    # Maps model-type keys (e.g. "lstm") to loaded predictor objects
    st.session_state.predictors = {}
if 'model_loaded' not in st.session_state:
    # True once the LSTM predictor has been loaded successfully this session
    st.session_state.model_loaded = False
if 'current_page' not in st.session_state:
    # Persist the selected page across reruns
    st.session_state.current_page = "Predict"
# Application title
st.title("Singapore Property Rental Prediction (LSTM Model)")

# Sidebar for navigation
st.sidebar.title("Navigation")
page = st.sidebar.radio(
    "Select Page",
    ["Predict", "Model Statistics"],
    # Restore the previously selected page after a rerun
    index=0 if st.session_state.current_page == "Predict" else 1
)
st.session_state.current_page = page

# Model status indicator in sidebar
st.sidebar.markdown("---")
st.sidebar.markdown("### Model Status")

# Auto-load LSTM model on first run (cached in session state afterwards)
if not st.session_state.model_loaded:
    with st.spinner("🔄 Loading LSTM model... Please wait..."):
        predictor = load_model()
        if predictor:
            # Store in session state so later reruns skip loading
            st.session_state.predictors["lstm"] = predictor
            st.session_state.model_loaded = True
            st.sidebar.success("✅ LSTM model loaded successfully!")
        else:
            st.sidebar.error("❌ Failed to load LSTM model.")

# Show model status in sidebar
if st.session_state.model_loaded:
    st.sidebar.success("✅ LSTM model loaded and ready")
else:
    st.sidebar.warning("⚠️ LSTM model not loaded")

# Footer in sidebar
st.sidebar.markdown("---")
st.sidebar.markdown("### About")
st.sidebar.info(
    "This application predicts rental prices for properties in Singapore "
    "using an LSTM deep learning model trained on historical rental data."
)
# Handle page selection
if page == "Predict":
    # Main prediction interface
    st.header("Predict Monthly Rent")
    # Check if model is loaded
    if not st.session_state.model_loaded:
        st.warning("LSTM model is not loaded. Please check the error messages and refresh the page.")
    else:
        # Input form
        with st.form("prediction_form"):
            # Get available towns and flat types from the data
            data_path = "data/RentingOutofFlats2025.csv"
            if os.path.exists(data_path):
                data = load_data(data_path)
                towns = sorted(data['town'].unique())
                flat_types = sorted(data['flat_type'].unique())
            else:
                # Hard-coded fallbacks used when the CSV is not shipped with the app
                towns = ["ANG MO KIO", "BEDOK", "BISHAN", "BUKIT BATOK", "BUKIT MERAH", "BUKIT PANJANG",
                         "BUKIT TIMAH", "CENTRAL AREA", "CHOA CHU KANG", "CLEMENTI", "GEYLANG", "HOUGANG",
                         "JURONG EAST", "JURONG WEST", "KALLANG/WHAMPOA", "MARINE PARADE", "PASIR RIS",
                         "PUNGGOL", "QUEENSTOWN", "SEMBAWANG", "SENGKANG", "SERANGOON", "TAMPINES",
                         "TOA PAYOH", "WOODLANDS", "YISHUN"]
                flat_types = ["1 ROOM", "2 ROOM", "3 ROOM", "4 ROOM", "5 ROOM", "EXECUTIVE", "MULTI-GENERATION"]
            # Multi-select for towns (limit to 3)
            selected_towns = st.multiselect(
                "Select Towns (up to 3)",
                towns,
                default=[towns[0]],
                max_selections=3
            )
            # Multi-select for flat types (limit to 3)
            selected_flat_types = st.multiselect(
                "Select Flat Types (up to 3)",
                flat_types,
                default=[flat_types[2]],  # Default to 3 ROOM
                max_selections=3
            )
            # Help text
            st.markdown("*Note: Block number and street name are fixed to standard values*")
            submit_button = st.form_submit_button("Predict Rent")
        if submit_button:
            # Validate selections
            if not selected_towns:
                st.error("Please select at least one town")
            elif not selected_flat_types:
                st.error("Please select at least one flat type")
            else:
                # Make predictions for all combinations
                with st.spinner("Calculating predictions..."):
                    results = predict_rent_multiple(selected_towns, selected_flat_types)
                if results:
                    # Display current predictions in a table
                    st.subheader("Current Monthly Rent Predictions")
                    # Create DataFrame for display
                    results_df = pd.DataFrame(results)
                    # Format the prediction column
                    results_df['prediction'] = results_df['prediction'].apply(lambda x: f"S${x:.2f}")
                    # Reshape for better display - pivot table with towns as rows and flat types as columns
                    pivot_df = results_df.pivot(index='town', columns='flat_type', values='prediction')
                    # Display the pivot table
                    st.dataframe(pivot_df, use_container_width=True)
                    # Generate time series prediction automatically
                    st.subheader("Rental Price Forecast (±12 Months)")
                    # Make time series predictions
                    with st.spinner("Generating time series forecast..."):
                        time_series_data = predict_rent_time_series_multiple(selected_towns, selected_flat_types)
                    if time_series_data['predictions']:
                        # Create line chart for time series predictions
                        fig = go.Figure()
                        # Index of the current month (middle of the symmetric date window)
                        reference_point = len(time_series_data['dates']) // 2
                        # Create a color palette with enough distinct colors
                        colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd",
                                  "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"]
                        # Calculate min/max values for y-axis range
                        all_values = []
                        # Create a DataFrame for tabular display
                        table_data = {"Date": time_series_data['dates']}
                        # Add each combination as a line on the plot
                        color_index = 0
                        for combo_key, predictions in time_series_data['predictions'].items():
                            if any(p is not None for p in predictions):
                                # Add to plot
                                # NOTE(review): the hovertext f-string formats pred with :.2f and
                                # will raise TypeError if any individual pred is None — confirm
                                # a combo's predictions are either all-None or all-numeric.
                                fig.add_trace(go.Scatter(
                                    x=time_series_data['dates'],
                                    y=predictions,
                                    mode='lines+markers',
                                    name=combo_key,
                                    line=dict(color=colors[color_index % len(colors)], width=2),
                                    hoverinfo='text',
                                    hovertext=[f"{combo_key}, {date}: S${pred:.2f}" for date, pred in
                                               zip(time_series_data['dates'], predictions)]
                                ))
                                # Add to table data
                                table_data[combo_key] = [f"S${pred:.2f}" if pred is not None else "N/A"
                                                         for pred in predictions]
                                # Add to values list for y-axis range calculation
                                all_values.extend([v for v in predictions if v is not None])
                                # Increment color index
                                color_index += 1
                        # Add a vertical line for current date
                        fig.add_shape(
                            type="line",
                            x0=time_series_data['dates'][reference_point],
                            y0=min(all_values) * 0.95 if all_values else 0,
                            x1=time_series_data['dates'][reference_point],
                            y1=max(all_values) * 1.05 if all_values else 0,
                            line=dict(color="black", width=1, dash="dash"),
                        )
                        # Set y-axis range to not start from zero (5% padding on both sides)
                        y_min = min(all_values) * 0.95 if all_values else 0
                        y_max = max(all_values) * 1.05 if all_values else 0
                        fig.update_layout(
                            title="Rental Price Forecast Over Time (LSTM Model)",
                            xaxis_title="Date",
                            yaxis_title="Predicted Rent (S$)",
                            hovermode="closest",
                            height=600,  # Increased height for better visibility
                            yaxis=dict(range=[y_min, y_max]),
                            legend=dict(
                                orientation="h",
                                yanchor="bottom",
                                y=1.02,
                                xanchor="center",
                                x=0.5
                            )
                        )
                        # Add zone highlighting
                        # Past zone (light gray)
                        fig.add_shape(
                            type="rect",
                            xref="x",
                            yref="y",
                            x0=time_series_data['dates'][0],
                            y0=y_min,
                            x1=time_series_data['dates'][reference_point-1],
                            y1=y_max,
                            fillcolor="lightgray",
                            opacity=0.2,
                            layer="below",
                            line_width=0,
                        )
                        # Current zone (light green)
                        fig.add_shape(
                            type="rect",
                            xref="x",
                            yref="y",
                            x0=time_series_data['dates'][reference_point],
                            y0=y_min,
                            x1=time_series_data['dates'][reference_point],
                            y1=y_max,
                            fillcolor="rgb(0, 255, 0)",
                            opacity=0.3,
                            layer="below",
                            line_width=0,
                        )
                        # Future zone (light blue)
                        fig.add_shape(
                            type="rect",
                            xref="x",
                            yref="y",
                            x0=time_series_data['dates'][reference_point+1],
                            y0=y_min,
                            x1=time_series_data['dates'][-1],
                            y1=y_max,
                            fillcolor="lightblue",
                            opacity=0.2,
                            layer="below",
                            line_width=0,
                        )
                        # Add annotations for zones (placed mid-way through each zone)
                        fig.add_annotation(
                            x=time_series_data['dates'][reference_point//2],
                            y=y_max*0.95,
                            text="Historical Data",
                            showarrow=False,
                            font=dict(size=12)
                        )
                        fig.add_annotation(
                            x=time_series_data['dates'][reference_point],
                            y=y_max*0.95,
                            text="Current",
                            showarrow=False,
                            font=dict(size=12)
                        )
                        fig.add_annotation(
                            x=time_series_data['dates'][reference_point + (len(time_series_data['dates'])-reference_point)//2],
                            y=y_max*0.95,
                            text="Forecast",
                            showarrow=False,
                            font=dict(size=12)
                        )
                        st.plotly_chart(fig, use_container_width=True)
                        # Display tabular data
                        st.subheader("Monthly Forecast Values")
                        forecast_df = pd.DataFrame(table_data)
                        # Highlight current month row
                        def highlight_current_month(row):
                            if row.name == reference_point:
                                return ['background-color: #0A0'] * len(row)
                            return [''] * len(row)
                        # Display the table with styling
                        st.dataframe(
                            forecast_df.style.apply(highlight_current_month, axis=1),
                            use_container_width=True
                        )
                        # Add an explanation of the forecast
                        with st.expander("About this forecast"):
                            st.markdown("""
                            This forecast shows predicted rental prices for the selected properties over time (12 months
                            before and after the current date). The LSTM model uses historical patterns to predict
                            future rental trends and shows what the rental price likely was historically.
                            * The vertical dashed line represents the current month
                            * Past predictions show what the rental price likely was historically (gray area)
                            * Future predictions show expected rental price trends (blue area)
                            * The current month is highlighted in green
                            * You can compare up to 9 different town/flat type combinations
                            """)
                    else:
                        st.warning("Unable to generate time series forecast. Please check if the LSTM model is loaded correctly.")
elif page == "Model Statistics":
    st.header("LSTM Model Performance Statistics")
    # Check if LSTM model is loaded
    if not st.session_state.model_loaded or 'lstm' not in st.session_state.predictors:
        st.warning("LSTM model is not loaded. Please go to the Predict page to load the model first.")
    else:
        # Load data for feature analysis
        # NOTE(review): 'data' is loaded here but not used in this visible section — verify intent.
        data_path = "data/RentingOutofFlats2025.csv"
        if os.path.exists(data_path):
            data = load_data(data_path)
        # Get metrics for LSTM model from all sources
        metrics_sources = get_model_metrics()
        # Debug information
        with st.expander("Debug: Available Metrics Files"):
            st.write("Available metrics sources:", list(metrics_sources.keys()))
            for source, metrics in metrics_sources.items():
                st.write(f"{source}:", metrics)
        # Add tabs to show metrics from different sources
        available_tabs = []
        if 'lstm_cleaned_data_info' in metrics_sources:
            available_tabs.append("Latest Evaluation")
        if 'lstm_info' in metrics_sources:
            available_tabs.append("Standard Evaluation")
        if 'metrics_log' in metrics_sources:
            available_tabs.append("Training Metrics")
        # Comparison tab is always present, so tabs[-1] is always the comparison tab
        available_tabs.append("Comparison")
        tabs = st.tabs(available_tabs)
        tab_index = 0
        # NOTE(review): the three metric-display blocks below are near-duplicates and
        # could be factored into a helper; kept as-is in this documentation pass.
        # Latest evaluation metrics tab (from lstm_cleaned_data_info.json)
        if 'lstm_cleaned_data_info' in metrics_sources:
            with tabs[tab_index]:
                metrics = metrics_sources['lstm_cleaned_data_info']
                st.subheader("Latest Evaluation Metrics (Cleaned Data)")
                if metrics:
                    col1, col2, col3 = st.columns(3)
                    with col1:
                        rmse_val = metrics.get('rmse', 0)
                        st.metric("RMSE", f"{rmse_val:.2f}" if rmse_val > 0 else "N/A")
                    with col2:
                        mape_val = metrics.get('mape', 0)
                        st.metric("MAPE", f"{mape_val:.2f}%" if mape_val > 0 else "N/A")
                    with col3:
                        r2_val = metrics.get('r2', 0)
                        st.metric("R² Score", f"{r2_val:.4f}" if r2_val != 0 else "N/A")
                    col1, col2 = st.columns(2)
                    with col1:
                        # PICP is stored as a fraction; display as a percentage
                        picp_val = metrics.get('picp', 0)
                        st.metric("PICP", f"{picp_val*100:.1f}%" if picp_val > 0 else "N/A")
                    with col2:
                        mpiw_val = metrics.get('mpiw', 0)
                        st.metric("MPIW", f"{mpiw_val:.2f}" if mpiw_val > 0 else "N/A")
                else:
                    st.warning("No metrics found in latest evaluation file")
            tab_index += 1
        # Standard evaluation metrics tab (from lstm_info.json)
        if 'lstm_info' in metrics_sources:
            with tabs[tab_index]:
                metrics = metrics_sources['lstm_info']
                st.subheader("Standard Evaluation Metrics")
                if metrics:
                    col1, col2, col3 = st.columns(3)
                    with col1:
                        rmse_val = metrics.get('rmse', 0)
                        st.metric("RMSE", f"{rmse_val:.2f}" if rmse_val > 0 else "N/A")
                    with col2:
                        mape_val = metrics.get('mape', 0)
                        st.metric("MAPE", f"{mape_val:.2f}%" if mape_val > 0 else "N/A")
                    with col3:
                        r2_val = metrics.get('r2', 0)
                        st.metric("R² Score", f"{r2_val:.4f}" if r2_val != 0 else "N/A")
                    col1, col2 = st.columns(2)
                    with col1:
                        picp_val = metrics.get('picp', 0)
                        st.metric("PICP", f"{picp_val*100:.1f}%" if picp_val > 0 else "N/A")
                    with col2:
                        mpiw_val = metrics.get('mpiw', 0)
                        st.metric("MPIW", f"{mpiw_val:.2f}" if mpiw_val > 0 else "N/A")
                else:
                    st.warning("No metrics found in standard evaluation file")
            tab_index += 1
        # Training metrics tab (from metrics_log.json)
        if 'metrics_log' in metrics_sources:
            with tabs[tab_index]:
                metrics = metrics_sources['metrics_log']
                st.subheader("Training Metrics")
                if metrics:
                    col1, col2, col3 = st.columns(3)
                    with col1:
                        rmse_val = metrics.get('rmse', 0)
                        st.metric("RMSE", f"{rmse_val:.2f}" if rmse_val > 0 else "N/A")
                    with col2:
                        mape_val = metrics.get('mape', 0)
                        st.metric("MAPE", f"{mape_val:.2f}%" if mape_val > 0 else "N/A")
                    with col3:
                        mae_val = metrics.get('mae', 0)
                        st.metric("MAE", f"{mae_val:.2f}" if mae_val > 0 else "N/A")
                    r2_val = metrics.get('r2', 0)
                    st.metric("R² Score", f"{r2_val:.4f}" if r2_val != 0 else "N/A")
                    if 'mse' in metrics:
                        mse_val = metrics.get('mse', 0)
                        st.metric("MSE", f"{mse_val:.2f}" if mse_val > 0 else "N/A")
                else:
                    st.warning("No training metrics found")
            tab_index += 1
        # Comparison tab
        if len(available_tabs) > 1:
            with tabs[-1]:  # Last tab is always comparison
                st.subheader("Understanding the Metrics")
                if not metrics_sources:
                    st.warning("No metrics files found. Please train the model first.")
                else:
                    # Create comparison table if we have multiple sources
                    comparison_data = []
                    for source_name, metrics in metrics_sources.items():
                        if metrics:
                            comparison_data.append({
                                'Source': source_name,
                                'RMSE': f"{metrics.get('rmse', 0):.2f}" if metrics.get('rmse', 0) > 0 else "N/A",
                                'MAPE': f"{metrics.get('mape', 0):.2f}%" if metrics.get('mape', 0) > 0 else "N/A",
                                'R²': f"{metrics.get('r2', 0):.4f}" if metrics.get('r2', 0) != 0 else "N/A",
                                'PICP': f"{metrics.get('picp', 0)*100:.1f}%" if metrics.get('picp', 0) > 0 else "N/A",
                                'MPIW': f"{metrics.get('mpiw', 0):.2f}" if metrics.get('mpiw', 0) > 0 else "N/A"
                            })
                    if comparison_data:
                        st.subheader("Metrics Comparison")
                        st.table(pd.DataFrame(comparison_data))
                    st.markdown("""
                    ### Metric Explanations:
                    - **RMSE**: Root Mean Square Error - Lower is better
                    - **MAPE**: Mean Absolute Percentage Error - Lower is better
                    - **R²**: Coefficient of determination - Higher is better (max 1.0)
                    - **PICP**: Prediction Interval Coverage Probability - Should be around 95%
                    - **MPIW**: Mean Prediction Interval Width - Lower is better
                    ### File Sources:
                    - **lstm_cleaned_data_info.json**: Latest training with cleaned data
                    - **lstm_info.json**: Standard model evaluation
                    - **metrics_log.json**: Training process metrics
                    """)