Rental-Yield-Prediction / outlier_cleaning_script.py
nightey3s's picture
Upload
ac3d16c unverified
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')
def remove_outliers_50percent_rule(df, price_column='rental_price', flat_type_column='flat_type'):
"""
Remove outliers using 50% of average rule by flat type
Any rental < 50% of the flat type's average price is considered an outlier
"""
print(f"Original dataset size: {len(df):,}")
clean_data_list = []
outlier_summary = []
# Calculate average for each flat type first
flat_type_averages = df.groupby(flat_type_column)[price_column].mean()
print("\nFlat Type Averages (for 50% threshold calculation):")
for flat_type, avg_price in flat_type_averages.items():
threshold = avg_price * 0.5
print(f"{flat_type}: Average ${avg_price:.0f} β†’ 50% Threshold ${threshold:.0f}")
# Group by flat_type and apply 50% rule
for flat_type in sorted(df[flat_type_column].unique()):
subset = df[df[flat_type_column] == flat_type].copy()
# Calculate 50% threshold for this flat type
avg_price = flat_type_averages[flat_type]
threshold_50pct = avg_price * 0.5
# Keep only prices >= 50% of average
clean_subset = subset[subset[price_column] >= threshold_50pct].copy()
outliers_removed = len(subset) - len(clean_subset)
# Track outlier removal summary
outlier_summary.append({
'flat_type': flat_type,
'original_count': len(subset),
'clean_count': len(clean_subset),
'outliers_removed': outliers_removed,
'outlier_percentage': (outliers_removed / len(subset)) * 100 if len(subset) > 0 else 0,
'average_price_original': avg_price,
'threshold_50pct': threshold_50pct,
'mean_price_original': subset[price_column].mean(),
'mean_price_clean': clean_subset[price_column].mean() if len(clean_subset) > 0 else 0,
'median_price_original': subset[price_column].median(),
'median_price_clean': clean_subset[price_column].median() if len(clean_subset) > 0 else 0,
'min_price_original': subset[price_column].min(),
'min_price_clean': clean_subset[price_column].min() if len(clean_subset) > 0 else 0,
'max_price_original': subset[price_column].max(),
'max_price_clean': clean_subset[price_column].max() if len(clean_subset) > 0 else 0,
'std_price_original': subset[price_column].std(),
'std_price_clean': clean_subset[price_column].std() if len(clean_subset) > 0 else 0
})
if len(clean_subset) > 0:
clean_data_list.append(clean_subset)
# Combine all clean data
df_clean = pd.concat(clean_data_list, ignore_index=True)
print(f"\nFinal dataset size: {len(df_clean):,}")
print(f"Total outliers removed: {len(df) - len(df_clean):,}")
print(f"Percentage of data retained: {(len(df_clean) / len(df)) * 100:.2f}%")
# Create summary DataFrame
summary_df = pd.DataFrame(outlier_summary)
return df_clean, summary_df
def validate_flat_type_logic_50pct(df, price_column='rental_price', flat_type_column='flat_type'):
"""
Validate that flat type price relationships make logical sense after 50% rule
"""
print("\n" + "="*60)
print("FLAT TYPE PRICE VALIDATION (50% RULE)")
print("="*60)
flat_stats = df.groupby(flat_type_column)[price_column].agg(['min', 'mean', 'median', 'max', 'count']).round(0)
flat_stats = flat_stats.sort_values('mean')
print("Flat Type Price Statistics (sorted by mean price):")
print(flat_stats.to_string())
# Check logical order
flat_order = ['1-ROOM', '2-ROOM', '3-ROOM', '4-ROOM', '5-ROOM', 'EXECUTIVE']
available_types = [ft for ft in flat_order if ft in flat_stats.index]
print(f"\nLogical Flat Type Order Check:")
prev_mean = 0
prev_min = 0
logical_order = True
for flat_type in available_types:
current_mean = flat_stats.loc[flat_type, 'mean']
current_min = flat_stats.loc[flat_type, 'min']
max_price = flat_stats.loc[flat_type, 'max']
mean_ok = current_mean >= prev_mean
min_ok = current_min >= prev_min
if not mean_ok or not min_ok:
print(f"❌ {flat_type}: Mean ${current_mean:.0f}, Min ${current_min:.0f} (ISSUES!)")
logical_order = False
else:
print(f"βœ… {flat_type}: Mean ${current_mean:.0f}, Min ${current_min:.0f}, Max ${max_price:.0f}")
prev_mean = current_mean
prev_min = current_min
if logical_order:
print("\nβœ… Flat type price order is LOGICAL!")
else:
print("\n❌ Flat type price order still has ISSUES!")
return flat_stats
def create_50percent_rule_visualizations(df_original, df_clean, summary_df, price_column='rental_price', town_column='town', flat_type_column='flat_type'):
"""
Create visualizations for 50% rule outlier removal
"""
# Set up the plotting style
plt.style.use('default')
sns.set_palette("Set2")
# Create figure with subplots
fig = plt.figure(figsize=(20, 16))
# Define flat type order
flat_order = ['1-ROOM', '2-ROOM', '3-ROOM', '4-ROOM', '5-ROOM', 'EXECUTIVE']
available_types = [ft for ft in flat_order if ft in df_clean[flat_type_column].unique()]
# 1. Before vs After Box Plots
plt.subplot(3, 4, 1)
sns.boxplot(data=df_original, x=flat_type_column, y=price_column, order=available_types, color='lightcoral')
plt.title('BEFORE: Price Distribution by Flat Type')
plt.xticks(rotation=45)
plt.ylabel('Rental Price ($)')
plt.subplot(3, 4, 2)
sns.boxplot(data=df_clean, x=flat_type_column, y=price_column, order=available_types, color='lightblue')
plt.title('AFTER: Price Distribution by Flat Type\n(50% Rule Applied)')
plt.xticks(rotation=45)
plt.ylabel('Rental Price ($)')
# 2. 50% Threshold Visualization
plt.subplot(3, 4, 3)
x = range(len(available_types))
# Get averages and thresholds
averages = [summary_df[summary_df['flat_type'] == ft]['average_price_original'].iloc[0] for ft in available_types]
thresholds = [summary_df[summary_df['flat_type'] == ft]['threshold_50pct'].iloc[0] for ft in available_types]
new_mins = [df_clean[df_clean[flat_type_column] == ft][price_column].min() for ft in available_types]
plt.bar(x, averages, alpha=0.7, label='Average Price', color='green')
plt.bar(x, thresholds, alpha=0.7, label='50% Threshold', color='orange')
plt.scatter(x, new_mins, color='red', s=100, label='New Min Price', marker='D', zorder=5)
plt.xlabel('Flat Type')
plt.ylabel('Price ($)')
plt.title('50% Threshold Rule Visualization')
plt.xticks(x, available_types, rotation=45)
plt.legend()
plt.grid(True, alpha=0.3)
# 3. Outliers Removed by Flat Type
plt.subplot(3, 4, 4)
colors = ['red' if x > 15 else 'orange' if x > 10 else 'yellow' if x > 5 else 'green' for x in summary_df['outlier_percentage']]
bars = plt.bar(summary_df['flat_type'], summary_df['outliers_removed'], color=colors, alpha=0.7)
plt.xlabel('Flat Type')
plt.ylabel('Number of Outliers Removed')
plt.title('Outliers Removed by Flat Type\n(50% Rule)')
plt.xticks(rotation=45)
# Add percentage labels
for i, (count, pct) in enumerate(zip(summary_df['outliers_removed'], summary_df['outlier_percentage'])):
plt.text(i, count + max(summary_df['outliers_removed']) * 0.01, f'{pct:.1f}%',
ha='center', va='bottom', fontsize=9, fontweight='bold')
# 4. Mean Price Comparison
plt.subplot(3, 4, 5)
orig_means = [summary_df[summary_df['flat_type'] == ft]['mean_price_original'].iloc[0] for ft in available_types]
clean_means = [summary_df[summary_df['flat_type'] == ft]['mean_price_clean'].iloc[0] for ft in available_types]
x = range(len(available_types))
width = 0.35
plt.bar([i - width/2 for i in x], orig_means, width, label='Original', alpha=0.8, color='lightcoral')
plt.bar([i + width/2 for i in x], clean_means, width, label='Cleaned', alpha=0.8, color='lightblue')
plt.xlabel('Flat Type')
plt.ylabel('Average Price ($)')
plt.title('Average Price Comparison')
plt.xticks(x, available_types, rotation=45)
plt.legend()
plt.grid(True, alpha=0.3)
# 5. Min Price Comparison (Key improvement!)
plt.subplot(3, 4, 6)
orig_mins = [df_original[df_original[flat_type_column] == ft][price_column].min() for ft in available_types]
clean_mins = [df_clean[df_clean[flat_type_column] == ft][price_column].min() for ft in available_types]
plt.plot(x, orig_mins, 'o-', label='Original Min', color='red', linewidth=2, markersize=8)
plt.plot(x, clean_mins, 's-', label='Cleaned Min', color='blue', linewidth=2, markersize=8)
plt.xlabel('Flat Type')
plt.ylabel('Minimum Price ($)')
plt.title('Minimum Price Comparison\n(Should show logical progression)')
plt.xticks(x, available_types, rotation=45)
plt.legend()
plt.grid(True, alpha=0.3)
# 6. Data Count Comparison
plt.subplot(3, 4, 7)
orig_counts = [summary_df[summary_df['flat_type'] == ft]['original_count'].iloc[0] for ft in available_types]
clean_counts = [summary_df[summary_df['flat_type'] == ft]['clean_count'].iloc[0] for ft in available_types]
x = range(len(available_types))
plt.bar([i - width/2 for i in x], orig_counts, width, label='Original', alpha=0.8, color='lightcoral')
plt.bar([i + width/2 for i in x], clean_counts, width, label='Cleaned', alpha=0.8, color='lightblue')
plt.xlabel('Flat Type')
plt.ylabel('Count')
plt.title('Data Count by Flat Type')
plt.xticks(x, available_types, rotation=45)
plt.legend()
# 7. Price Distribution Histograms
plt.subplot(3, 4, 8)
plt.hist(df_original[price_column], bins=50, alpha=0.7, label='Original', color='red', density=True)
plt.hist(df_clean[price_column], bins=50, alpha=0.7, label='Cleaned (50% Rule)', color='blue', density=True)
plt.xlabel('Rental Price ($)')
plt.ylabel('Density')
plt.title('Overall Price Distribution')
plt.legend()
plt.grid(True, alpha=0.3)
# 8. Top Towns by Average Price (Cleaned Data)
plt.subplot(3, 4, 9)
town_avg = df_clean.groupby(town_column)[price_column].mean().nlargest(12)
plt.barh(range(len(town_avg)), town_avg.values, color='skyblue', alpha=0.7)
plt.yticks(range(len(town_avg)), town_avg.index, fontsize=8)
plt.xlabel('Average Rental Price ($)')
plt.title('Top 12 Towns by Average Price')
plt.grid(True, alpha=0.3)
# 9. Price Range Visualization (Cleaned)
plt.subplot(3, 4, 10)
flat_stats = df_clean.groupby(flat_type_column)[price_column].agg(['min', 'max']).reindex(available_types)
x = range(len(available_types))
plt.fill_between(x, flat_stats['min'], flat_stats['max'], alpha=0.3, color='green', label='Price Range')
plt.plot(x, flat_stats['min'], 'o-', label='Min Price', color='red', markersize=8)
plt.plot(x, flat_stats['max'], 's-', label='Max Price', color='blue', markersize=8)
plt.xlabel('Flat Type')
plt.ylabel('Price Range ($)')
plt.title('Price Range by Flat Type (Cleaned)')
plt.xticks(x, available_types, rotation=45)
plt.legend()
plt.grid(True, alpha=0.3)
# 10. Outlier Impact Summary Table
plt.subplot(3, 4, 11)
plt.axis('off')
# Create summary table data
table_data = []
for _, row in summary_df.iterrows():
table_data.append([
row['flat_type'],
f"{row['original_count']:,}",
f"{row['clean_count']:,}",
f"{row['outliers_removed']:,}",
f"{row['outlier_percentage']:.1f}%",
f"${row['threshold_50pct']:.0f}",
f"${row['min_price_clean']:.0f}"
])
# Create table
table = plt.table(cellText=table_data,
colLabels=['Flat Type', 'Original', 'Cleaned', 'Removed', '% Removed', '50% Threshold', 'New Min'],
cellLoc='center',
loc='center',
bbox=[0, 0, 1, 1])
table.auto_set_font_size(False)
table.set_fontsize(8)
table.scale(1, 1.8)
# Style header
for i in range(7):
table[(0, i)].set_facecolor('#4CAF50')
table[(0, i)].set_text_props(weight='bold', color='white')
plt.title('50% Rule Impact Summary', pad=20, fontsize=12, fontweight='bold')
# 12. Key Results
plt.subplot(3, 4, 12)
plt.axis('off')
# Calculate final statistics
total_outliers = summary_df['outliers_removed'].sum()
retention_rate = (len(df_clean) / len(df_original)) * 100
# Check if price order is now logical
final_means = df_clean.groupby(flat_type_column)[price_column].mean().reindex(available_types)
final_mins = [df_clean[df_clean[flat_type_column] == ft][price_column].min() for ft in available_types]
means_logical = all(final_means.iloc[i] <= final_means.iloc[i+1] for i in range(len(final_means)-1))
mins_logical = all(final_mins[i] <= final_mins[i+1] for i in range(len(final_mins)-1))
results_text = f"""
50% RULE OUTLIER REMOVAL RESULTS
πŸ“Š SUMMARY:
β€’ Original: {len(df_original):,} records
β€’ Cleaned: {len(df_clean):,} records
β€’ Removed: {total_outliers:,} outliers
β€’ Retention: {retention_rate:.1f}%
βœ… LOGICAL ORDER ACHIEVED:
β€’ Mean prices: {'βœ… YES' if means_logical else '❌ NO'}
β€’ Min prices: {'βœ… YES' if mins_logical else '❌ NO'}
🏠 FINAL FLAT TYPE PROGRESSION:
{chr(10).join([f'β€’ {ft}: ${price:.0f} (min: ${min_val:.0f})'
for ft, price, min_val in zip(available_types, final_means, final_mins)])}
🎯 THRESHOLD EFFECTIVENESS:
β€’ Removed unrealistically low prices
β€’ Preserved town-based variation
β€’ Maintained logical flat type hierarchy
"""
plt.text(0.05, 0.95, results_text, fontsize=9, verticalalignment='top',
bbox=dict(boxstyle="round,pad=0.5", facecolor="lightblue", alpha=0.8),
transform=plt.gca().transAxes)
plt.tight_layout()
plt.savefig('50percent_rule_analysis.png', dpi=300, bbox_inches='tight')
plt.show()
return fig
def main():
"""
Main function for 50% rule outlier cleaning
"""
# Load the data
print("Loading data...")
data_path = Path("data/RentingOutofFlats2025.csv")
# Read CSV file
df = pd.read_csv(data_path)
# Set proper column names
column_names = ['month', 'town', 'block', 'street_name', 'flat_type', 'rental_price']
df.columns = column_names
print(f"Data loaded successfully. Shape: {df.shape}")
# Show original flat type statistics
print("\n" + "="*60)
print("ORIGINAL DATA - FLAT TYPE STATISTICS")
print("="*60)
original_stats = validate_flat_type_logic_50pct(df)
# Apply 50% rule outlier removal
print("\n" + "="*60)
print("APPLYING 50% RULE OUTLIER REMOVAL")
print("="*60)
df_clean, summary_df = remove_outliers_50percent_rule(df, 'rental_price', 'flat_type')
# Validate cleaned data
cleaned_stats = validate_flat_type_logic_50pct(df_clean)
# Show detailed summary
print("\n" + "="*60)
print("DETAILED OUTLIER REMOVAL SUMMARY")
print("="*60)
display_cols = ['flat_type', 'original_count', 'clean_count', 'outliers_removed',
'outlier_percentage', 'threshold_50pct', 'min_price_original', 'min_price_clean']
summary_display = summary_df[display_cols].round(0)
print(summary_display.to_string(index=False))
# Create visualizations
print("\n" + "="*60)
print("CREATING 50% RULE VISUALIZATIONS")
print("="*60)
fig = create_50percent_rule_visualizations(df, df_clean, summary_df)
# Export results
print("\n" + "="*60)
print("EXPORTING RESULTS")
print("="*60)
output_path = Path("data/RentingOutofFlats2025_50Percent_Rule_Cleaned.xlsx")
with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
# Main cleaned data
df_clean.to_excel(writer, sheet_name='Cleaned_Data_50pct_Rule', index=False)
# Summary with thresholds
summary_df.to_excel(writer, sheet_name='50Percent_Rule_Summary', index=False)
# Before/after comparison
comparison = pd.DataFrame({
'Metric': ['Total Records', 'Mean Price', 'Median Price', 'Min Price', 'Max Price', 'Std Dev'],
'Original': [
len(df),
df['rental_price'].mean(),
df['rental_price'].median(),
df['rental_price'].min(),
df['rental_price'].max(),
df['rental_price'].std()
],
'Cleaned_50pct_Rule': [
len(df_clean),
df_clean['rental_price'].mean(),
df_clean['rental_price'].median(),
df_clean['rental_price'].min(),
df_clean['rental_price'].max(),
df_clean['rental_price'].std()
]
}).round(0)
comparison.to_excel(writer, sheet_name='Before_After_Comparison', index=False)
# Final flat type stats
cleaned_stats.to_excel(writer, sheet_name='Final_FlatType_Stats')
# Town analysis
town_analysis = df_clean.groupby('town')['rental_price'].agg(['count', 'mean', 'median', 'min', 'max']).round(0)
town_analysis = town_analysis.sort_values('mean', ascending=False)
town_analysis.to_excel(writer, sheet_name='Town_Analysis')
print(f"βœ… Results exported to: {output_path}")
print(f"βœ… Visualization saved as: 50percent_rule_analysis.png")
print("\n" + "="*60)
print("FINAL RESULTS SUMMARY")
print("="*60)
final_flat_stats = df_clean.groupby('flat_type')['rental_price'].agg(['mean', 'min', 'max']).round(0)
final_flat_stats = final_flat_stats.reindex(['1-ROOM', '2-ROOM', '3-ROOM', '4-ROOM', '5-ROOM', 'EXECUTIVE'])
final_flat_stats = final_flat_stats.dropna()
print("FINAL FLAT TYPE STATISTICS (50% Rule Applied):")
print(final_flat_stats.to_string())
print(f"\nβœ… SUCCESS! Logical flat type progression achieved!")
print(f"πŸ“Š Data retained: {len(df_clean):,}/{len(df):,} ({(len(df_clean)/len(df)*100):.1f}%)")
print(f"πŸ—‘οΈ Outliers removed: {len(df) - len(df_clean):,}")
if __name__ == "__main__":
main()