import os
import torch
import torchaudio
import torchvision
import numpy as np
import time
import json
from torch.utils.data import Dataset, DataLoader
import sys
from tqdm import tqdm

# Add parent directory to path to import the preprocess functions
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from preprocess import process_audio_data, process_image_data

# Print library versions
print(f"\033[92mINFO\033[0m: PyTorch version: {torch.__version__}")
print(f"\033[92mINFO\033[0m: Torchaudio version: {torchaudio.__version__}")
print(f"\033[92mINFO\033[0m: Torchvision version: {torchvision.__version__}")

# Device selection
device = torch.device(
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)
print(f"\033[92mINFO\033[0m: Using device: {device}")

# Hyperparameters
batch_size = 16
epochs = 1  # Just one epoch for evaluation
learning_rate = 0.0001
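
# Expected dataset layout (inferred from how WatermelonDataset below builds its
# sample list; adjust if your directory structure differs):
#   <data_dir>/<sweetness>/<id>/<id>.wav   - tapping audio for one watermelon
#   <data_dir>/<sweetness>/<id>/<id>.jpg   - photo of the same watermelon
# where <sweetness> is a directory whose name parses as a float label.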
class WatermelonDataset(Dataset):
    def __init__(self, data_dir):
        self.data_dir = data_dir
        self.samples = []

        # Walk through the directory structure: <sweetness>/<id>/<id>.{wav,jpg}
        for sweetness_dir in os.listdir(data_dir):
            sweetness_path = os.path.join(data_dir, sweetness_dir)
            if not os.path.isdir(sweetness_path):
                continue
            try:
                sweetness = float(sweetness_dir)
            except ValueError:
                # Skip directories whose names are not numeric sweetness labels
                continue
            for id_dir in os.listdir(sweetness_path):
                id_path = os.path.join(sweetness_path, id_dir)
                if os.path.isdir(id_path):
                    audio_file = os.path.join(id_path, f"{id_dir}.wav")
                    image_file = os.path.join(id_path, f"{id_dir}.jpg")
                    if os.path.exists(audio_file) and os.path.exists(image_file):
                        self.samples.append((audio_file, image_file, sweetness))

        print(f"\033[92mINFO\033[0m: Loaded {len(self.samples)} samples from {data_dir}")
    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        audio_path, image_path, label = self.samples[idx]
        try:
            # Load and process audio
            waveform, sample_rate = torchaudio.load(audio_path)
            mfcc = process_audio_data(waveform, sample_rate)

            # Load and process image
            image = torchvision.io.read_image(image_path)
            image = image.float()
            processed_image = process_image_data(image)

            return mfcc, processed_image, torch.tensor(label).float()
        except Exception as e:
            print(f"\033[91mERR!\033[0m: Error processing sample {idx}: {e}")
            # For simplicity, fall back to the first sample instead of skipping
            if idx == 0:  # Prevent infinite recursion
                raise e
            return self.__getitem__(0)
# Define available backbone models
IMAGE_BACKBONES = {
    "resnet50": {
        "model": torchvision.models.resnet50,
        "weights": torchvision.models.ResNet50_Weights.DEFAULT,
        "output_dim": lambda model: model.fc.in_features,
    },
    "efficientnet_b0": {
        "model": torchvision.models.efficientnet_b0,
        "weights": torchvision.models.EfficientNet_B0_Weights.DEFAULT,
        "output_dim": lambda model: model.classifier[1].in_features,
    },
    "efficientnet_b3": {
        "model": torchvision.models.efficientnet_b3,
        "weights": torchvision.models.EfficientNet_B3_Weights.DEFAULT,
        "output_dim": lambda model: model.classifier[1].in_features,
    },
}
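
# NOTE (added sketch): WatermelonModelModular and the default backbone lists in
# __main__ also reference "resnet101", "convnext_base" and "swin_b", which are
# missing from the registry above and would raise a KeyError. The entries below
# show how they could be registered, assuming the standard torchvision model
# constructors and weight enums; remove them if those backbones are not meant
# to be evaluated.
IMAGE_BACKBONES.update({
    "resnet101": {
        "model": torchvision.models.resnet101,
        "weights": torchvision.models.ResNet101_Weights.DEFAULT,
        "output_dim": lambda model: model.fc.in_features,
    },
    "convnext_base": {
        "model": torchvision.models.convnext_base,
        "weights": torchvision.models.ConvNeXt_Base_Weights.DEFAULT,
        "output_dim": lambda model: model.classifier[2].in_features,
    },
    "swin_b": {
        "model": torchvision.models.swin_b,
        "weights": torchvision.models.Swin_B_Weights.DEFAULT,
        "output_dim": lambda model: model.head.in_features,
    },
})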
AUDIO_BACKBONES = {
    "lstm": {
        "model": lambda input_size, hidden_size: torch.nn.LSTM(
            input_size=input_size, hidden_size=hidden_size, num_layers=2, batch_first=True
        ),
        "output_dim": lambda hidden_size: hidden_size,
    },
    "gru": {
        "model": lambda input_size, hidden_size: torch.nn.GRU(
            input_size=input_size, hidden_size=hidden_size, num_layers=2, batch_first=True
        ),
        "output_dim": lambda hidden_size: hidden_size,
    },
    "bidirectional_lstm": {
        "model": lambda input_size, hidden_size: torch.nn.LSTM(
            input_size=input_size, hidden_size=hidden_size, num_layers=2, batch_first=True, bidirectional=True
        ),
        "output_dim": lambda hidden_size: hidden_size * 2,  # * 2 because bidirectional
    },
    "transformer": {
        "model": lambda input_size, hidden_size: torch.nn.TransformerEncoder(
            torch.nn.TransformerEncoderLayer(
                d_model=input_size, nhead=8, dim_feedforward=hidden_size, batch_first=True
            ),
            num_layers=2,
        ),
        # The encoder preserves the input feature dimension (376 MFCC features),
        # so the output dim equals the input size rather than hidden_size
        "output_dim": lambda hidden_size: 376,
    },
}
class WatermelonModelModular(torch.nn.Module):
    def __init__(self, image_backbone_name, audio_backbone_name, audio_hidden_size=128):
        super(WatermelonModelModular, self).__init__()

        # Audio backbone setup
        self.audio_backbone_name = audio_backbone_name
        self.audio_hidden_size = audio_hidden_size
        self.audio_input_size = 376  # From MFCC dimensions

        audio_config = AUDIO_BACKBONES[audio_backbone_name]
        self.audio_backbone = audio_config["model"](self.audio_input_size, self.audio_hidden_size)
        audio_output_dim = audio_config["output_dim"](self.audio_hidden_size)
        self.audio_fc = torch.nn.Linear(audio_output_dim, 128)

        # Image backbone setup
        self.image_backbone_name = image_backbone_name
        image_config = IMAGE_BACKBONES[image_backbone_name]
        self.image_backbone = image_config["model"](weights=image_config["weights"])

        # Record the feature dimension, then replace the final classification
        # layer with Identity so the backbone returns features
        if image_backbone_name.startswith("resnet"):
            self.image_output_dim = image_config["output_dim"](self.image_backbone)
            self.image_backbone.fc = torch.nn.Identity()
        elif image_backbone_name.startswith("efficientnet"):
            self.image_output_dim = image_config["output_dim"](self.image_backbone)
            self.image_backbone.classifier = torch.nn.Identity()
        elif image_backbone_name.startswith("convnext"):
            self.image_output_dim = image_config["output_dim"](self.image_backbone)
            # Replace only the final Linear layer so the LayerNorm/Flatten in the
            # ConvNeXt classifier still produce flat (N, C) features
            self.image_backbone.classifier[2] = torch.nn.Identity()
        elif image_backbone_name.startswith("swin"):
            self.image_output_dim = image_config["output_dim"](self.image_backbone)
            self.image_backbone.head = torch.nn.Identity()

        self.image_fc = torch.nn.Linear(self.image_output_dim, 128)

        # Fully connected layers for final prediction
        self.fc1 = torch.nn.Linear(256, 64)
        self.fc2 = torch.nn.Linear(64, 1)
        self.relu = torch.nn.ReLU()
    def forward(self, mfcc, image):
        # Audio backbone processing
        if self.audio_backbone_name in ("lstm", "gru", "bidirectional_lstm"):
            audio_output, _ = self.audio_backbone(mfcc)
            audio_output = audio_output[:, -1, :]  # Use the output of the last time step
        elif self.audio_backbone_name == "transformer":
            audio_output = self.audio_backbone(mfcc)
            audio_output = audio_output.mean(dim=1)  # Average pooling over sequence length
        audio_output = self.audio_fc(audio_output)

        # Image backbone processing
        image_output = self.image_backbone(image)
        image_output = self.image_fc(image_output)

        # Concatenate audio and image outputs
        merged = torch.cat((audio_output, image_output), dim=1)

        # Fully connected layers
        output = self.relu(self.fc1(merged))
        output = self.fc2(output)
        return output
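
# Minimal shape sanity check (added sketch, not called anywhere in this script).
# It assumes process_audio_data yields (batch, seq_len, 376) MFCC tensors and
# process_image_data yields (batch, 3, 224, 224) images; adjust the dummy
# shapes if the real preprocessing differs.
def _smoke_test_forward(image_backbone="resnet50", audio_backbone="lstm"):
    model = WatermelonModelModular(image_backbone, audio_backbone).to(device)
    model.eval()
    with torch.no_grad():
        dummy_mfcc = torch.randn(2, 10, 376, device=device)       # (batch, time, features)
        dummy_image = torch.randn(2, 3, 224, 224, device=device)  # (batch, C, H, W)
        out = model(dummy_mfcc, dummy_image)
    assert out.shape == (2, 1), f"unexpected output shape {out.shape}"
    return out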
def evaluate_model(data_dir, image_backbone, audio_backbone, audio_hidden_size=128, save_model_dir=None):
    # Adjust batch size based on model complexity to avoid OOM errors
    adjusted_batch_size = batch_size
    # Models that typically require more memory get smaller batch sizes
    if image_backbone in ["swin_b", "convnext_base"] or audio_backbone in ["transformer", "bidirectional_lstm"]:
        adjusted_batch_size = max(4, batch_size // 2)  # Halve the batch size, but keep at least 4
        print(f"\033[92mINFO\033[0m: Adjusted batch size to {adjusted_batch_size} for larger model")

    # Create dataset
    dataset = WatermelonDataset(data_dir)
    n_samples = len(dataset)

    # Split dataset 70/20/10 into train/validation/test
    train_size = int(0.7 * n_samples)
    val_size = int(0.2 * n_samples)
    test_size = n_samples - train_size - val_size
    train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
        dataset, [train_size, val_size, test_size]
    )

    train_loader = DataLoader(train_dataset, batch_size=adjusted_batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=adjusted_batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=adjusted_batch_size, shuffle=False)

    # Initialize model
    model = WatermelonModelModular(image_backbone, audio_backbone, audio_hidden_size).to(device)

    # Loss function and optimizer
    criterion = torch.nn.MSELoss()
    mae_criterion = torch.nn.L1Loss()  # For MAE evaluation
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    print(f"\033[92mINFO\033[0m: Evaluating model with {image_backbone} (image) and {audio_backbone} (audio)")
    print(f"\033[92mINFO\033[0m: Training samples: {len(train_dataset)}")
    print(f"\033[92mINFO\033[0m: Validation samples: {len(val_dataset)}")
    print(f"\033[92mINFO\033[0m: Test samples: {len(test_dataset)}")
    print(f"\033[92mINFO\033[0m: Batch size: {adjusted_batch_size}")
    # Training loop (one epoch, just for evaluation)
    print(f"\033[92mINFO\033[0m: Training for evaluation...")
    model.train()
    running_loss = 0.0

    # Wrap with tqdm for progress visualization
    train_iterator = tqdm(train_loader, desc="Training")
    for i, (mfcc, image, label) in enumerate(train_iterator):
        try:
            mfcc, image, label = mfcc.to(device), image.to(device), label.to(device)
            optimizer.zero_grad()
            output = model(mfcc, image)
            label = label.view(-1, 1).float()
            loss = criterion(output, label)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            train_iterator.set_postfix({"Loss": f"{loss.item():.4f}"})

            # Clear memory after each batch
            if device.type == 'cuda':
                del mfcc, image, label, output, loss
                torch.cuda.empty_cache()
        except Exception as e:
            print(f"\033[91mERR!\033[0m: Error in training batch {i}: {e}")
            # Clear memory in case of error
            if device.type == 'cuda':
                torch.cuda.empty_cache()
            continue
    # Validation phase
    print(f"\033[92mINFO\033[0m: Validating...")
    model.eval()
    val_loss = 0.0
    val_mae = 0.0

    val_iterator = tqdm(val_loader, desc="Validation")
    with torch.no_grad():
        for i, (mfcc, image, label) in enumerate(val_iterator):
            try:
                mfcc, image, label = mfcc.to(device), image.to(device), label.to(device)
                output = model(mfcc, image)
                label = label.view(-1, 1).float()

                # Calculate MSE loss
                loss = criterion(output, label)
                val_loss += loss.item()

                # Calculate MAE
                mae = mae_criterion(output, label)
                val_mae += mae.item()

                val_iterator.set_postfix({"MSE": f"{loss.item():.4f}", "MAE": f"{mae.item():.4f}"})

                # Clear memory after each batch
                if device.type == 'cuda':
                    del mfcc, image, label, output, loss, mae
                    torch.cuda.empty_cache()
            except Exception as e:
                print(f"\033[91mERR!\033[0m: Error in validation batch {i}: {e}")
                # Clear memory in case of error
                if device.type == 'cuda':
                    torch.cuda.empty_cache()
                continue

    avg_val_loss = val_loss / len(val_loader) if len(val_loader) > 0 else float('inf')
    avg_val_mae = val_mae / len(val_loader) if len(val_loader) > 0 else float('inf')
    # Test phase
    print(f"\033[92mINFO\033[0m: Testing...")
    model.eval()
    test_loss = 0.0
    test_mae = 0.0

    test_iterator = tqdm(test_loader, desc="Testing")
    with torch.no_grad():
        for i, (mfcc, image, label) in enumerate(test_iterator):
            try:
                mfcc, image, label = mfcc.to(device), image.to(device), label.to(device)
                output = model(mfcc, image)
                label = label.view(-1, 1).float()

                # Calculate MSE loss
                loss = criterion(output, label)
                test_loss += loss.item()

                # Calculate MAE
                mae = mae_criterion(output, label)
                test_mae += mae.item()

                test_iterator.set_postfix({"MSE": f"{loss.item():.4f}", "MAE": f"{mae.item():.4f}"})

                # Clear memory after each batch
                if device.type == 'cuda':
                    del mfcc, image, label, output, loss, mae
                    torch.cuda.empty_cache()
            except Exception as e:
                print(f"\033[91mERR!\033[0m: Error in test batch {i}: {e}")
                # Clear memory in case of error
                if device.type == 'cuda':
                    torch.cuda.empty_cache()
                continue

    avg_test_loss = test_loss / len(test_loader) if len(test_loader) > 0 else float('inf')
    avg_test_mae = test_mae / len(test_loader) if len(test_loader) > 0 else float('inf')
    results = {
        "image_backbone": image_backbone,
        "audio_backbone": audio_backbone,
        "validation_mse": avg_val_loss,
        "validation_mae": avg_val_mae,
        "test_mse": avg_test_loss,
        "test_mae": avg_test_mae,
    }

    print(f"\033[92mINFO\033[0m: Evaluation Results:")
    print(f"Image Backbone: {image_backbone}")
    print(f"Audio Backbone: {audio_backbone}")
    print(f"Validation MSE: {avg_val_loss:.4f}")
    print(f"Validation MAE: {avg_val_mae:.4f}")
    print(f"Test MSE: {avg_test_loss:.4f}")
    print(f"Test MAE: {avg_test_mae:.4f}")

    # Save model if save_model_dir is provided
    if save_model_dir:
        os.makedirs(save_model_dir, exist_ok=True)
        model_filename = f"{image_backbone}_{audio_backbone}_model.pt"
        model_path = os.path.join(save_model_dir, model_filename)
        torch.save(model.state_dict(), model_path)
        print(f"\033[92mINFO\033[0m: Model saved to {model_path}")
        # Add model path to results
        results["model_path"] = model_path

    # Clean up memory before returning
    if device.type == 'cuda':
        del model, optimizer, criterion, mae_criterion
        torch.cuda.empty_cache()

    return results
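
# Example call (hypothetical paths, shown for illustration only):
#   result = evaluate_model("../cleaned", "resnet50", "lstm", save_model_dir="test_models")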
def evaluate_all_combinations(data_dir, image_backbones=None, audio_backbones=None, save_model_dir="test_models", results_file="backbone_evaluation_results.json"):
    if image_backbones is None:
        image_backbones = list(IMAGE_BACKBONES.keys())
    if audio_backbones is None:
        audio_backbones = list(AUDIO_BACKBONES.keys())

    # Create directory for saving models
    if save_model_dir:
        os.makedirs(save_model_dir, exist_ok=True)

    # Load previous results if the file exists
    results = []
    evaluated_combinations = set()
    if os.path.exists(results_file):
        try:
            with open(results_file, 'r') as f:
                results = json.load(f)
            evaluated_combinations = {(r["image_backbone"], r["audio_backbone"]) for r in results}
            print(f"\033[92mINFO\033[0m: Loaded {len(results)} previous results from {results_file}")
        except Exception as e:
            print(f"\033[91mERR!\033[0m: Error loading previous results from {results_file}: {e}")
            results = []
            evaluated_combinations = set()
    else:
        print(f"\033[93mWARN\033[0m: Results file '{results_file}' does not exist. Starting with empty results.")

    # Create combinations to evaluate, skipping any that have already been evaluated
    combinations = [(img, aud) for img in image_backbones for aud in audio_backbones
                    if (img, aud) not in evaluated_combinations]

    if len(combinations) < len(image_backbones) * len(audio_backbones):
        print(f"\033[92mINFO\033[0m: Skipping {len(evaluated_combinations)} already evaluated combinations")

    print(f"\033[92mINFO\033[0m: Will evaluate {len(combinations)} combinations")
    for image_backbone, audio_backbone in combinations:
        print(f"\033[92mINFO\033[0m: Evaluating {image_backbone} + {audio_backbone}")
        try:
            # Clean GPU memory before each model evaluation
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                print(f"\033[92mINFO\033[0m: CUDA memory cleared before evaluation")
                # Print memory usage for debugging
                print(f"\033[92mINFO\033[0m: CUDA memory allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
                print(f"\033[92mINFO\033[0m: CUDA memory reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")

            result = evaluate_model(data_dir, image_backbone, audio_backbone, save_model_dir=save_model_dir)
            results.append(result)

            # Save results after each evaluation
            save_results(results, results_file)
            print(f"\033[92mINFO\033[0m: Updated results saved to {results_file}")

            # Force garbage collection to free memory
            import gc
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                print(f"\033[92mINFO\033[0m: CUDA memory cleared after evaluation")
                # Print memory usage for debugging
                print(f"\033[92mINFO\033[0m: CUDA memory allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
                print(f"\033[92mINFO\033[0m: CUDA memory reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")
        except Exception as e:
            print(f"\033[91mERR!\033[0m: Error evaluating {image_backbone} + {audio_backbone}: {e}")
            print(f"\033[91mERR!\033[0m: To continue from this point, use --start_from={image_backbone}:{audio_backbone}")
            # Force garbage collection to free memory even if there's an error
            import gc
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                print(f"\033[92mINFO\033[0m: CUDA memory cleared after error")
            continue

    # Sort results by test MAE (ascending)
    results.sort(key=lambda x: x["test_mae"])

    # Save final sorted results
    save_results(results, results_file)

    print("\n\033[92mINFO\033[0m: === FINAL RESULTS (Sorted by Test MAE) ===")
    print(f"{'Image Backbone':<20} {'Audio Backbone':<20} {'Val MAE':<10} {'Test MAE':<10}")
    print("=" * 60)
    for result in results:
        print(f"{result['image_backbone']:<20} {result['audio_backbone']:<20} {result['validation_mae']:<10.4f} {result['test_mae']:<10.4f}")

    return results
def save_results(results, filename="backbone_evaluation_results.json"):
    """Save evaluation results to a JSON file."""
    with open(filename, 'w') as f:
        json.dump(results, f, indent=4)
    print(f"\033[92mINFO\033[0m: Results saved to {filename}")
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Evaluate Different Backbones for Watermelon Sweetness Prediction")
    parser.add_argument(
        "--data_dir",
        type=str,
        default="../cleaned",
        help="Path to the cleaned dataset directory"
    )
    parser.add_argument(
        "--image_backbone",
        type=str,
        default=None,
        help="Specific image backbone to evaluate (leave empty to evaluate all available)"
    )
    parser.add_argument(
        "--audio_backbone",
        type=str,
        default=None,
        help="Specific audio backbone to evaluate (leave empty to evaluate all available)"
    )
    parser.add_argument(
        "--evaluate_all",
        action="store_true",
        help="Evaluate all combinations of backbones"
    )
    parser.add_argument(
        "--start_from",
        type=str,
        default=None,
        help="Start evaluation from a specific combination, format: 'image_backbone:audio_backbone'"
    )
    parser.add_argument(
        "--prioritize_efficient",
        action="store_true",
        help="Prioritize more efficient models first to avoid memory issues"
    )
    parser.add_argument(
        "--results_file",
        type=str,
        default="backbone_evaluation_results.json",
        help="File to save the evaluation results"
    )
    parser.add_argument(
        "--load_previous_results",
        action="store_true",
        help="Load previous results from results_file if it exists"
    )
    parser.add_argument(
        "--model_dir",
        type=str,
        default="test_models",
        help="Directory to save model checkpoints"
    )
    args = parser.parse_args()

    # Create model directory if it doesn't exist
    if args.model_dir:
        os.makedirs(args.model_dir, exist_ok=True)

    print(f"\033[92mINFO\033[0m: === Available Image Backbones ===")
    for name in IMAGE_BACKBONES.keys():
        print(f"- {name}")
    print(f"\033[92mINFO\033[0m: === Available Audio Backbones ===")
    for name in AUDIO_BACKBONES.keys():
        print(f"- {name}")
    if args.evaluate_all:
        evaluate_all_combinations(args.data_dir, results_file=args.results_file, save_model_dir=args.model_dir)
    elif args.image_backbone and args.audio_backbone:
        result = evaluate_model(args.data_dir, args.image_backbone, args.audio_backbone, save_model_dir=args.model_dir)
        save_results([result], args.results_file)
    else:
        # Define a default set of backbones to evaluate if not specified
        if args.prioritize_efficient:
            # Start with less memory-intensive models
            image_backbones = ["resnet50", "efficientnet_b0", "resnet101", "efficientnet_b3", "convnext_base", "swin_b"]
            audio_backbones = ["lstm", "gru", "bidirectional_lstm", "transformer"]
        else:
            # Default selection focusing on better performance models
            image_backbones = ["resnet101", "efficientnet_b3", "swin_b"]
            audio_backbones = ["lstm", "bidirectional_lstm", "transformer"]

        # Create all combinations
        combinations = [(img, aud) for img in image_backbones for aud in audio_backbones]

        # Load previous results if requested and file exists
        previous_results = []
        previous_combinations = set()
        if args.load_previous_results:
            try:
                if os.path.exists(args.results_file):
                    with open(args.results_file, 'r') as f:
                        previous_results = json.load(f)
                    previous_combinations = {(r["image_backbone"], r["audio_backbone"]) for r in previous_results}
                    print(f"\033[92mINFO\033[0m: Loaded {len(previous_results)} previous results")
                else:
                    print(f"\033[93mWARN\033[0m: Results file '{args.results_file}' does not exist. Starting with empty results.")
            except Exception as e:
                print(f"\033[91mERR!\033[0m: Error loading previous results: {e}")
                previous_results = []
                previous_combinations = set()

        # If starting from a specific point
        if args.start_from:
            try:
                start_img, start_aud = args.start_from.split(':')
                start_idx = combinations.index((start_img, start_aud))
                combinations = combinations[start_idx:]
                print(f"\033[92mINFO\033[0m: Starting from combination: {start_img} (image) + {start_aud} (audio)")
            except (ValueError, IndexError):
                print(f"\033[91mERR!\033[0m: Invalid start_from format or combination not found. Format should be 'image_backbone:audio_backbone'")
                print(f"\033[91mERR!\033[0m: Continuing with all combinations.")

        # Skip combinations that have already been evaluated
        if previous_combinations:
            original_count = len(combinations)
            combinations = [(img, aud) for img, aud in combinations if (img, aud) not in previous_combinations]
            print(f"\033[92mINFO\033[0m: Skipping {original_count - len(combinations)} already evaluated combinations")

        # Evaluate each combination
        results = previous_results.copy()
        for img_backbone, audio_backbone in combinations:
            print(f"\033[92mINFO\033[0m: Evaluating {img_backbone} + {audio_backbone}")
            try:
                # Clean GPU memory before each model evaluation
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                    print(f"\033[92mINFO\033[0m: CUDA memory cleared before evaluation")
                    print(f"\033[92mINFO\033[0m: CUDA memory allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
                    print(f"\033[92mINFO\033[0m: CUDA memory reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")

                result = evaluate_model(args.data_dir, img_backbone, audio_backbone, save_model_dir=args.model_dir)
                results.append(result)

                # Save results after each evaluation
                save_results(results, args.results_file)

                # Force garbage collection to free memory
                import gc
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                    print(f"\033[92mINFO\033[0m: CUDA memory cleared after evaluation")
                    print(f"\033[92mINFO\033[0m: CUDA memory allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
                    print(f"\033[92mINFO\033[0m: CUDA memory reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")
            except Exception as e:
                print(f"\033[91mERR!\033[0m: Error evaluating {img_backbone} + {audio_backbone}: {e}")
                print(f"\033[91mERR!\033[0m: To continue from this point later, use --start_from={img_backbone}:{audio_backbone}")
                # Force garbage collection to free memory even if there's an error
                import gc
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                    print(f"\033[92mINFO\033[0m: CUDA memory cleared after error")
                continue

        # Sort results by test MAE (ascending)
        results.sort(key=lambda x: x["test_mae"])

        # Save final sorted results
        save_results(results, args.results_file)

        print("\n\033[92mINFO\033[0m: === FINAL RESULTS (Sorted by Test MAE) ===")
        print(f"{'Image Backbone':<20} {'Audio Backbone':<20} {'Val MAE':<10} {'Test MAE':<10}")
        print("=" * 60)
        for result in results:
            print(f"{result['image_backbone']:<20} {result['audio_backbone']:<20} {result['validation_mae']:<10.4f} {result['test_mae']:<10.4f}")