Spaces:

AIvry
/

MAPSS-measures

Sleeping

App Files Files Community

AIvry committed on Sep 15

Commit

b759ccc

verified ·

1 Parent(s): 424791f

Upload 11 files

Browse files

Files changed (10) hide show

app.py +35 -17
argshield.py +1 -2
audio.py +49 -32
config.py +35 -35
distortions.py +6 -2
engine.py +206 -185
main.py +8 -0
metrics.py +64 -12
models.py +43 -18
utils.py +27 -90

app.py CHANGED Viewed

@@ -66,6 +66,8 @@ def process_audio_files(zip_file, model_name, layer, alpha):
             return None, "No reference WAV files found"
         if len(out_files) == 0:
             return None, "No output WAV files found"
         # Create manifest
         manifest = [{
@@ -92,8 +94,8 @@ def process_audio_files(zip_file, model_name, layer, alpha):
             }
             layer_final = layer if layer is not None else model_defaults.get(model_name, 12)
-        # Check GPU availability
-        max_gpus = 1 if torch.cuda.is_available() else 0
         # Run experiment
         results_dir = compute_mapss_measures(
@@ -103,7 +105,7 @@ def process_audio_files(zip_file, model_name, layer, alpha):
             alpha=alpha,
             verbose=True,
             max_gpus=max_gpus,
-            add_ci=False
         )
         # Create output ZIP at a fixed location
@@ -121,7 +123,7 @@ def process_audio_files(zip_file, model_name, layer, alpha):
                     files_added += 1
         if output_zip.exists() and files_added > 0:
-            return str(output_zip), f"Processing completed! Created ZIP with {files_added} files."
         else:
             return None, f"Processing completed but no output files were generated. Check if embeddings were computed."
@@ -143,6 +145,13 @@ def create_interface():
         - **Perceptual Matching (PM)**: Measures how closely an output perceptually aligns with its reference. Range: 0-1, higher is better.
         - **Perceptual Similarity (PS)**: Measures how well an output is separated from its interfering references. Range: 0-1, higher is better.
         ## Input Format
         Upload a ZIP file containing:
@@ -152,9 +161,9 @@ def create_interface():
         │   ├── speaker1.wav
         │   ├── speaker2.wav
         │   └── ...
-        └── outputs/         # Separated outputs from your algorithm
-            ├── separated1.wav
-            ├── separated2.wav
             └── ...
         ```
@@ -162,16 +171,22 @@ def create_interface():
         - Format: .wav files
         - Sample rate: Any (automatically resampled to 16kHz)
         - Channels: Mono or stereo (converted to mono)
-        - Number of files: Equal number of references and outputs
         ## Output Format
         The tool generates a ZIP file containing:
-        - `ps_scores_{model}.csv`: PS scores for each source
-        - `pm_scores_{model}.csv`: PM scores for each source
         - `params.json`: Parameters used
         - `manifest_canonical.json`: File mapping and processing details
         ## Available Models
         | Model | Description | Default Layer | Use Case |
@@ -179,10 +194,10 @@ def create_interface():
         | `raw` | Raw waveform features | N/A | Baseline comparison |
         | `wavlm` | WavLM Large | 24 | Strong performance |
         | `wav2vec2` | Wav2Vec2 Large | 24 | Best overall performance |
-        | `hubert` | HuBERT Large | 24 | |
-        | `wavlm_base` | WavLM Base | 12 |  |
         | `wav2vec2_base` | Wav2Vec2 Base | 12 | Faster, good quality |
-        | `hubert_base` | HuBERT Base | 12 | |
         | `wav2vec2_xlsr` | Wav2Vec2 XLSR-53 | 24 | Multilingual |
         ## Parameters
@@ -193,6 +208,13 @@ def create_interface():
           - 0.0 = No normalization
           - 1.0 = Full normalization (recommended)
         ## Citation
         If you use MAPSS, please cite:
@@ -207,10 +229,6 @@ def create_interface():
         }
         ```
-        ## Limitations
-        - Processing time scales with number of sources, audio length and model size
         ## License
         Code: MIT License

             return None, "No reference WAV files found"
         if len(out_files) == 0:
             return None, "No output WAV files found"
+        if len(ref_files) != len(out_files):
+            return None, f"Number of reference files ({len(ref_files)}) must match number of output files ({len(out_files)}). Files must be in the same order."
         # Create manifest
         manifest = [{
             }
             layer_final = layer if layer is not None else model_defaults.get(model_name, 12)
+        # Check GPU availability - use all available GPUs on the space
+        max_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0
         # Run experiment
         results_dir = compute_mapss_measures(
             alpha=alpha,
             verbose=True,
             max_gpus=max_gpus,
+            add_ci=False  # Disable CI for faster processing in demo
         )
         # Create output ZIP at a fixed location
                     files_added += 1
         if output_zip.exists() and files_added > 0:
+            return str(output_zip), f"Processing completed! Created ZIP with {files_added} files. Note: Output files must be in the same order as reference files."
         else:
             return None, f"Processing completed but no output files were generated. Check if embeddings were computed."
         - **Perceptual Matching (PM)**: Measures how closely an output perceptually aligns with its reference. Range: 0-1, higher is better.
         - **Perceptual Similarity (PS)**: Measures how well an output is separated from its interfering references. Range: 0-1, higher is better.
+        ## ⚠️ IMPORTANT: File Order Requirements
+        **Output files MUST be in the same order as reference files!**
+        - If references are: `speaker1.wav`, `speaker2.wav`, `speaker3.wav`
+        - Then outputs must be: `output1.wav`, `output2.wav`, `output3.wav`
+        - Where `output1` corresponds to `speaker1`, `output2` to `speaker2`, etc.
         ## Input Format
         Upload a ZIP file containing:
         │   ├── speaker1.wav
         │   ├── speaker2.wav
         │   └── ...
+        └── outputs/         # Separated outputs (SAME ORDER as references)
+            ├── separated1.wav  # Must correspond to speaker1.wav
+            ├── separated2.wav  # Must correspond to speaker2.wav
             └── ...
         ```
         - Format: .wav files
         - Sample rate: Any (automatically resampled to 16kHz)
         - Channels: Mono or stereo (converted to mono)
+        - **Number of files: Equal number of references and outputs**
+        - **Order: Output files must be in the same order as reference files**
         ## Output Format
         The tool generates a ZIP file containing:
+        - `ps_scores_{model}.csv`: PS scores for each source over time
+        - `pm_scores_{model}.csv`: PM scores for each source over time
         - `params.json`: Parameters used
         - `manifest_canonical.json`: File mapping and processing details
+        ### Score Interpretation
+        - **NaN values**: Appear in frames where fewer than 2 speakers are active
+        - **Valid scores**: Only computed when at least 2 speakers are active in a frame
+        - **Time resolution**: 20ms frames (configurable in code)
         ## Available Models
         | Model | Description | Default Layer | Use Case |
         | `raw` | Raw waveform features | N/A | Baseline comparison |
         | `wavlm` | WavLM Large | 24 | Strong performance |
         | `wav2vec2` | Wav2Vec2 Large | 24 | Best overall performance |
+        | `hubert` | HuBERT Large | 24 | Good for speech |
+        | `wavlm_base` | WavLM Base | 12 | Faster processing |
         | `wav2vec2_base` | Wav2Vec2 Base | 12 | Faster, good quality |
+        | `hubert_base` | HuBERT Base | 12 | Faster processing |
         | `wav2vec2_xlsr` | Wav2Vec2 XLSR-53 | 24 | Multilingual |
         ## Parameters
           - 0.0 = No normalization
           - 1.0 = Full normalization (recommended)
+        ## Processing Notes
+        - The system automatically detects which speakers are active in each frame
+        - PS/PM scores are only computed between active speakers
+        - Processing time scales with number of sources and audio length
+        - GPU acceleration is automatically used when available
         ## Citation
         If you use MAPSS, please cite:
         }
         ```
         ## License
         Code: MIT License

argshield.py CHANGED Viewed

@@ -7,7 +7,6 @@ import importlib.util
 from config import DEFAULT_ALPHA
 from models import get_model_config
-# Central table for default layers per model (kept identical to original table)
 MODEL_DEFAULT_LAYER = {
     "raw": None,
     "wavlm": 24,
@@ -31,7 +30,7 @@ def _read_manifest_py(path: Path):
     if spec is None or spec.loader is None:
         raise SystemExit(f"Could not load Python manifest: {path}")
     mod = importlib.util.module_from_spec(spec)
-    spec.loader.exec_module(mod)  # executes the .py file
     if not hasattr(mod, "MANIFEST"):
         raise SystemExit(f"Python manifest {path} must define a top-level variable MANIFEST")

 from config import DEFAULT_ALPHA
 from models import get_model_config
 MODEL_DEFAULT_LAYER = {
     "raw": None,
     "wavlm": 24,
     if spec is None or spec.loader is None:
         raise SystemExit(f"Could not load Python manifest: {path}")
     mod = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(mod)
     if not hasattr(mod, "MANIFEST"):
         raise SystemExit(f"Python manifest {path} must define a top-level variable MANIFEST")

audio.py CHANGED Viewed

@@ -1,15 +1,21 @@
-import librosa
 import numpy as np
 import pyloudnorm as pyln
 import torch
 from config import SILENCE_RATIO, SR
-from utils import hungarian, safe_corr_np
 import warnings
 warnings.filterwarnings("ignore", message="Possible clipped samples in output.")
 def loudness_normalize(wav, sr=SR, target_lufs=-23.0):
     meter = pyln.Meter(sr)
     loudness = meter.integrated_loudness(wav)
     normalized_wav = pyln.normalize.loudness(wav, loudness, target_lufs)
@@ -20,42 +26,53 @@ def loudness_normalize(wav, sr=SR, target_lufs=-23.0):
 def frame_rms_torch(sig, win, hop):
     dev = sig.device
     frames = sig.unfold(0, win, hop)
     if frames.size(0) and (frames.size(0) - 1) * hop == sig.numel() - win:
         frames = frames[:-1]
-    rms = torch.sqrt((frames**2).mean(1) + 1e-12)
     return rms.to(dev)
-def make_union_voiced_mask(refs_tensors, win, hop):
     device = refs_tensors[0].device
-    rms_vecs = [frame_rms_torch(r, win, hop) for r in refs_tensors]
-    lengths = [v.numel() for v in rms_vecs]
     L_max = max(lengths)
-    silent_union = torch.zeros(L_max, dtype=torch.bool, device=device)
-    for idx, (rms, L) in enumerate(zip(rms_vecs, lengths)):
-        thr = SILENCE_RATIO * torch.sqrt((refs_tensors[idx] ** 2).mean())
-        sil = rms <= thr
-        silent_union[:L] |= sil
-    return ~silent_union
-def assign_outputs_to_refs_by_corr(ref_paths, out_paths):
-    if not out_paths:
-        return [None] * len(ref_paths)
-    refs = [loudness_normalize(librosa.load(str(p), sr=SR)[0]) for p in ref_paths]
-    outs = [loudness_normalize(librosa.load(str(p), sr=SR)[0]) for p in out_paths]
-    n, m = len(refs), len(outs)
-    K = max(n, m)
-    C = np.ones((K, K), dtype=np.float64)
-    for i in range(n):
-        for j in range(m):
-            r = safe_corr_np(refs[i], outs[j])
-            C[i, j] = 1.0 - (r + 1.0) * 0.5  # lower = better
-    ri, cj = hungarian(C)
-    mapping = [None] * n
-    for i, j in zip(ri, cj):
-        if i < n and j < m:
-            mapping[i] = int(j)
-    return mapping

 import numpy as np
 import pyloudnorm as pyln
 import torch
 from config import SILENCE_RATIO, SR
 import warnings
 warnings.filterwarnings("ignore", message="Possible clipped samples in output.")
 def loudness_normalize(wav, sr=SR, target_lufs=-23.0):
+    """
+    Apply loudness normalization on an audio signal.
+    :param wav: waveform signal to normalize.
+    :param sr: sampling rate.
+    :param target_lufs: LUFS points to normalize to.
+    :return: normalized signal.
+    """
     meter = pyln.Meter(sr)
     loudness = meter.integrated_loudness(wav)
     normalized_wav = pyln.normalize.loudness(wav, loudness, target_lufs)
 def frame_rms_torch(sig, win, hop):
     """
     Calculates the frame-wise RMS of a signal with a moving window.

     :param sig: signal for calculation (a 1-D tensor is assumed, since the
         frame count is checked against ``sig.numel()``).
     :param win: analysis window size, in samples.
     :param hop: analysis window hop size, in samples.
     :return: tensor with one RMS value per retained frame, on the same device
         as ``sig``. Empty when the signal holds no more than one window.
     """
     # A signal shorter than one window has no complete frame and ``unfold``
     # would raise. Return an empty vector instead, which is what a signal of
     # exactly ``win`` samples already produces below.
     if sig.numel() < win:
         return sig.new_zeros(0)

     frames = sig.unfold(0, win, hop)
     n_frames = frames.size(0)
     # When the windows tile the signal exactly, the final frame is dropped.
     # NOTE(review): presumably this keeps the frame count aligned with the
     # embedding frames computed elsewhere - confirm against the caller.
     if n_frames and (n_frames - 1) * hop == sig.numel() - win:
         frames = frames[:-1]

     # The small constant keeps the square root strictly positive for silent frames.
     return torch.sqrt((frames ** 2).mean(1) + 1e-12)
def compute_speaker_activity_masks(refs_tensors, win, hop):
    """
    Computes individual voice activity for each speaker and determines which
    frames have at least 2 active speakers.

    A frame of a reference counts as voiced when its RMS exceeds
    ``SILENCE_RATIO`` times the RMS of that whole reference. Masks of shorter
    references are right-padded with ``False`` up to the longest mask.

    :param refs_tensors: non-empty sequence of references that compose the mixture.
    :param win: analysis window size, in samples.
    :param hop: analysis window hop size, in samples.
    :return: (multi_speaker_mask, individual_speaker_masks)
        - multi_speaker_mask: boolean mask of frames where at least 2 speakers are active
        - individual_speaker_masks: list of boolean masks, one per speaker,
          all padded to the same length
    :raises ValueError: if ``refs_tensors`` is empty.
    """
    if len(refs_tensors) == 0:
        # Fail with a clear message instead of an IndexError / empty max().
        raise ValueError("refs_tensors must contain at least one reference signal")

    individual_masks = []
    for ref in refs_tensors:
        rms = frame_rms_torch(ref, win, hop)
        # The silence threshold is relative to this reference's overall RMS.
        # NOTE(review): for an all-zero reference the threshold is 0, so the
        # result depends on the floor frame_rms_torch applies - confirm that
        # silent references cannot reach this point.
        threshold = SILENCE_RATIO * torch.sqrt((ref ** 2).mean())
        individual_masks.append(rms > threshold)

    L_max = max(mask.numel() for mask in individual_masks)

    # Right-pad shorter masks with False so that all masks can be stacked.
    padded_masks = [
        mask
        if mask.numel() == L_max
        else torch.cat([mask, mask.new_zeros(L_max - mask.numel())])
        for mask in individual_masks
    ]

    # Count the active speakers per frame and keep frames with two or more.
    active_count = torch.stack(padded_masks, dim=0).sum(dim=0)
    multi_speaker_mask = active_count >= 2
    return multi_speaker_mask, padded_masks

config.py CHANGED Viewed

@@ -1,35 +1,35 @@
-import os
-import torch
-import warnings
-warnings.filterwarnings(
-    "ignore",
-    category=UserWarning,
-    message=r"^expandable_segments not supported on this platform"
-)
-SR = 16_000
-RESULTS_ROOT = "results"
-BATCH_SIZE = 2
-ENERGY_WIN_MS = 20
-ENERGY_HOP_MS = 20
-SILENCE_RATIO = 0.1
-EPS = 1e-4
-COV_TOL = 1e-6
-DEFAULT_LAYER = 2
-DEFAULT_ADD_CI = True
-DEFAULT_DELTA_CI = 0.05
-DEFAULT_ALPHA = 1.0
-os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128,expandable_segments:True,garbage_collection_threshold:0.6"
-os.environ["CUDA_LAUNCH_BLOCKING"] = "0"
-torch.backends.cudnn.benchmark = True
-torch.backends.cudnn.deterministic = False
-torch.backends.cudnn.enabled = True
-# Only set CUDA memory fraction if we're not in the main process on HF Spaces
-if not (os.environ.get("SPACE_ID") and torch.cuda.is_available()):
-    if torch.cuda.is_available():
-        torch.cuda.set_per_process_memory_fraction(0.8)

+"""
+Basic configuration and default values used in the MAPSS computations.
+"""
+import os
+import torch
+import warnings
+warnings.filterwarnings(  # presumably silences the warning caused by the expandable_segments allocator option set below
+    "ignore",
+    category=UserWarning,
+    message=r"^expandable_segments not supported on this platform"
+)
+SR = 16_000  # audio sample rate in Hz
+RESULTS_ROOT = "results"  # presumably the root directory for experiment outputs - usage not visible here
+BATCH_SIZE = 2  # batch size used when embedding signals
+ENERGY_WIN_MS = 20  # energy-analysis window length in milliseconds
+ENERGY_HOP_MS = 20  # energy-analysis hop in milliseconds
+SILENCE_RATIO = 0.1  # a frame is voiced when its RMS exceeds this fraction of the whole signal's RMS
+EPS = 1e-4
+COV_TOL = 1e-6
+DEFAULT_LAYER = 2
+DEFAULT_ADD_CI = True  # presumably the default for the add_ci option - confirm against the caller
+DEFAULT_DELTA_CI = 0.05  # presumably the delta passed to the confidence-interval computations - confirm
+DEFAULT_ALPHA = 1.0  # default alpha
+os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128,expandable_segments:True,garbage_collection_threshold:0.6"  # NOTE(review): overwrites any user-provided value and is set after `import torch` - confirm it is still picked up
+os.environ["CUDA_LAUNCH_BLOCKING"] = "0"  # NOTE(review): overwrites any value already set in the environment
+torch.backends.cudnn.benchmark = True
+torch.backends.cudnn.deterministic = False
+torch.backends.cudnn.enabled = True
+if torch.cuda.is_available():
+    torch.cuda.set_per_process_memory_fraction(0.8)  # no device argument, so only the current CUDA device is capped

distortions.py CHANGED Viewed

@@ -1,3 +1,7 @@
 import librosa
 import numpy as np
 from numpy.fft import irfft, rfft, rfftfreq
@@ -156,7 +160,7 @@ def frame_distortions(
     return distortions
-def apply_adv_distortions(ref, distortion_keys, sr=SR):
     frame_len = int(ENERGY_WIN_MS * sr / 1000)
     n_frames = int(np.ceil(len(ref) / frame_len))
     pad_len = n_frames * frame_len - len(ref)
@@ -222,7 +226,7 @@ def apply_adv_distortions(ref, distortion_keys, sr=SR):
     return list(out.values())
-def apply_distortions(ref, distortion_keys, sr=SR):
     distortions = {}
     X = rfft(ref)
     freqs = rfftfreq(len(ref), 1 / sr)

+"""
+Distortion banks for the PS and PM computations.
+"""
 import librosa
 import numpy as np
 from numpy.fft import irfft, rfft, rfftfreq
     return distortions
+def apply_pm_distortions(ref, distortion_keys, sr=SR):
     frame_len = int(ENERGY_WIN_MS * sr / 1000)
     n_frames = int(np.ceil(len(ref) / frame_len))
     pad_len = n_frames * frame_len - len(ref)
     return list(out.values())
+def apply_ps_distortions(ref, distortion_keys, sr=SR):
     distortions = {}
     X = rfft(ref)
     freqs = rfftfreq(len(ref), 1 / sr)

engine.py CHANGED Viewed

@@ -4,14 +4,12 @@ from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime
 import librosa
 import pandas as pd
-import numpy as np
 from audio import (
-    assign_outputs_to_refs_by_corr,
     loudness_normalize,
-    make_union_voiced_mask,
 )
 from config import *
-from distortions import apply_adv_distortions, apply_distortions
 from metrics import (
     compute_pm,
     compute_ps,
@@ -38,6 +36,23 @@ def compute_mapss_measures(
         verbose=False,
         max_gpus=None,
 ):
     gpu_distributor = GPUWorkDistributor(max_gpus)
     ngpu = get_gpu_count(max_gpus)
@@ -64,13 +79,17 @@ def compute_mapss_measures(
     for m, mix_entries in zip(canon_mix, mixture_entries):
         for algo, out_list in (m.systems or {}).items():
-            mapping = assign_outputs_to_refs_by_corr(
-                [e["ref"] for e in mix_entries], out_list
-            )
             for idx, e in enumerate(mix_entries):
-                j = mapping[idx]
-                if j is not None:
-                    e["outs"][algo] = out_list[j]
     if algos is None:
         algos_to_run = sorted(
@@ -114,6 +133,7 @@ def compute_mapss_measures(
     print(f"Starting experiment {exp_id} with {ngpu} GPUs")
     print(f"Results will be saved to: {exp_root}")
     clear_gpu_memory()
     get_gpu_memory_info(verbose)
@@ -128,63 +148,53 @@ def compute_mapss_measures(
         all_refs[e["id"]] = torch.from_numpy(loudness_normalize(wav))
     if verbose:
-        print("Computing voiced masks...")
     win = int(ENERGY_WIN_MS * SR / 1000)
     hop = int(ENERGY_HOP_MS * SR / 1000)
-    voiced_mask_mix = []
-    total_frames_per_mix = []  # Store total frames for each mixture
     for i, mix in enumerate(mixture_entries):
         if verbose:
-            print(f"  Computing mask for mixture {i + 1}/{len(mixture_entries)}")
         if ngpu > 0:
             with torch.cuda.device(0):
                 refs_for_mix = [all_refs[e["id"]].cuda() for e in mix]
-                mask = make_union_voiced_mask(refs_for_mix, win, hop)
-                voiced_mask_mix.append(mask.cpu())
-                total_frames_per_mix.append(mask.shape[0])
-                # Explicitly delete GPU tensors
                 for ref in refs_for_mix:
                     del ref
                 torch.cuda.empty_cache()
         else:
             refs_for_mix = [all_refs[e["id"]].cpu() for e in mix]
-            mask = make_union_voiced_mask(refs_for_mix, win, hop)
-            voiced_mask_mix.append(mask.cpu())
-            total_frames_per_mix.append(mask.shape[0])
     ordered_speakers = [e["id"] for e in flat_entries]
-    # Initialize storage for all mixtures and algorithms
-    all_mixture_results = {}  # mixture_id -> {algo -> {model -> data}}
     for mix_idx, (mix_canon, mix_entries) in enumerate(zip(canon_mix, mixture_entries)):
         mixture_id = mix_canon.mixture_id
         all_mixture_results[mixture_id] = {}
-        # Get total frames for this mixture
         total_frames = total_frames_per_mix[mix_idx]
-        # Get speakers for this mixture
         mixture_speakers = [e["id"] for e in mix_entries]
         for algo_idx, algo in enumerate(algos_to_run):
             if verbose:
                 print(f"\nProcessing Mixture {mixture_id}, Algorithm {algo_idx + 1}/{len(algos_to_run)}: {algo}")
-            # Remove the old algo_dir creation here - we don't need these empty folders anymore
             all_outs = {}
             missing = []
             for e in mix_entries:
                 assigned_path = e.get("outs", {}).get(algo)
                 if assigned_path is None:
                     missing.append((e["mixture"], e["id"]))
                     continue
                 wav, _ = librosa.load(str(assigned_path), sr=SR)
                 all_outs[e["id"]] = torch.from_numpy(loudness_normalize(wav))
@@ -201,11 +211,9 @@ def compute_mapss_measures(
                     warnings.warn(f"[{algo}] No outputs for mixture {mixture_id}. Skipping.")
                 continue
-            # Initialize storage for this algorithm
             if algo not in all_mixture_results[mixture_id]:
                 all_mixture_results[mixture_id][algo] = {}
-            # Initialize frame-wise storage with NaN for all frames
             ps_frames = {m: {s: [np.nan] * total_frames for s in mixture_speakers} for m in models}
             pm_frames = {m: {s: [np.nan] * total_frames for s in mixture_speakers} for m in models}
             ps_bias_frames = {m: {s: [np.nan] * total_frames for s in mixture_speakers} for m in models}
@@ -224,7 +232,6 @@ def compute_mapss_measures(
                     model_wrapper, layer_eff = load_model(mname, layer, max_gpus)
                     get_gpu_memory_info(verbose)
-                    # Process only this mixture
                     speakers_this_mix = [e for e in mix_entries if e["id"] in all_outs]
                     if not speakers_this_mix:
                         continue
@@ -232,22 +239,25 @@ def compute_mapss_measures(
                     if verbose:
                         print(f"    Processing {metric_type} for mixture {mixture_id}")
-                    all_signals_mix = []
-                    all_masks_mix = []
-                    all_labels_mix = []
-                    for e in speakers_this_mix:
                         s = e["id"]
                         if metric_type == "PS":
                             dists = [
                                 loudness_normalize(d)
-                                for d in apply_distortions(all_refs[s].numpy(), "all")
                             ]
                         else:
                             dists = [
                                 loudness_normalize(d)
-                                for d in apply_adv_distortions(
                                     all_refs[s].numpy(), "all"
                                 )
                             ]
@@ -255,19 +265,20 @@ def compute_mapss_measures(
                         sigs = [all_refs[s].numpy(), all_outs[s].numpy()] + dists
                         lbls = ["ref", "out"] + [f"d{i}" for i in range(len(dists))]
-                        masks = [voiced_mask_mix[mix_idx]] * len(sigs)
-                        all_signals_mix.extend(sigs)
-                        all_masks_mix.extend(masks)
-                        all_labels_mix.extend([f"{s}-{l}" for l in lbls])
-                    try:
-                        # Process in smaller batches
                         batch_size = min(2, BATCH_SIZE)
                         embeddings_list = []
-                        for i in range(0, len(all_signals_mix), batch_size):
-                            batch_sigs = all_signals_mix[i:i + batch_size]
-                            batch_masks = all_masks_mix[i:i + batch_size]
                             batch_embs = embed_batch(
                                 batch_sigs,
@@ -283,139 +294,156 @@ def compute_mapss_measures(
                             torch.cuda.empty_cache()
                         if embeddings_list:
-                            embeddings = torch.cat(embeddings_list, dim=0)
-                            E, L, D = embeddings.shape
-                            if L == 0:
-                                if verbose:
-                                    print(
-                                        f"        WARNING: mixture {mixture_id} produced 0 frames after masking; skipping.")
-                                continue
-                            # Get valid frame indices for this mixture
-                            mask = voiced_mask_mix[mix_idx]
-                            valid_frame_indices = torch.where(mask)[0].tolist()
-                            if verbose:
-                                print(f"    Computing {metric_type} scores for {mname}...")
-                            # Process frames with their stored embeddings and labels
-                            with ThreadPoolExecutor(
-                                    max_workers=min(2, ngpu if ngpu > 0 else 1)
-                            ) as executor:
-                                def process_frame(f, frame_idx, embeddings_mix, labels_mix):
-                                    try:
-                                        frame_emb = embeddings_mix[:, f, :].detach().cpu().numpy()
-                                        if add_ci:
-                                            coords_d, coords_c, eigvals, k_sub_gauss = (
-                                                gpu_distributor.execute_on_gpu(
-                                                    diffusion_map_torch,
-                                                    frame_emb,
-                                                    labels_mix,
-                                                    alpha=alpha,
-                                                    eig_solver="full",
-                                                    return_eigs=True,
-                                                    return_complement=True,
-                                                    return_cval=add_ci,
-                                                )
-                                            )
-                                        else:
-                                            coords_d = gpu_distributor.execute_on_gpu(
-                                                diffusion_map_torch,
-                                                frame_emb,
-                                                labels_mix,
-                                                alpha=alpha,
-                                                eig_solver="full",
-                                                return_eigs=False,
-                                                return_complement=False,
-                                                return_cval=False,
-                                            )
-                                            coords_c = None
-                                            eigvals = None
-                                            k_sub_gauss = 1
-                                        if metric_type == "PS":
-                                            score = compute_ps(
-                                                coords_d, labels_mix, max_gpus
-                                            )
-                                            bias = prob = None
-                                            if add_ci:
-                                                bias, prob = ps_ci_components_full(
-                                                    coords_d,
-                                                    coords_c,
-                                                    eigvals,
-                                                    labels_mix,
-                                                    delta=DEFAULT_DELTA_CI,
-                                                )
-                                            return frame_idx, "PS", score, bias, prob
-                                        else:
-                                            score = compute_pm(
-                                                coords_d, labels_mix, "gamma", max_gpus
-                                            )
-                                            bias = prob = None
-                                            if add_ci:
-                                                bias, prob = pm_ci_components_full(
-                                                    coords_d,
-                                                    coords_c,
-                                                    eigvals,
-                                                    labels_mix,
-                                                    delta=DEFAULT_DELTA_CI,
-                                                    K=k_sub_gauss,
-                                                )
-                                            return frame_idx, "PM", score, bias, prob
-                                    except Exception as ex:
-                                        if verbose:
-                                            print(f"        ERROR frame {frame_idx}: {ex}")
-                                        return None
-                                futures = [
-                                    executor.submit(process_frame, f, valid_frame_indices[f], embeddings,
-                                                    all_labels_mix)
-                                    for f in range(L)
-                                ]
-                                for fut in futures:
-                                    result = fut.result()
-                                    if result is None:
-                                        continue
-                                    frame_idx, metric, score, bias, prob = result
-                                    if metric == "PS":
-                                        for sp in score:
-                                            if sp in mixture_speakers:
-                                                ps_frames[mname][sp][frame_idx] = score[sp]
-                                                if add_ci and bias is not None:
-                                                    ps_bias_frames[mname][sp][frame_idx] = bias[sp]
-                                                    ps_prob_frames[mname][sp][frame_idx] = prob[sp]
-                                    else:
-                                        for sp in score:
-                                            if sp in mixture_speakers:
-                                                pm_frames[mname][sp][frame_idx] = score[sp]
-                                                if add_ci and bias is not None:
-                                                    pm_bias_frames[mname][sp][frame_idx] = bias[sp]
-                                                    pm_prob_frames[mname][sp][frame_idx] = prob[sp]
-                    except Exception as ex:
                         if verbose:
-                            print(f"      ERROR processing mixture {mixture_id}: {ex}")
                         continue
-                    finally:
-                        # Always clean up after processing a mixture
-                        del all_signals_mix, all_masks_mix
-                        if 'embeddings_list' in locals():
-                            del embeddings_list
-                        clear_gpu_memory()
-                        gc.collect()
                     del model_wrapper
                     clear_gpu_memory()
                     gc.collect()
-            # Store results for this mixture and algorithm
             all_mixture_results[mixture_id][algo][mname] = {
                 'ps_frames': ps_frames[mname],
                 'pm_frames': pm_frames[mname],
@@ -426,20 +454,16 @@ def compute_mapss_measures(
                 'total_frames': total_frames
             }
-        # Save results for this mixture after processing all algorithms
         if verbose:
-            print(f"  Saving results for mixture {mixture_id}...")
-        # Create timestamps in milliseconds - using lowercase hop
         timestamps_ms = [i * hop * 1000 / SR for i in range(total_frames)]
         for model in models:
-            # Prepare PS data
             ps_data = {'timestamp_ms': timestamps_ms}
             pm_data = {'timestamp_ms': timestamps_ms}
             ci_data = {'timestamp_ms': timestamps_ms} if add_ci else None
-            # Combine data from all algorithms for this mixture
             for algo in algos_to_run:
                 if algo not in all_mixture_results[mixture_id]:
                     continue
@@ -448,7 +472,6 @@ def compute_mapss_measures(
                 model_data = all_mixture_results[mixture_id][algo][model]
-                # Add PS data
                 for speaker in mixture_speakers:
                     col_name = f"{algo}_{speaker}"
                     ps_data[col_name] = model_data['ps_frames'][speaker]
@@ -460,7 +483,6 @@ def compute_mapss_measures(
                         ci_data[f"{algo}_{speaker}_pm_bias"] = model_data['pm_bias_frames'][speaker]
                         ci_data[f"{algo}_{speaker}_pm_prob"] = model_data['pm_prob_frames'][speaker]
-            # Save CSV files for this mixture
             mixture_dir = os.path.join(exp_root, mixture_id)
             os.makedirs(mixture_dir, exist_ok=True)
@@ -487,9 +509,8 @@ def compute_mapss_measures(
     print(f"\nEXPERIMENT COMPLETED")
     print(f"Results saved to: {exp_root}")
-    del all_refs, voiced_mask_mix
-    # Import and call the cleanup function
     from models import cleanup_all_models
     cleanup_all_models()

 from datetime import datetime
 import librosa
 import pandas as pd
 from audio import (
     loudness_normalize,
+    compute_speaker_activity_masks,
 )
 from config import *
+from distortions import apply_pm_distortions, apply_ps_distortions
 from metrics import (
     compute_pm,
     compute_ps,
         verbose=False,
         max_gpus=None,
 ):
+    """
+    Compute MAPSS measures (PM, PS, and their errors). Data is saved to csv files.
+    :param models: backbone self-supervised models.
+    :param mixtures: data to process from _read_manifest
+    :param systems: specific systems (algos and data)
+    :param algos: specific algorithms to use
+    :param experiment_id: user-specified name for experiment
+    :param layer: transformer layer of model to consider
+    :param add_ci: True will compute error radius and tail bounds. False will not.
+    :param alpha: normalization factor of the diffusion maps. Lives in [0, 1].
+    :param seed: random seed number.
+    :param on_missing: "error" raises when outputs are missing or mismatched; any other value (e.g. "skip") skips the affected algorithm.
+    :param verbose: True will print process info to console during runtime. False will minimize it.
+    :param max_gpus: maximal amount of GPUs the program tries to utilize in parallel.
+    """
     gpu_distributor = GPUWorkDistributor(max_gpus)
     ngpu = get_gpu_count(max_gpus)
     for m, mix_entries in zip(canon_mix, mixture_entries):
         for algo, out_list in (m.systems or {}).items():
+            if len(out_list) != len(mix_entries):
+                msg = f"[{algo}] Number of outputs ({len(out_list)}) does not match number of references ({len(mix_entries)}) for mixture {m.mixture_id}"
+                if on_missing == "error":
+                    raise ValueError(msg)
+                else:
+                    if verbose:
+                        warnings.warn(msg + " Skipping this algorithm.")
+                    continue
             for idx, e in enumerate(mix_entries):
+                e["outs"][algo] = out_list[idx]
     if algos is None:
         algos_to_run = sorted(
     print(f"Starting experiment {exp_id} with {ngpu} GPUs")
     print(f"Results will be saved to: {exp_root}")
+    print("NOTE: Output files must be provided in the same order as reference files.")
     clear_gpu_memory()
     get_gpu_memory_info(verbose)
         all_refs[e["id"]] = torch.from_numpy(loudness_normalize(wav))
     if verbose:
+        print("Computing speaker activity masks...")
     win = int(ENERGY_WIN_MS * SR / 1000)
     hop = int(ENERGY_HOP_MS * SR / 1000)
+    multi_speaker_masks_mix = []
+    individual_speaker_masks_mix = []
+    total_frames_per_mix = []
     for i, mix in enumerate(mixture_entries):
         if verbose:
+            print(f"  Computing masks for mixture {i + 1}/{len(mixture_entries)}")
         if ngpu > 0:
             with torch.cuda.device(0):
                 refs_for_mix = [all_refs[e["id"]].cuda() for e in mix]
+                multi_mask, individual_masks = compute_speaker_activity_masks(refs_for_mix, win, hop)
+                multi_speaker_masks_mix.append(multi_mask.cpu())
+                individual_speaker_masks_mix.append([m.cpu() for m in individual_masks])
+                total_frames_per_mix.append(multi_mask.shape[0])
                 for ref in refs_for_mix:
                     del ref
                 torch.cuda.empty_cache()
         else:
             refs_for_mix = [all_refs[e["id"]].cpu() for e in mix]
+            multi_mask, individual_masks = compute_speaker_activity_masks(refs_for_mix, win, hop)
+            multi_speaker_masks_mix.append(multi_mask.cpu())
+            individual_speaker_masks_mix.append([m.cpu() for m in individual_masks])
+            total_frames_per_mix.append(multi_mask.shape[0])
     ordered_speakers = [e["id"] for e in flat_entries]
+    all_mixture_results = {}
     for mix_idx, (mix_canon, mix_entries) in enumerate(zip(canon_mix, mixture_entries)):
         mixture_id = mix_canon.mixture_id
         all_mixture_results[mixture_id] = {}
         total_frames = total_frames_per_mix[mix_idx]
         mixture_speakers = [e["id"] for e in mix_entries]
         for algo_idx, algo in enumerate(algos_to_run):
             if verbose:
                 print(f"\nProcessing Mixture {mixture_id}, Algorithm {algo_idx + 1}/{len(algos_to_run)}: {algo}")
             all_outs = {}
             missing = []
             for e in mix_entries:
                 assigned_path = e.get("outs", {}).get(algo)
                 if assigned_path is None:
                     missing.append((e["mixture"], e["id"]))
                     continue
                 wav, _ = librosa.load(str(assigned_path), sr=SR)
                 all_outs[e["id"]] = torch.from_numpy(loudness_normalize(wav))
                     warnings.warn(f"[{algo}] No outputs for mixture {mixture_id}. Skipping.")
                 continue
             if algo not in all_mixture_results[mixture_id]:
                 all_mixture_results[mixture_id][algo] = {}
             ps_frames = {m: {s: [np.nan] * total_frames for s in mixture_speakers} for m in models}
             pm_frames = {m: {s: [np.nan] * total_frames for s in mixture_speakers} for m in models}
             ps_bias_frames = {m: {s: [np.nan] * total_frames for s in mixture_speakers} for m in models}
                     model_wrapper, layer_eff = load_model(mname, layer, max_gpus)
                     get_gpu_memory_info(verbose)
                     speakers_this_mix = [e for e in mix_entries if e["id"] in all_outs]
                     if not speakers_this_mix:
                         continue
                     if verbose:
                         print(f"    Processing {metric_type} for mixture {mixture_id}")
+                    multi_speaker_mask = multi_speaker_masks_mix[mix_idx]
+                    individual_masks = individual_speaker_masks_mix[mix_idx]
+                    valid_frame_indices = torch.where(multi_speaker_mask)[0].tolist()
+                    speaker_signals = {}
+                    speaker_labels = {}
+                    for speaker_idx, e in enumerate(speakers_this_mix):
                         s = e["id"]
                         if metric_type == "PS":
                             dists = [
                                 loudness_normalize(d)
+                                for d in apply_ps_distortions(all_refs[s].numpy(), "all")
                             ]
                         else:
                             dists = [
                                 loudness_normalize(d)
+                                for d in apply_pm_distortions(
                                     all_refs[s].numpy(), "all"
                                 )
                             ]
                         sigs = [all_refs[s].numpy(), all_outs[s].numpy()] + dists
                         lbls = ["ref", "out"] + [f"d{i}" for i in range(len(dists))]
+                        speaker_signals[s] = sigs
+                        speaker_labels[s] = [f"{s}-{l}" for l in lbls]
+                    all_embeddings = {}
+                    for s in speaker_signals:
+                        sigs = speaker_signals[s]
+                        masks = [multi_speaker_mask] * len(sigs)
                         batch_size = min(2, BATCH_SIZE)
                         embeddings_list = []
+                        for i in range(0, len(sigs), batch_size):
+                            batch_sigs = sigs[i:i + batch_size]
+                            batch_masks = masks[i:i + batch_size]
                             batch_embs = embed_batch(
                                 batch_sigs,
                             torch.cuda.empty_cache()
                         if embeddings_list:
+                            all_embeddings[s] = torch.cat(embeddings_list, dim=0)
+                        else:
+                            all_embeddings[s] = torch.empty(0, 0, 0)
+                    if not all_embeddings or all(e.numel() == 0 for e in all_embeddings.values()):
+                        if verbose:
+                            print(f"WARNING: mixture {mixture_id} produced 0 frames after masking; skipping.")
+                        continue
+                    L = next(iter(all_embeddings.values())).shape[1] if all_embeddings else 0
+                    if L == 0:
                         if verbose:
+                            print(f"WARNING: mixture {mixture_id} produced 0 frames after masking; skipping.")
                         continue
+                    if verbose:
+                        print(f"Computing {metric_type} scores for {mname}...")
+                    with ThreadPoolExecutor(
+                            max_workers=min(2, ngpu if ngpu > 0 else 1)
+                    ) as executor:
+                        def process_frame(f, frame_idx, all_embeddings_dict, speaker_labels_dict, individual_masks_list,
+                                          speaker_indices):
+                            try:
+                                active_speakers = []
+                                for spk_idx, spk_id in enumerate(speaker_indices):
+                                    if individual_masks_list[spk_idx][frame_idx]:
+                                        active_speakers.append(spk_id)
+                                if len(active_speakers) < 2:
+                                    return frame_idx, metric_type, {}, None, None
+                                frame_embeddings = []
+                                frame_labels = []
+                                for spk_id in active_speakers:
+                                    spk_embs = all_embeddings_dict[spk_id][:, f, :]
+                                    frame_embeddings.append(spk_embs)
+                                    frame_labels.extend(speaker_labels_dict[spk_id])
+                                frame_emb = torch.cat(frame_embeddings, dim=0).detach().cpu().numpy()
+                                if add_ci:
+                                    coords_d, coords_c, eigvals, k_sub_gauss = (
+                                        gpu_distributor.execute_on_gpu(
+                                            diffusion_map_torch,
+                                            frame_emb,
+                                            frame_labels,
+                                            alpha=alpha,
+                                            eig_solver="full",
+                                            return_eigs=True,
+                                            return_complement=True,
+                                            return_cval=add_ci,
+                                        )
+                                    )
+                                else:
+                                    coords_d = gpu_distributor.execute_on_gpu(
+                                        diffusion_map_torch,
+                                        frame_emb,
+                                        frame_labels,
+                                        alpha=alpha,
+                                        eig_solver="full",
+                                        return_eigs=False,
+                                        return_complement=False,
+                                        return_cval=False,
+                                    )
+                                    coords_c = None
+                                    eigvals = None
+                                    k_sub_gauss = 1
+                                if metric_type == "PS":
+                                    score = compute_ps(
+                                        coords_d, frame_labels, max_gpus
+                                    )
+                                    bias = prob = None
+                                    if add_ci:
+                                        bias, prob = ps_ci_components_full(
+                                            coords_d,
+                                            coords_c,
+                                            eigvals,
+                                            frame_labels,
+                                            delta=DEFAULT_DELTA_CI,
+                                        )
+                                    return frame_idx, "PS", score, bias, prob
+                                else:
+                                    score = compute_pm(
+                                        coords_d, frame_labels, "gamma", max_gpus
+                                    )
+                                    bias = prob = None
+                                    if add_ci:
+                                        bias, prob = pm_ci_components_full(
+                                            coords_d,
+                                            coords_c,
+                                            eigvals,
+                                            frame_labels,
+                                            delta=DEFAULT_DELTA_CI,
+                                            K=k_sub_gauss,
+                                        )
+                                    return frame_idx, "PM", score, bias, prob
+                            except Exception as ex:
+                                if verbose:
+                                    print(f"ERROR frame {frame_idx}: {ex}")
+                                return None
+                        speaker_ids = [e["id"] for e in speakers_this_mix]
+                        futures = [
+                            executor.submit(
+                                process_frame,
+                                f,
+                                valid_frame_indices[f],
+                                all_embeddings,
+                                speaker_labels,
+                                individual_masks,
+                                speaker_ids
+                            )
+                            for f in range(L)
+                        ]
+                        for fut in futures:
+                            result = fut.result()
+                            if result is None:
+                                continue
+                            frame_idx, metric, score, bias, prob = result
+                            if metric == "PS":
+                                for sp in mixture_speakers:
+                                    if sp in score:
+                                        ps_frames[mname][sp][frame_idx] = score[sp]
+                                        if add_ci and bias is not None and sp in bias:
+                                            ps_bias_frames[mname][sp][frame_idx] = bias[sp]
+                                            ps_prob_frames[mname][sp][frame_idx] = prob[sp]
+                            else:
+                                for sp in mixture_speakers:
+                                    if sp in score:
+                                        pm_frames[mname][sp][frame_idx] = score[sp]
+                                        if add_ci and bias is not None and sp in bias:
+                                            pm_bias_frames[mname][sp][frame_idx] = bias[sp]
+                                            pm_prob_frames[mname][sp][frame_idx] = prob[sp]
+                    clear_gpu_memory()
+                    gc.collect()
                     del model_wrapper
                     clear_gpu_memory()
                     gc.collect()
             all_mixture_results[mixture_id][algo][mname] = {
                 'ps_frames': ps_frames[mname],
                 'pm_frames': pm_frames[mname],
                 'total_frames': total_frames
             }
         if verbose:
+            print(f"Saving results for mixture {mixture_id}...")
         timestamps_ms = [i * hop * 1000 / SR for i in range(total_frames)]
         for model in models:
             ps_data = {'timestamp_ms': timestamps_ms}
             pm_data = {'timestamp_ms': timestamps_ms}
             ci_data = {'timestamp_ms': timestamps_ms} if add_ci else None
             for algo in algos_to_run:
                 if algo not in all_mixture_results[mixture_id]:
                     continue
                 model_data = all_mixture_results[mixture_id][algo][model]
                 for speaker in mixture_speakers:
                     col_name = f"{algo}_{speaker}"
                     ps_data[col_name] = model_data['ps_frames'][speaker]
                         ci_data[f"{algo}_{speaker}_pm_bias"] = model_data['pm_bias_frames'][speaker]
                         ci_data[f"{algo}_{speaker}_pm_prob"] = model_data['pm_prob_frames'][speaker]
             mixture_dir = os.path.join(exp_root, mixture_id)
             os.makedirs(mixture_dir, exist_ok=True)
     print(f"\nEXPERIMENT COMPLETED")
     print(f"Results saved to: {exp_root}")
+    del all_refs, multi_speaker_masks_mix, individual_speaker_masks_mix
     from models import cleanup_all_models
     cleanup_all_models()

main.py CHANGED Viewed

@@ -1,3 +1,11 @@
 from __future__ import annotations
 from pathlib import Path
 from engine import compute_mapss_measures

+"""
+Entry point from the CLI into the MAPSS calculation.
+IMPORTANT: Output files must be provided in the same order as reference files.
+For example, if references are ["ref1.wav", "ref2.wav"],
+then outputs must be ["out1.wav", "out2.wav"] in that exact order.
+"""
 from __future__ import annotations
 from pathlib import Path
 from engine import compute_mapss_measures

metrics.py CHANGED Viewed

@@ -1,17 +1,20 @@
 import math
 import numpy as np
 import torch
 from scipy.special import gammaincc
 from scipy.stats import gamma
-from config import COV_TOL, DEFAULT_DELTA_CI
 from utils import get_gpu_count, mahalanobis_torch, safe_cov_torch
 def pm_tail_gamma(d_out_sq, sq_dists):
-    """PM tail gamma exactly as original."""
     mu = sq_dists.mean().item()
     var = sq_dists.var(unbiased=True).item()
     if var == 0.0:
@@ -22,7 +25,9 @@ def pm_tail_gamma(d_out_sq, sq_dists):
 def pm_tail_rank(d_out_sq, sq_dists):
-    """PM tail rank exactly as original."""
     rank = int((sq_dists < d_out_sq).sum().item())
     n = sq_dists.numel()
     return 1.0 - (rank + 0.5) / (n + 1.0)
@@ -43,7 +48,23 @@ def diffusion_map_torch(
     return_complement=False,
     return_cval=False,
 ):
-    """Diffusion map computation exactly as original."""
     device = device or ("cuda:0" if torch.cuda.is_available() else "cpu")
     X = torch.as_tensor(X_np, dtype=torch.float32, device=device)
     N = X.shape[0]
@@ -141,6 +162,13 @@ def diffusion_map_torch(
 def compute_ps(coords, labels, max_gpus=None):
     ngpu = get_gpu_count(max_gpus)
     if ngpu == 0:
@@ -171,8 +199,7 @@ def compute_ps(coords, labels, max_gpus=None):
             out[s] = (1 - A / (A + B_min + 1e-6)).item()
         return out
-    # GPU version
-    device = min(ngpu - 1, 1)  # Use second GPU if available
     device_str = f"cuda:{device}"
     coords_t = torch.tensor(coords, device=device_str)
     spks_here = sorted({l.split("-")[0] for l in labels})
@@ -212,6 +239,14 @@ def compute_ps(coords, labels, max_gpus=None):
 def compute_pm(coords, labels, pm_method, max_gpus=None):
     ngpu = get_gpu_count(max_gpus)
     if ngpu == 0:
@@ -245,7 +280,6 @@ def compute_pm(coords, labels, pm_method, max_gpus=None):
             out[s] = float(np.clip(pm_score, 0.0, 1.0))
         return out
-    # GPU version
     device = min(ngpu - 1, 1)
     device_str = f"cuda:{device}"
     coords_t = torch.tensor(coords, device=device_str)
@@ -287,7 +321,18 @@ def compute_pm(coords, labels, pm_method, max_gpus=None):
 def pm_ci_components_full(
     coords_d, coords_rest, eigvals, labels, *, delta=0.05, K=1.0, C1=1.0, C2=1.0
 ):
-    """PM CI components exactly as original - complete implementation."""
     _EPS = 1e-12
     def _safe_x(a, theta):
@@ -387,7 +432,6 @@ def pm_ci_components_full(
         bias_ci[s] = max(abs(v - pm_center) for v in corner_vals)
-        # Probabilistic half-width
         R_sq = float(mah_sq.max()) + 1e-12
         log_term = math.log(6.0 / delta)
         eps_mu = math.sqrt(2 * sigma2_g * log_term / n_p) + 3 * R_sq * log_term / n_p
@@ -419,7 +463,15 @@ def pm_ci_components_full(
 def ps_ci_components_full(coords_d, coords_rest, eigvals, labels, *, delta=0.05):
-    """PS CI components exactly as original - complete implementation."""
     def _mean_dev(lam_max, delta, n_eff):
         return math.sqrt(2 * lam_max * math.log(2 / delta) / n_eff)

 import math
 import numpy as np
 import torch
 from scipy.special import gammaincc
 from scipy.stats import gamma
+from config import COV_TOL
 from utils import get_gpu_count, mahalanobis_torch, safe_cov_torch
 def pm_tail_gamma(d_out_sq, sq_dists):
+    """
+    Computes the PM measure based on the Gamma fit.
+    :param d_out_sq: squared mahalanobis distance from the output to its cluster on the manifold.
+    :param sq_dists: squared mahalanobis distance of all distortions in the cluster to their cluster on the manifold.
+    :return: PM score.
+    """
     mu = sq_dists.mean().item()
     var = sq_dists.var(unbiased=True).item()
     if var == 0.0:
 def pm_tail_rank(d_out_sq, sq_dists):
+    """
+    A deprecated method to compute the PM measure based on the rank of the output's distance among the distortion distances.
+    """
     rank = int((sq_dists < d_out_sq).sum().item())
     n = sq_dists.numel()
     return 1.0 - (rank + 0.5) / (n + 1.0)
     return_complement=False,
     return_cval=False,
 ):
+    """
+    Compute diffusion maps from a high dimensional set of points.
+    :param X_np: high dimensional input.
+    :param labels_by_mix: used to keep track of each source's coordinates on the manifold.
+    :param cutoff: the desired ratio between sum of kept and sum of all eigenvalues.
+    :param tol: deprecated since we do not use the "lobpcg" solver.
+    :param diffusion_time: number of steps taken on the probability transition matrix.
+    :param alpha: normalization factor in [0, 1].
+    :param eig_solver: "lobpcg" or "full".
+    :param k: pre-defined truncation dimension.
+    :param device: "cpu" or "cuda".
+    :param return_eigs: return eigenvalues and eigenvectors.
+    :param return_complement: return complementary coordinates, not just kept coordinates.
+    :param return_cval: calculate and return the psi_2 norm of the coordinates.
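+    :return: the retained diffusion coordinates; callers that set return_eigs, return_complement and return_cval together unpack (retained coords, complement coords, eigenvalues, psi_2 norm).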
+    """
     device = device or ("cuda:0" if torch.cuda.is_available() else "cpu")
     X = torch.as_tensor(X_np, dtype=torch.float32, device=device)
     N = X.shape[0]
 def compute_ps(coords, labels, max_gpus=None):
+    """
+    Computes the PS measure.
+    :param coords: coordinates on the manifold.
+    :param labels: assign source index per coordinate.
+    :param max_gpus: maximal number of GPUs to use.
+    :return: the PS measure.
+    """
     ngpu = get_gpu_count(max_gpus)
     if ngpu == 0:
             out[s] = (1 - A / (A + B_min + 1e-6)).item()
         return out
+    device = min(ngpu - 1, 1)
     device_str = f"cuda:{device}"
     coords_t = torch.tensor(coords, device=device_str)
     spks_here = sorted({l.split("-")[0] for l in labels})
 def compute_pm(coords, labels, pm_method, max_gpus=None):
+    """
+    Computes the PM measure.
+    :param coords: coordinates on the manifold.
+    :param labels: assign source index per coordinate.
+    :param pm_method: "rank" or "gamma".
+    :param max_gpus: maximal number of GPUs to use.
+    :return: the PM measure.
+    """
     ngpu = get_gpu_count(max_gpus)
     if ngpu == 0:
             out[s] = float(np.clip(pm_score, 0.0, 1.0))
         return out
     device = min(ngpu - 1, 1)
     device_str = f"cuda:{device}"
     coords_t = torch.tensor(coords, device=device_str)
 def pm_ci_components_full(
     coords_d, coords_rest, eigvals, labels, *, delta=0.05, K=1.0, C1=1.0, C2=1.0
 ):
+    """
+    Computes the error radius and tail bounds for the PM measure.
+    :param coords_d: Retained diffusion maps coordinates.
+    :param coords_rest: Complement diffusion maps coordinates.
+    :param eigvals: Eigenvalues of the diffusion maps.
+    :param labels: Assign source index per coordinate.
+    :param delta: 1 - delta is the confidence level.
+    :param K: Absolute constant.
+    :param C1: Absolute constant.
+    :param C2: Absolute constant.
+    :return: error radius and tail bounds for the PM measure.
+    """
     _EPS = 1e-12
     def _safe_x(a, theta):
         bias_ci[s] = max(abs(v - pm_center) for v in corner_vals)
         R_sq = float(mah_sq.max()) + 1e-12
         log_term = math.log(6.0 / delta)
         eps_mu = math.sqrt(2 * sigma2_g * log_term / n_p) + 3 * R_sq * log_term / n_p
 def ps_ci_components_full(coords_d, coords_rest, eigvals, labels, *, delta=0.05):
+    """
+    Computes the error radius and tail bounds for the PS measure.
+    :param coords_d: Retained diffusion maps coordinates.
+    :param coords_rest: Complement diffusion maps coordinates.
+    :param eigvals: Eigenvalues of the diffusion maps.
+    :param labels: Assign source index per coordinate.
+    :param delta: 1 - delta is the confidence level.
+    :return: error radius and tail bounds for the PS measure.
+    """
     def _mean_dev(lam_max, delta, n_eff):
         return math.sqrt(2 * lam_max * math.log(2 / delta) / n_eff)

models.py CHANGED Viewed

@@ -15,8 +15,10 @@ from config import BATCH_SIZE, ENERGY_HOP_MS, ENERGY_WIN_MS, SR
 from utils import get_gpu_count
-class BalancedDualGPUModel:
     def __init__(self, model_name, layer, max_gpus=None):
         self.layer = layer
         self.models = []
@@ -24,7 +26,7 @@ class BalancedDualGPUModel:
         self.devices = []
         ngpu = get_gpu_count(max_gpus)
-        for gpu_id in range(min(ngpu, 2)):
             device = f"cuda:{gpu_id}"
             self.devices.append(device)
             ckpt, cls, _ = get_model_config(layer)[model_name]
@@ -90,7 +92,6 @@ class BalancedDualGPUModel:
                     mask_t = F.interpolate(mask_b, size=T, mode="nearest")[0, 0].bool()
                     keep.append(hs[b][mask_t].cpu())
-                # Aggressive cleanup
                 del hs, input_values, inputs
                 torch.cuda.empty_cache()
@@ -106,7 +107,6 @@ class BalancedDualGPUModel:
             except Exception as e:
                 self.result_queue.put((task_id, e))
             finally:
-                # Always clear cache after processing
                 torch.cuda.empty_cache()
     def process_batch(self, signals, masks, use_mlm=False):
@@ -150,8 +150,12 @@ class BalancedDualGPUModel:
         self.cleanup()
-# NO CACHE - we need to clean up models properly between runs
 def get_model_config(layer):
     return {
         "raw": (None, None, None),
         "wavlm": ("microsoft/wavlm-large", WavLMModel, layer),
@@ -164,21 +168,25 @@ def get_model_config(layer):
     }
-# Store loaded models globally to properly manage them
 _loaded_models = {}
 def load_model(name, layer, max_gpus=None):
     global _loaded_models
-    # Clean up any previously loaded models first
     if _loaded_models:
         for key, model_data in _loaded_models.items():
             if isinstance(model_data, tuple) and len(model_data) == 2:
-                if isinstance(model_data[0], BalancedDualGPUModel):
                     model_data[0].cleanup()
                 elif isinstance(model_data[0], tuple):
-                    # Single GPU model
                     _, model = model_data[0]
                     del model
         _loaded_models.clear()
@@ -190,7 +198,7 @@ def load_model(name, layer, max_gpus=None):
     ngpu = get_gpu_count(max_gpus)
     if ngpu > 1:
-        model = BalancedDualGPUModel(name, layer, max_gpus)
         _loaded_models[name] = (model, layer)
         return model, layer
     else:
@@ -219,15 +227,16 @@ def load_model(name, layer, max_gpus=None):
 def cleanup_all_models():
-    """Call this at the end of each experiment to ensure complete cleanup"""
     global _loaded_models
     if _loaded_models:
         for key, model_data in _loaded_models.items():
             if isinstance(model_data, tuple) and len(model_data) == 2:
-                if isinstance(model_data[0], BalancedDualGPUModel):
                     model_data[0].cleanup()
                 elif isinstance(model_data[0], tuple):
-                    # Single GPU model
                     _, model = model_data[0]
                     del model
         _loaded_models.clear()
@@ -236,6 +245,12 @@ def cleanup_all_models():
 def embed_batch_raw(signals, masks_audio):
     win = int(ENERGY_WIN_MS * SR / 1000)
     hop = int(ENERGY_HOP_MS * SR / 1000)
     reps, L_max = [], 0
@@ -253,6 +268,9 @@ def embed_batch_raw(signals, masks_audio):
 def embed_batch_single_gpu(
         signals, masks_audio, extractor, model, layer, use_mlm=False
 ):
     if not signals:
         return torch.empty(0, 0, 0)
     device = next(model.parameters()).device
@@ -281,7 +299,6 @@ def embed_batch_single_gpu(
             mask_t = F.interpolate(mask_b, size=T, mode="nearest")[0, 0].bool()
             all_keeps.append(hs[b][mask_t].cpu())
-        # Aggressive cleanup
         del hs, input_values, inputs
         torch.cuda.empty_cache()
@@ -289,7 +306,6 @@ def embed_batch_single_gpu(
         L_max = max(x.shape[0] for x in all_keeps)
         keep_padded = [F.pad(x, (0, 0, 0, L_max - x.shape[0])) for x in all_keeps]
         result = torch.stack(keep_padded, dim=0)
-        # Clean up intermediate lists
         del all_keeps, keep_padded
         return result
     else:
@@ -297,9 +313,19 @@ def embed_batch_single_gpu(
 def embed_batch(signals, masks_audio, model_wrapper, layer, use_mlm=False):
     if model_wrapper == "raw":
         return embed_batch_raw(signals, masks_audio)
-    if isinstance(model_wrapper, BalancedDualGPUModel):
         all_embeddings = []
         batch_size = min(BATCH_SIZE, 2)
         for i in range(0, len(signals), batch_size):
@@ -308,7 +334,6 @@ def embed_batch(signals, masks_audio, model_wrapper, layer, use_mlm=False):
             )
             if batch_emb.numel() > 0:
                 all_embeddings.append(batch_emb)
-            # Clear cache after each batch
             torch.cuda.empty_cache()
         if all_embeddings:

 from utils import get_gpu_count
+class BalancedMultiGPUModel:
+    """
+    Distributes model inference workload across GPUs.
+    """
     def __init__(self, model_name, layer, max_gpus=None):
         self.layer = layer
         self.models = []
         self.devices = []
         ngpu = get_gpu_count(max_gpus)
+        for gpu_id in range(ngpu):
             device = f"cuda:{gpu_id}"
             self.devices.append(device)
             ckpt, cls, _ = get_model_config(layer)[model_name]
                     mask_t = F.interpolate(mask_b, size=T, mode="nearest")[0, 0].bool()
                     keep.append(hs[b][mask_t].cpu())
                 del hs, input_values, inputs
                 torch.cuda.empty_cache()
             except Exception as e:
                 self.result_queue.put((task_id, e))
             finally:
                 torch.cuda.empty_cache()
     def process_batch(self, signals, masks, use_mlm=False):
         self.cleanup()
 def get_model_config(layer):
+    """
+    Get self-supervised model configuration.
+    :param layer: specific transformer layer to choose.
+    :return: Configuration.
+    """
     return {
         "raw": (None, None, None),
         "wavlm": ("microsoft/wavlm-large", WavLMModel, layer),
     }
 _loaded_models = {}
 def load_model(name, layer, max_gpus=None):
+    """
+    Load the chosen self-supervised model.
+    :param name: name of model.
+    :param layer: chosen layer.
+    :param max_gpus: maximal gpus to use.
+    :return: extractor, model, and layer.
+    """
     global _loaded_models
     if _loaded_models:
         for key, model_data in _loaded_models.items():
             if isinstance(model_data, tuple) and len(model_data) == 2:
+                if isinstance(model_data[0], BalancedMultiGPUModel):
                     model_data[0].cleanup()
                 elif isinstance(model_data[0], tuple):
                     _, model = model_data[0]
                     del model
         _loaded_models.clear()
     ngpu = get_gpu_count(max_gpus)
     if ngpu > 1:
+        model = BalancedMultiGPUModel(name, layer, max_gpus)
         _loaded_models[name] = (model, layer)
         return model, layer
     else:
 def cleanup_all_models():
+    """
+    Call this at the end of each experiment to ensure complete cleanup
+    """
     global _loaded_models
     if _loaded_models:
         for key, model_data in _loaded_models.items():
             if isinstance(model_data, tuple) and len(model_data) == 2:
+                if isinstance(model_data[0], BalancedMultiGPUModel):
                     model_data[0].cleanup()
                 elif isinstance(model_data[0], tuple):
                     _, model = model_data[0]
                     del model
         _loaded_models.clear()
 def embed_batch_raw(signals, masks_audio):
+    """
+    Waveform encoding, used when self-supervised encoding is skipped and the waveform is pushed directly to the diffusion maps.
+    :param signals: waveform signals.
+    :param masks_audio: voice activity masks of sources.
+    :return:
+    """
     win = int(ENERGY_WIN_MS * SR / 1000)
     hop = int(ENERGY_HOP_MS * SR / 1000)
     reps, L_max = [], 0
 def embed_batch_single_gpu(
         signals, masks_audio, extractor, model, layer, use_mlm=False
 ):
+    """
+    See embed_batch.
+    """
     if not signals:
         return torch.empty(0, 0, 0)
     device = next(model.parameters()).device
             mask_t = F.interpolate(mask_b, size=T, mode="nearest")[0, 0].bool()
             all_keeps.append(hs[b][mask_t].cpu())
         del hs, input_values, inputs
         torch.cuda.empty_cache()
         L_max = max(x.shape[0] for x in all_keeps)
         keep_padded = [F.pad(x, (0, 0, 0, L_max - x.shape[0])) for x in all_keeps]
         result = torch.stack(keep_padded, dim=0)
         del all_keeps, keep_padded
         return result
     else:
 def embed_batch(signals, masks_audio, model_wrapper, layer, use_mlm=False):
+    """
+    Encode a batch of signals using the self-supervised model chosen.
+    :param signals: waveform signals to encode.
+    :param masks_audio: voice activity masks of sources.
+    :param model_wrapper: chosen model's wrapper.
+    :param layer: transformer layer.
+    :param use_mlm: deprecated.
+    :return: embedded signal representations by the model's layer.
+    """
     if model_wrapper == "raw":
         return embed_batch_raw(signals, masks_audio)
+    if isinstance(model_wrapper, BalancedMultiGPUModel):
         all_embeddings = []
         batch_size = min(BATCH_SIZE, 2)
         for i in range(0, len(signals), batch_size):
             )
             if batch_emb.numel() > 0:
                 all_embeddings.append(batch_emb)
             torch.cuda.empty_cache()
         if all_embeddings:

utils.py CHANGED Viewed

@@ -3,18 +3,16 @@ import threading
 import warnings
 from dataclasses import dataclass
 from pathlib import Path
 import numpy as np
 import torch
-try:
-    from scipy.optimize import linear_sum_assignment as _lsa
-except Exception:
-    _lsa = None
 warnings.filterwarnings("ignore", message="Some weights of Wav2Vec2Model")
 def get_gpu_count(max_gpus=None):
     ngpu = torch.cuda.device_count()
     if max_gpus is not None:
         ngpu = min(ngpu, max_gpus)
@@ -22,7 +20,9 @@ def get_gpu_count(max_gpus=None):
 def clear_gpu_memory():
-    """Enhanced GPU memory clearing"""
     if torch.cuda.is_available():
         for i in range(torch.cuda.device_count()):
             with torch.cuda.device(i):
@@ -33,11 +33,15 @@ def clear_gpu_memory():
 def get_gpu_memory_info(verbose=False):
     if not verbose:
         return
     for i in range(torch.cuda.device_count()):
         try:
-            free_b, total_b = torch.cuda.mem_get_info(i)  # type: ignore[attr-defined]
             free_gb = free_b / 1024**3
             total_gb = total_b / 1024**3
         except Exception:
@@ -47,60 +51,10 @@ def get_gpu_memory_info(verbose=False):
         print(f"GPU {i}: {mem_allocated:.2f}GB allocated, {free_gb:.2f}GB free / {total_gb:.2f}GB total")
-def write_wav_16bit(path, x, sr=16000):
-    path = Path(path)
-    path.parent.mkdir(parents=True, exist_ok=True)
-    try:
-        import soundfile as sf
-        sf.write(str(path), x.astype(np.float32), sr)
-    except Exception:
-        from scipy.io.wavfile import write
-        write(str(path), sr, (np.clip(x, -1, 1) * 32767).astype(np.int16))
-def safe_corr_np(a, b):
-    L = min(len(a), len(b))
-    if L <= 1:
-        return 0.0
-    a = a[:L].astype(np.float64)
-    b = b[:L].astype(np.float64)
-    a -= a.mean()
-    b -= b.mean()
-    da = a.std()
-    db = b.std()
-    if da <= 1e-12 or db <= 1e-12:
-        return 0.0
-    r = float((a * b).mean() / (da * db))
-    return max(-1.0, min(1.0, r))
-def hungarian(cost):
-    try:
-        if _lsa is not None:
-            return _lsa(cost)
-        raise RuntimeError("scipy.optimize.linear_sum_assignment unavailable")
-    except Exception:
-        used = set()
-        rows, cols = [], []
-        for i in range(cost.shape[0]):
-            j = int(
-                np.argmin(
-                    [
-                        cost[i, k] if k not in used else 1e12
-                        for k in range(cost.shape[1])
-                    ]
-                )
-            )
-            used.add(j)
-            rows.append(i)
-            cols.append(j)
-        return np.asarray(rows), np.asarray(cols)
 class GPUWorkDistributor:
     def __init__(self, max_gpus=None):
         ngpu = get_gpu_count(max_gpus)
         self.gpu_locks = [threading.Lock() for _ in range(max(1, min(ngpu, 2)))]
@@ -121,7 +75,6 @@ class GPUWorkDistributor:
                 with torch.cuda.device(gid):
                     kwargs["device"] = f"cuda:{gid}"
                     result = func(*args, **kwargs)
-                    # Clear cache after execution
                     torch.cuda.empty_cache()
                 return result
             finally:
@@ -189,35 +142,12 @@ def canonicalize_mixtures(mixtures, systems=None):
     raise ValueError("Unsupported 'mixtures' format.")
-def random_misalign(sig, sr, max_ms, mode="single", rng=None):
-    import random
-    if rng is None:
-        rng = random
-    max_samples = int(sr * max_ms / 1000)
-    if max_samples == 0:
-        return sig
-    shift = (
-        rng.randint(-max_samples, max_samples) if mode == "range" else int(max_samples)
-    )
-    if shift == 0:
-        return sig
-    if isinstance(sig, torch.Tensor):
-        z = torch.zeros(abs(shift), dtype=sig.dtype, device=sig.device)
-        return (
-            torch.cat([z, sig[:-shift]]) if shift > 0 else torch.cat([sig[-shift:], z])
-        )
-    else:
-        z = np.zeros(abs(shift), dtype=sig.dtype)
-        return (
-            np.concatenate([z, sig[:-shift]])
-            if shift > 0
-            else np.concatenate([sig[-shift:], z])
-        )
 def safe_cov_torch(X):
     Xc = X - X.mean(dim=0, keepdim=True)
     cov = Xc.T @ Xc / (Xc.shape[0] - 1)
     if torch.linalg.matrix_rank(cov) < cov.shape[0]:
@@ -226,6 +156,13 @@ def safe_cov_torch(X):
 def mahalanobis_torch(x, mu, inv):
     diff = x - mu
     diff_T = diff.transpose(-1, -2) if diff.ndim >= 2 else diff
     return torch.sqrt(diff @ inv @ diff_T + 1e-6)

 import warnings
 from dataclasses import dataclass
 from pathlib import Path
 import numpy as np
 import torch
 warnings.filterwarnings("ignore", message="Some weights of Wav2Vec2Model")
 def get_gpu_count(max_gpus=None):
+    """
+    Get the number of available GPUs.
+    :param max_gpus: maximal number of GPUs to utilize.
+    """
     ngpu = torch.cuda.device_count()
     if max_gpus is not None:
         ngpu = min(ngpu, max_gpus)
 def clear_gpu_memory():
+    """
+    Enhanced GPU memory clearing
+    """
     if torch.cuda.is_available():
         for i in range(torch.cuda.device_count()):
             with torch.cuda.device(i):
 def get_gpu_memory_info(verbose=False):
+    """
+    Get GPU memory info.
+    :param verbose: if True, print the memory info of each GPU; otherwise do nothing.
+    """
     if not verbose:
         return
     for i in range(torch.cuda.device_count()):
         try:
+            free_b, total_b = torch.cuda.mem_get_info(i)
             free_gb = free_b / 1024**3
             total_gb = total_b / 1024**3
         except Exception:
         print(f"GPU {i}: {mem_allocated:.2f}GB allocated, {free_gb:.2f}GB free / {total_gb:.2f}GB total")
 class GPUWorkDistributor:
+    """
+    Distribute work across multiple GPUs.
+    """
     def __init__(self, max_gpus=None):
         ngpu = get_gpu_count(max_gpus)
         self.gpu_locks = [threading.Lock() for _ in range(max(1, min(ngpu, 2)))]
                 with torch.cuda.device(gid):
                     kwargs["device"] = f"cuda:{gid}"
                     result = func(*args, **kwargs)
                     torch.cuda.empty_cache()
                 return result
             finally:
     raise ValueError("Unsupported 'mixtures' format.")
 def safe_cov_torch(X):
+    """
+    Compute the covariance matrix of X.
+    :param X: array to compute covariance matrix of.
+    :return: regularized covariance matrix.
+    """
     Xc = X - X.mean(dim=0, keepdim=True)
     cov = Xc.T @ Xc / (Xc.shape[0] - 1)
     if torch.linalg.matrix_rank(cov) < cov.shape[0]:
 def mahalanobis_torch(x, mu, inv):
+    """
+    Compute the Mahalanobis distance between x and mu using the inverse covariance matrix inv.
+    :param x: point to calculate the distance from.
+    :param mu: center that the distance of x is measured from.
+    :param inv: the inverse covariance matrix.
+    :return: Mahalanobis distance.
+    """
     diff = x - mu
     diff_T = diff.transpose(-1, -2) if diff.ndim >= 2 else diff
     return torch.sqrt(diff @ inv @ diff_T + 1e-6)