import gradio as gr
import zipfile
import shutil
from pathlib import Path
import traceback
import gc

import torch
import spaces

# Project modules
from engine import compute_mapss_measures
from models import get_model_config, cleanup_all_models
from config import DEFAULT_ALPHA
from utils import clear_gpu_memory

@spaces.GPU(duration=300)
def process_audio_files(zip_file, model_name, layer, alpha):
    """Process an uploaded ZIP file containing audio mixtures."""
    if zip_file is None:
        return None, "Please upload a ZIP file"

    try:
        # Use a fixed extraction path
        extract_path = Path("/tmp/mapss_extract")
        if extract_path.exists():
            shutil.rmtree(extract_path)
        extract_path.mkdir(parents=True)

        # Extract the ZIP (gr.File with type="filepath" passes the path as a string)
        with zipfile.ZipFile(zip_file, "r") as zip_ref:
            zip_ref.extractall(extract_path)

        # Find the references and outputs directories
        refs_dir = None
        outs_dir = None
        for item in extract_path.iterdir():
            if item.is_dir():
                if item.name.lower() in ["references", "refs", "reference"]:
                    refs_dir = item
                elif item.name.lower() in ["outputs", "outs", "output", "separated"]:
                    outs_dir = item

        # Check one level deeper if not found (e.g., a ZIP with a single top-level folder)
        if refs_dir is None or outs_dir is None:
            for item in extract_path.iterdir():
                if item.is_dir():
                    for subitem in item.iterdir():
                        if subitem.is_dir():
                            if subitem.name.lower() in ["references", "refs", "reference"]:
                                refs_dir = subitem
                            elif subitem.name.lower() in ["outputs", "outs", "output", "separated"]:
                                outs_dir = subitem

        if refs_dir is None or outs_dir is None:
            return None, "Could not find 'references' and 'outputs' directories in the ZIP file"

        # Collect audio files
        ref_files = sorted(refs_dir.glob("*.wav"))
        out_files = sorted(outs_dir.glob("*.wav"))
        if len(ref_files) == 0:
            return None, "No reference WAV files found"
        if len(out_files) == 0:
            return None, "No output WAV files found"
        if len(ref_files) != len(out_files):
            return None, (
                f"Number of reference files ({len(ref_files)}) must match number of "
                f"output files ({len(out_files)}). Files must be in the same order."
            )

        # Create the manifest expected by compute_mapss_measures
        manifest = [{
            "mixture_id": "uploaded_mixture",
            "references": [str(f) for f in ref_files],
            "systems": {
                "uploaded_system": [str(f) for f in out_files]
            }
        }]

        # Validate the model name
        allowed_models = set(get_model_config(0).keys())
        if model_name not in allowed_models:
            return None, f"Invalid model. Allowed: {', '.join(sorted(allowed_models))}"

        # Resolve the layer: 0 for raw features, otherwise the model default unless overridden
        if model_name == "raw":
            layer_final = 0
        else:
            model_defaults = {
                "wavlm": 24, "wav2vec2": 24, "hubert": 24,
                "wavlm_base": 12, "wav2vec2_base": 12, "hubert_base": 12,
                "wav2vec2_xlsr": 24,
            }
            layer_final = layer if layer is not None else model_defaults.get(model_name, 12)

        # Use all GPUs available on the Space
        max_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0

        # Run the experiment
        results_dir = compute_mapss_measures(
            models=[model_name],
            mixtures=manifest,
            layer=layer_final,
            alpha=alpha,
            verbose=True,
            max_gpus=max_gpus,
            add_ci=False,  # Disable confidence intervals for faster demo processing
        )

        # Create the output ZIP at a fixed location
        output_zip = Path("/tmp/mapss_results.zip")
        with zipfile.ZipFile(output_zip, "w") as zipf:
            results_path = Path(results_dir)
            files_added = 0
            for file_path in results_path.rglob("*"):
                if file_path.is_file():
                    arcname = str(file_path.relative_to(results_path.parent))
                    zipf.write(file_path, arcname)
                    files_added += 1

        if output_zip.exists() and files_added > 0:
            return str(output_zip), (
                f"Processing completed! Created ZIP with {files_added} files. "
                "Note: output files must be in the same order as reference files."
            )
        return None, "Processing completed but no output files were generated. Check if embeddings were computed."
    except Exception as e:
        error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
        return None, error_msg
    finally:
        cleanup_all_models()
        clear_gpu_memory()
        gc.collect()

def create_interface():
    with gr.Blocks(title="MAPSS: Manifold-based Assessment of Perceptual Source Separation") as demo:
        gr.Markdown("""
# MAPSS: Manifold-based Assessment of Perceptual Source Separation
Granular evaluation of speech and music source separation with the MAPSS measures:
- **Perceptual Matching (PM)**: Measures how closely an output perceptually aligns with its reference. Range: 0-1, higher is better.
- **Perceptual Similarity (PS)**: Measures how well an output is separated from its interfering references. Range: 0-1, higher is better.
## ⚠️ IMPORTANT: File Order Requirements
**Output files MUST be in the same order as reference files!**
- If references are: `speaker1.wav`, `speaker2.wav`, `speaker3.wav`
- Then outputs must be: `output1.wav`, `output2.wav`, `output3.wav`
- Where `output1` corresponds to `speaker1`, `output2` to `speaker2`, etc.
## Input Format
Upload a ZIP file containing:
```
your_mixture.zip
├── references/ # Original clean sources
│ ├── speaker1.wav
│ ├── speaker2.wav
│ └── ...
└── outputs/ # Separated outputs (SAME ORDER as references)
├── separated1.wav # Must correspond to speaker1.wav
├── separated2.wav # Must correspond to speaker2.wav
└── ...
```
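To package a mixture programmatically, here is a minimal Python sketch (directory and file names are placeholders following the layout above):

```python
import zipfile
from pathlib import Path

# Preserve the two top-level folders so they can be found after extraction
with zipfile.ZipFile("your_mixture.zip", "w") as zf:
    for wav in sorted(Path("references").glob("*.wav")):
        zf.write(wav, f"references/{wav.name}")
    for wav in sorted(Path("outputs").glob("*.wav")):
        zf.write(wav, f"outputs/{wav.name}")
```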
### Audio Requirements
- Format: .wav files
- Sample rate: Any (automatically resampled to 16kHz)
- Channels: Mono or stereo (converted to mono)
- **Number of files: Equal number of references and outputs**
- **Order: Output files must be in the same order as reference files**
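Pre-converting is optional since the app resamples and downmixes for you; if you prefer to do it yourself, a sketch using `librosa` and `soundfile` (neither is required by this app):

```python
import librosa
import soundfile as sf

# librosa resamples to 16 kHz and downmixes to mono on load
audio, sr = librosa.load("speaker1.wav", sr=16000, mono=True)
sf.write("speaker1_16k_mono.wav", audio, sr)
```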
## Output Format
The tool generates a ZIP file containing:
- `ps_scores_{model}.csv`: PS scores for each source over time
- `pm_scores_{model}.csv`: PM scores for each source over time
- `params.json`: Parameters used
- `manifest_canonical.json`: File mapping and processing details
### Score Interpretation
- **NaN values**: Appear in frames where fewer than 2 speakers are active
- **Valid scores**: Only computed when at least 2 speakers are active in a frame
- **Time resolution**: 20ms frames (configurable in code)
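A minimal sketch for inspecting the score CSVs with `pandas`, assuming rows are frames and columns are sources (see `manifest_canonical.json` for the exact file mapping):

```python
import pandas as pd

ps = pd.read_csv("ps_scores_wav2vec2_base.csv")
pm = pd.read_csv("pm_scores_wav2vec2_base.csv")

# Per-source means; pandas skips NaN frames (fewer than 2 active speakers) by default
print(ps.mean(numeric_only=True))
print(pm.mean(numeric_only=True))
```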
## Available Models
| Model | Description | Default Layer | Use Case |
|-------|-------------|---------------|----------|
| `raw` | Raw waveform features | N/A | Baseline comparison |
| `wavlm` | WavLM Large | 24 | Strong performance |
| `wav2vec2` | Wav2Vec2 Large | 24 | Best overall performance |
| `hubert` | HuBERT Large | 24 | Good for speech |
| `wavlm_base` | WavLM Base | 12 | Faster processing |
| `wav2vec2_base` | Wav2Vec2 Base | 12 | Faster, good quality |
| `hubert_base` | HuBERT Base | 12 | Faster processing |
| `wav2vec2_xlsr` | Wav2Vec2 XLSR-53 | 24 | Multilingual |
## Parameters
- **Model**: Select the embedding model for feature extraction
- **Layer**: Which transformer layer to use (auto-selected by default)
- **Alpha**: Diffusion maps parameter (0.0-1.0, default: 1.0)
- 0.0 = No normalization
- 1.0 = Full normalization (recommended)
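For intuition, alpha is the density-normalization exponent of the diffusion-maps kernel (Coifman & Lafon); a schematic NumPy sketch of that normalization, not this app's implementation:

```python
import numpy as np

def alpha_normalize(K: np.ndarray, alpha: float) -> np.ndarray:
    """K_alpha[i, j] = K[i, j] / (d[i]**alpha * d[j]**alpha), with d the row sums of K."""
    d = K.sum(axis=1)
    return K / np.outer(d ** alpha, d ** alpha)
```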
## Processing Notes
- The system automatically detects which speakers are active in each frame
- PS/PM scores are only computed between active speakers
- Processing time scales with number of sources and audio length
- GPU acceleration is automatically used when available
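This Space is a thin wrapper around `compute_mapss_measures`; an equivalent direct call, with placeholder file paths:

```python
from engine import compute_mapss_measures

manifest = [{
    "mixture_id": "my_mixture",
    "references": ["references/speaker1.wav", "references/speaker2.wav"],
    "systems": {"my_system": ["outputs/separated1.wav", "outputs/separated2.wav"]},
}]
results_dir = compute_mapss_measures(
    models=["wav2vec2_base"],  # any model from the table above
    mixtures=manifest,
    layer=12,                  # model default; use 0 for "raw"
    alpha=1.0,
    verbose=True,
    max_gpus=1,                # set to the number of available GPUs
    add_ci=False,              # confidence intervals off, as in this demo
)
```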
## Citation
If you use MAPSS, please cite:
```bibtex
@article{Ivry2025MAPSS,
title = {MAPSS: Manifold-based Assessment of Perceptual Source Separation},
author = {Ivry, Amir and Cornell, Samuele and Watanabe, Shinji},
journal = {arXiv preprint arXiv:2509.09212},
year = {2025},
url = {https://arxiv.org/abs/2509.09212}
}
```
## License
Code: MIT License
Paper: CC-BY-4.0
## Support
For issues, questions, or contributions, please visit the [GitHub repository](https://github.com/amir-ivry/MAPSS-measures).
""")
        with gr.Row():
            with gr.Column():
                file_input = gr.File(
                    label="Upload ZIP file with audio mixtures",
                    file_types=[".zip"],
                    type="filepath"
                )
                model_dropdown = gr.Dropdown(
                    choices=["raw", "wavlm", "wav2vec2", "hubert",
                             "wavlm_base", "wav2vec2_base", "hubert_base",
                             "wav2vec2_xlsr"],
                    value="wav2vec2_base",
                    label="Select embedding model"
                )
                layer_slider = gr.Slider(
                    minimum=0,
                    maximum=12,
                    step=1,
                    value=12,
                    label="Layer (automatically set to model default)",
                    interactive=True
                )
                alpha_slider = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    step=0.1,
                    value=DEFAULT_ALPHA,
                    label="Diffusion maps alpha parameter"
                )

                def update_layer_slider(model_name):
                    """Update the layer slider's range and default for the selected model."""
                    model_configs = {
                        "raw": {"maximum": 0, "value": 0, "interactive": False},
                        "wavlm": {"maximum": 24, "value": 24, "interactive": True},
                        "wav2vec2": {"maximum": 24, "value": 24, "interactive": True},
                        "hubert": {"maximum": 24, "value": 24, "interactive": True},
                        "wav2vec2_xlsr": {"maximum": 24, "value": 24, "interactive": True},
                        "wavlm_base": {"maximum": 12, "value": 12, "interactive": True},
                        "wav2vec2_base": {"maximum": 12, "value": 12, "interactive": True},
                        "hubert_base": {"maximum": 12, "value": 12, "interactive": True},
                    }
                    config = model_configs.get(model_name, {"maximum": 12, "value": 12, "interactive": True})
                    return gr.Slider(
                        minimum=0,
                        maximum=config["maximum"],
                        value=config["value"],
                        step=1,
                        label=(
                            f"Layer (max: {config['maximum']}, default: {config['value']})"
                            if config["interactive"]
                            else "Layer (not applicable for raw features)"
                        ),
                        interactive=config["interactive"]
                    )

                model_dropdown.change(
                    fn=update_layer_slider,
                    inputs=[model_dropdown],
                    outputs=[layer_slider]
                )

                process_btn = gr.Button("Process Audio Files", variant="primary")

            with gr.Column():
                output_file = gr.File(
                    label="Download Results (ZIP)",
                    type="filepath"
                )
                status_text = gr.Textbox(
                    label="Status",
                    lines=3,
                    max_lines=10
                )

        process_btn.click(
            fn=process_audio_files,
            inputs=[file_input, model_dropdown, layer_slider, alpha_slider],
            outputs=[output_file, status_text]
        )

    return demo

if __name__ == "__main__":
    demo = create_interface()
    demo.launch()