# MAPSS-measures / app.py
import gradio as gr
import zipfile
import shutil
from pathlib import Path
import json
import os
import traceback
import gc
import torch
import spaces
# Project modules
from engine import compute_mapss_measures
from models import get_model_config, cleanup_all_models
from config import DEFAULT_ALPHA
from utils import clear_gpu_memory
@spaces.GPU(duration=300)
def process_audio_files(zip_file, model_name, layer, alpha):
    """Process an uploaded ZIP file containing audio mixtures."""
    if zip_file is None:
        return None, "Please upload a ZIP file"
    try:
        # Use a fixed extraction path
        extract_path = Path("/tmp/mapss_extract")
        if extract_path.exists():
            shutil.rmtree(extract_path)
        extract_path.mkdir(parents=True)

        # Extract the ZIP (gr.File may hand back a plain path string or an
        # object that exposes the path via .name)
        zip_path = zip_file.name if hasattr(zip_file, "name") else str(zip_file)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)

        # Find the references and outputs directories
        refs_dir = None
        outs_dir = None
        for item in extract_path.iterdir():
            if item.is_dir():
                if item.name.lower() in ['references', 'refs', 'reference']:
                    refs_dir = item
                elif item.name.lower() in ['outputs', 'outs', 'output', 'separated']:
                    outs_dir = item

        # Check one level deeper if not found (e.g., the ZIP wraps everything
        # in a single top-level folder)
        if refs_dir is None or outs_dir is None:
            for item in extract_path.iterdir():
                if item.is_dir():
                    for subitem in item.iterdir():
                        if subitem.is_dir():
                            if subitem.name.lower() in ['references', 'refs', 'reference']:
                                refs_dir = subitem
                            elif subitem.name.lower() in ['outputs', 'outs', 'output', 'separated']:
                                outs_dir = subitem
        if refs_dir is None or outs_dir is None:
            return None, "Could not find 'references' and 'outputs' directories in the ZIP file"

        # Collect the audio files
        ref_files = sorted(refs_dir.glob("*.wav"))
        out_files = sorted(outs_dir.glob("*.wav"))
        if len(ref_files) == 0:
            return None, "No reference WAV files found"
        if len(out_files) == 0:
            return None, "No output WAV files found"

        # Build the single-mixture manifest expected by the engine
        manifest = [{
            "mixture_id": "uploaded_mixture",
            "references": [str(f) for f in ref_files],
            "systems": {
                "uploaded_system": [str(f) for f in out_files]
            }
        }]

        # Validate the model name
        allowed_models = set(get_model_config(0).keys())
        if model_name not in allowed_models:
            return None, f"Invalid model. Allowed: {', '.join(sorted(allowed_models))}"

        # Resolve the layer; raw waveform features have no transformer layers
        if model_name == "raw":
            layer_final = 0
        else:
            model_defaults = {
                "wavlm": 24, "wav2vec2": 24, "hubert": 24,
                "wavlm_base": 12, "wav2vec2_base": 12, "hubert_base": 12,
                "wav2vec2_xlsr": 24
            }
            layer_final = layer if layer is not None else model_defaults.get(model_name, 12)

        # Check GPU availability
        max_gpus = 1 if torch.cuda.is_available() else 0

        # Run the experiment
        results_dir = compute_mapss_measures(
            models=[model_name],
            mixtures=manifest,
            layer=layer_final,
            alpha=alpha,
            verbose=True,
            max_gpus=max_gpus,
            add_ci=False
        )

        # Create the output ZIP at a fixed location
        output_zip = Path("/tmp/mapss_results.zip")
        with zipfile.ZipFile(output_zip, 'w') as zipf:
            results_path = Path(results_dir)
            files_added = 0
            # Add every file from the results directory
            for file_path in results_path.rglob("*"):
                if file_path.is_file():
                    arcname = str(file_path.relative_to(results_path.parent))
                    zipf.write(file_path, arcname)
                    files_added += 1

        if output_zip.exists() and files_added > 0:
            return str(output_zip), f"Processing completed! Created ZIP with {files_added} files."
        else:
            return None, "Processing completed but no output files were generated. Check if embeddings were computed."
    except Exception as e:
        error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
        return None, error_msg
    finally:
        # Release model weights and GPU memory between requests
        cleanup_all_models()
        clear_gpu_memory()
        gc.collect()
def create_interface():
    with gr.Blocks(title="MAPSS - Manifold-based Assessment of Perceptual Source Separation") as demo:
        gr.Markdown("""
# MAPSS: Manifold-based Assessment of Perceptual Source Separation
Granular evaluation of speech and music source separation with the MAPSS measures:
- **Perceptual Matching (PM)**: Measures how closely an output perceptually aligns with its reference. Range: 0-1, higher is better.
- **Perceptual Separation (PS)**: Measures how well an output is separated from its interfering references. Range: 0-1, higher is better.
## Input Format
Upload a ZIP file containing:
```
your_mixture.zip
├── references/ # Original clean sources
│ ├── speaker1.wav
│ ├── speaker2.wav
│ └── ...
└── outputs/ # Separated outputs from your algorithm
├── separated1.wav
├── separated2.wav
└── ...
```
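If you prefer to build the archive programmatically, here is a minimal sketch (directory names as above; the helper name and file paths are illustrative):
```python
import zipfile
from pathlib import Path

def make_mixture_zip(refs_dir, outs_dir, zip_path="your_mixture.zip"):
    # Bundle reference and output WAVs into the layout shown above.
    with zipfile.ZipFile(zip_path, "w") as zf:
        for wav in sorted(Path(refs_dir).glob("*.wav")):
            zf.write(wav, f"references/{wav.name}")
        for wav in sorted(Path(outs_dir).glob("*.wav")):
            zf.write(wav, f"outputs/{wav.name}")
    return zip_path
```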
### Audio Requirements
- Format: .wav files
- Sample rate: Any (automatically resampled to 16kHz)
- Channels: Mono or stereo (converted to mono)
- Number of files: Equal number of references and outputs
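If your sources are not in WAV format yet, a minimal conversion sketch (assumes the `soundfile` package; file names are illustrative, and the 16kHz resampling happens server-side):
```python
import soundfile as sf

data, sr = sf.read("input.flac")
if data.ndim > 1:
    data = data.mean(axis=1)  # stereo -> mono
sf.write("speaker1.wav", data, sr)
```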
## Output Format
The tool generates a ZIP file containing:
- `ps_scores_{model}.csv`: PS scores for each source
- `pm_scores_{model}.csv`: PM scores for each source
- `params.json`: Parameters used
- `manifest_canonical.json`: File mapping and processing details
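For quick inspection of the downloaded results, a minimal sketch (assumes `pandas` is installed and the archive was saved as `mapss_results.zip`; the exact column layout depends on the MAPSS version, so check `df.columns` first):
```python
import zipfile
from pathlib import Path
import pandas as pd

with zipfile.ZipFile("mapss_results.zip") as zf:
    zf.extractall("results")

# Load every PS/PM score table that was produced.
for csv_path in sorted(Path("results").rglob("*_scores_*.csv")):
    df = pd.read_csv(csv_path)
    print(csv_path.name, df.shape)
```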
## Available Models
| Model | Description | Default Layer | Use Case |
|-------|-------------|---------------|----------|
| `raw` | Raw waveform features | N/A | Baseline comparison |
| `wavlm` | WavLM Large | 24 | Strong performance |
| `wav2vec2` | Wav2Vec2 Large | 24 | Best overall performance |
| `hubert` | HuBERT Large | 24 | |
| `wavlm_base` | WavLM Base | 12 | |
| `wav2vec2_base` | Wav2Vec2 Base | 12 | Faster, good quality |
| `hubert_base` | HuBERT Base | 12 | |
| `wav2vec2_xlsr` | Wav2Vec2 XLSR-53 | 24 | Multilingual |
## Parameters
- **Model**: Select the embedding model for feature extraction
- **Layer**: Which transformer layer to use (auto-selected by default)
- **Alpha**: Diffusion maps parameter (0.0-1.0, default: 1.0)
- 0.0 = No normalization
- 1.0 = Full normalization (recommended)
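
In standard diffusion maps (Coifman & Lafon, 2006), alpha controls how strongly the sampling density is factored out of the affinity kernel, `k_alpha(x, y) = k(x, y) / (q(x)^alpha * q(y)^alpha)` with `q` the kernel density estimate, so `alpha = 1` recovers the data geometry independently of how densely it was sampled; this matches the normalization behavior described above.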
## Citation
If you use MAPSS, please cite:
```bibtex
@article{Ivry2025MAPSS,
title = {MAPSS: Manifold-based Assessment of Perceptual Source Separation},
author = {Ivry, Amir and Cornell, Samuele and Watanabe, Shinji},
journal = {arXiv preprint arXiv:2509.09212},
year = {2025},
url = {https://arxiv.org/abs/2509.09212}
}
```
## Limitations
- Processing time scales with the number of sources, audio length, and model size
## License
Code: MIT License
Paper: CC-BY-4.0
## Support
For issues, questions, or contributions, please visit the [GitHub repository](https://github.com/amir-ivry/MAPSS-measures).
""")
with gr.Row():
with gr.Column():
file_input = gr.File(
label="Upload ZIP file with audio mixtures",
file_types=[".zip"],
type="filepath"
)
model_dropdown = gr.Dropdown(
choices=["raw", "wavlm", "wav2vec2", "hubert",
"wavlm_base", "wav2vec2_base", "hubert_base",
"wav2vec2_xlsr"],
value="wav2vec2_base",
label="Select embedding model"
)
layer_slider = gr.Slider(
minimum=0,
maximum=12,
step=1,
value=12,
label="Layer (automatically set to model default)",
interactive=True
)
alpha_slider = gr.Slider(
minimum=0.0,
maximum=1.0,
step=0.1,
value=DEFAULT_ALPHA,
label="Diffusion maps alpha parameter"
)
def update_layer_slider(model_name):
"""Update layer slider based on selected model"""
model_configs = {
"raw": {"maximum": 0, "value": 0, "interactive": False},
"wavlm": {"maximum": 24, "value": 24, "interactive": True},
"wav2vec2": {"maximum": 24, "value": 24, "interactive": True},
"hubert": {"maximum": 24, "value": 24, "interactive": True},
"wav2vec2_xlsr": {"maximum": 24, "value": 24, "interactive": True},
"wavlm_base": {"maximum": 12, "value": 12, "interactive": True},
"wav2vec2_base": {"maximum": 12, "value": 12, "interactive": True},
"hubert_base": {"maximum": 12, "value": 12, "interactive": True}
}
config = model_configs.get(model_name, {"maximum": 12, "value": 12, "interactive": True})
return gr.Slider(
minimum=0,
maximum=config["maximum"],
value=config["value"],
step=1,
label=f"Layer (max: {config['maximum']}, default: {config['value']})" if config["interactive"] else "Layer (not applicable for raw features)",
interactive=config["interactive"]
)
model_dropdown.change(
fn=update_layer_slider,
inputs=[model_dropdown],
outputs=[layer_slider]
)
process_btn = gr.Button("Process Audio Files", variant="primary")
with gr.Column():
output_file = gr.File(
label="Download Results (ZIP)",
type="filepath"
)
status_text = gr.Textbox(
label="Status",
lines=3,
max_lines=10
)
process_btn.click(
fn=process_audio_files,
inputs=[file_input, model_dropdown, layer_slider, alpha_slider],
outputs=[output_file, status_text]
)
return demo
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()