import gradio as gr
import zipfile
import shutil
from pathlib import Path
import traceback
import gc

import torch
import spaces

# Project modules
from engine import compute_mapss_measures
from models import get_model_config, cleanup_all_models
from config import DEFAULT_ALPHA
from utils import clear_gpu_memory

@spaces.GPU(duration=300)
def process_audio_files(zip_file, model_name, layer, alpha):
    """Process an uploaded ZIP file containing audio mixtures."""
    if zip_file is None:
        return None, "Please upload a ZIP file"

    try:
        # Use a fixed extraction path
        extract_path = Path("/tmp/mapss_extract")
        if extract_path.exists():
            shutil.rmtree(extract_path)
        extract_path.mkdir(parents=True)

        # Extract the ZIP (gr.File with type="filepath" passes the path as a string)
        with zipfile.ZipFile(zip_file, "r") as zip_ref:
            zip_ref.extractall(extract_path)

        # Find the references and outputs directories
        refs_dir = None
        outs_dir = None
        for item in extract_path.iterdir():
            if item.is_dir():
                if item.name.lower() in ["references", "refs", "reference"]:
                    refs_dir = item
                elif item.name.lower() in ["outputs", "outs", "output", "separated"]:
                    outs_dir = item

        # Check one level deeper if not found (e.g., a ZIP with a single top-level folder)
        if refs_dir is None or outs_dir is None:
            for item in extract_path.iterdir():
                if item.is_dir():
                    for subitem in item.iterdir():
                        if subitem.is_dir():
                            if subitem.name.lower() in ["references", "refs", "reference"]:
                                refs_dir = subitem
                            elif subitem.name.lower() in ["outputs", "outs", "output", "separated"]:
                                outs_dir = subitem

        if refs_dir is None or outs_dir is None:
            return None, "Could not find 'references' and 'outputs' directories in the ZIP file"

        # Collect audio files
        ref_files = sorted(refs_dir.glob("*.wav"))
        out_files = sorted(outs_dir.glob("*.wav"))
        if len(ref_files) == 0:
            return None, "No reference WAV files found"
        if len(out_files) == 0:
            return None, "No output WAV files found"
        if len(ref_files) != len(out_files):
            return None, (
                f"Number of reference files ({len(ref_files)}) must match number of "
                f"output files ({len(out_files)}). Files must be in the same order."
            )

        # Create the manifest expected by compute_mapss_measures
        manifest = [{
            "mixture_id": "uploaded_mixture",
            "references": [str(f) for f in ref_files],
            "systems": {
                "uploaded_system": [str(f) for f in out_files]
            }
        }]

        # Validate the model name
        allowed_models = set(get_model_config(0).keys())
        if model_name not in allowed_models:
            return None, f"Invalid model. Allowed: {', '.join(sorted(allowed_models))}"

        # Resolve the layer: 0 for raw features, otherwise the model default unless overridden
        if model_name == "raw":
            layer_final = 0
        else:
            model_defaults = {
                "wavlm": 24, "wav2vec2": 24, "hubert": 24,
                "wavlm_base": 12, "wav2vec2_base": 12, "hubert_base": 12,
                "wav2vec2_xlsr": 24,
            }
            layer_final = layer if layer is not None else model_defaults.get(model_name, 12)

        # Use all GPUs available on the Space
        max_gpus = torch.cuda.device_count() if torch.cuda.is_available() else 0

        # Run the experiment
        results_dir = compute_mapss_measures(
            models=[model_name],
            mixtures=manifest,
            layer=layer_final,
            alpha=alpha,
            verbose=True,
            max_gpus=max_gpus,
            add_ci=False,  # Disable confidence intervals for faster demo processing
        )

        # Create the output ZIP at a fixed location
        output_zip = Path("/tmp/mapss_results.zip")
        with zipfile.ZipFile(output_zip, "w") as zipf:
            results_path = Path(results_dir)
            files_added = 0
            for file_path in results_path.rglob("*"):
                if file_path.is_file():
                    arcname = str(file_path.relative_to(results_path.parent))
                    zipf.write(file_path, arcname)
                    files_added += 1

        if output_zip.exists() and files_added > 0:
            return str(output_zip), (
                f"Processing completed! Created ZIP with {files_added} files. "
                "Note: output files must be in the same order as reference files."
            )
        return None, "Processing completed but no output files were generated. Check if embeddings were computed."
    except Exception as e:
        error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
        return None, error_msg
    finally:
        cleanup_all_models()
        clear_gpu_memory()
        gc.collect()

def create_interface():
    with gr.Blocks(title="MAPSS: Manifold-based Assessment of Perceptual Source Separation") as demo:
        gr.Markdown("""
# MAPSS: Manifold-based Assessment of Perceptual Source Separation
Granular evaluation of speech and music source separation with the MAPSS measures:
- **Perceptual Matching (PM)**: Measures how closely an output perceptually aligns with its reference. Range: 0-1, higher is better.
- **Perceptual Similarity (PS)**: Measures how well an output is separated from its interfering references. Range: 0-1, higher is better.
## ⚠️ IMPORTANT: File Order Requirements
**Output files MUST be in the same order as reference files!**
- If references are: `speaker1.wav`, `speaker2.wav`, `speaker3.wav`
- Then outputs must be: `output1.wav`, `output2.wav`, `output3.wav`
- Where `output1` corresponds to `speaker1`, `output2` to `speaker2`, etc.
## Input Format
Upload a ZIP file containing:
```
your_mixture.zip
├── references/ # Original clean sources
│ ├── speaker1.wav
│ ├── speaker2.wav
│ └── ...
└── outputs/ # Separated outputs (SAME ORDER as references)
├── separated1.wav # Must correspond to speaker1.wav
├── separated2.wav # Must correspond to speaker2.wav
└── ...
```
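To package a mixture programmatically, here is a minimal Python sketch (directory and file names are placeholders following the layout above):

```python
import zipfile
from pathlib import Path

# Preserve the two top-level folders so they can be found after extraction
with zipfile.ZipFile("your_mixture.zip", "w") as zf:
    for wav in sorted(Path("references").glob("*.wav")):
        zf.write(wav, f"references/{wav.name}")
    for wav in sorted(Path("outputs").glob("*.wav")):
        zf.write(wav, f"outputs/{wav.name}")
```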
### Audio Requirements
- Format: .wav files
- Sample rate: Any (automatically resampled to 16kHz)
- Channels: Mono or stereo (converted to mono)
- **Number of files: Equal number of references and outputs**
- **Order: Output files must be in the same order as reference files**
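Pre-converting is optional since the app resamples and downmixes for you; if you prefer to do it yourself, a sketch using `librosa` and `soundfile` (neither is required by this app):

```python
import librosa
import soundfile as sf

# librosa resamples to 16 kHz and downmixes to mono on load
audio, sr = librosa.load("speaker1.wav", sr=16000, mono=True)
sf.write("speaker1_16k_mono.wav", audio, sr)
```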
## Output Format
The tool generates a ZIP file containing:
- `ps_scores_{model}.csv`: PS scores for each source over time
- `pm_scores_{model}.csv`: PM scores for each source over time
- `params.json`: Parameters used
- `manifest_canonical.json`: File mapping and processing details
### Score Interpretation
- **NaN values**: Appear in frames where fewer than 2 speakers are active
- **Valid scores**: Only computed when at least 2 speakers are active in a frame
- **Time resolution**: 20ms frames (configurable in code)
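A minimal sketch for inspecting the score CSVs with `pandas`, assuming rows are frames and columns are sources (see `manifest_canonical.json` for the exact file mapping):

```python
import pandas as pd

ps = pd.read_csv("ps_scores_wav2vec2_base.csv")
pm = pd.read_csv("pm_scores_wav2vec2_base.csv")

# Per-source means; pandas skips NaN frames (fewer than 2 active speakers) by default
print(ps.mean(numeric_only=True))
print(pm.mean(numeric_only=True))
```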
## Available Models
| Model | Description | Default Layer | Use Case |
|-------|-------------|---------------|----------|
| `raw` | Raw waveform features | N/A | Baseline comparison |
| `wavlm` | WavLM Large | 24 | Strong performance |
| `wav2vec2` | Wav2Vec2 Large | 24 | Best overall performance |
| `hubert` | HuBERT Large | 24 | Good for speech |
| `wavlm_base` | WavLM Base | 12 | Faster processing |
| `wav2vec2_base` | Wav2Vec2 Base | 12 | Faster, good quality |
| `hubert_base` | HuBERT Base | 12 | Faster processing |
| `wav2vec2_xlsr` | Wav2Vec2 XLSR-53 | 24 | Multilingual |
## Parameters
- **Model**: Select the embedding model for feature extraction
- **Layer**: Which transformer layer to use (auto-selected by default)
- **Alpha**: Diffusion maps parameter (0.0-1.0, default: 1.0)
- 0.0 = No normalization
- 1.0 = Full normalization (recommended)
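For intuition, alpha is the density-normalization exponent of the diffusion-maps kernel (Coifman & Lafon); a schematic NumPy sketch of that normalization, not this app's implementation:

```python
import numpy as np

def alpha_normalize(K: np.ndarray, alpha: float) -> np.ndarray:
    """K_alpha[i, j] = K[i, j] / (d[i]**alpha * d[j]**alpha), with d the row sums of K."""
    d = K.sum(axis=1)
    return K / np.outer(d ** alpha, d ** alpha)
```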
## Processing Notes
- The system automatically detects which speakers are active in each frame
- PS/PM scores are only computed between active speakers
- Processing time scales with number of sources and audio length
- GPU acceleration is automatically used when available
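This Space is a thin wrapper around `compute_mapss_measures`; an equivalent direct call, with placeholder file paths:

```python
from engine import compute_mapss_measures

manifest = [{
    "mixture_id": "my_mixture",
    "references": ["references/speaker1.wav", "references/speaker2.wav"],
    "systems": {"my_system": ["outputs/separated1.wav", "outputs/separated2.wav"]},
}]
results_dir = compute_mapss_measures(
    models=["wav2vec2_base"],  # any model from the table above
    mixtures=manifest,
    layer=12,                  # model default; use 0 for "raw"
    alpha=1.0,
    verbose=True,
    max_gpus=1,                # set to the number of available GPUs
    add_ci=False,              # confidence intervals off, as in this demo
)
```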
## Citation
If you use MAPSS, please cite:
```bibtex
@article{Ivry2025MAPSS,
title = {MAPSS: Manifold-based Assessment of Perceptual Source Separation},
author = {Ivry, Amir and Cornell, Samuele and Watanabe, Shinji},
journal = {arXiv preprint arXiv:2509.09212},
year = {2025},
url = {https://arxiv.org/abs/2509.09212}
}
```
## License
Code: MIT License
Paper: CC-BY-4.0
## Support
For issues, questions, or contributions, please visit the [GitHub repository](https://github.com/amir-ivry/MAPSS-measures).
""")
        with gr.Row():
            with gr.Column():
                file_input = gr.File(
                    label="Upload ZIP file with audio mixtures",
                    file_types=[".zip"],
                    type="filepath"
                )
                model_dropdown = gr.Dropdown(
                    choices=["raw", "wavlm", "wav2vec2", "hubert",
                             "wavlm_base", "wav2vec2_base", "hubert_base",
                             "wav2vec2_xlsr"],
                    value="wav2vec2_base",
                    label="Select embedding model"
                )
                layer_slider = gr.Slider(
                    minimum=0,
                    maximum=12,
                    step=1,
                    value=12,
                    label="Layer (automatically set to model default)",
                    interactive=True
                )
                alpha_slider = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    step=0.1,
                    value=DEFAULT_ALPHA,
                    label="Diffusion maps alpha parameter"
                )

                def update_layer_slider(model_name):
                    """Update the layer slider's range and default for the selected model."""
                    model_configs = {
                        "raw": {"maximum": 0, "value": 0, "interactive": False},
                        "wavlm": {"maximum": 24, "value": 24, "interactive": True},
                        "wav2vec2": {"maximum": 24, "value": 24, "interactive": True},
                        "hubert": {"maximum": 24, "value": 24, "interactive": True},
                        "wav2vec2_xlsr": {"maximum": 24, "value": 24, "interactive": True},
                        "wavlm_base": {"maximum": 12, "value": 12, "interactive": True},
                        "wav2vec2_base": {"maximum": 12, "value": 12, "interactive": True},
                        "hubert_base": {"maximum": 12, "value": 12, "interactive": True},
                    }
                    config = model_configs.get(model_name, {"maximum": 12, "value": 12, "interactive": True})
                    return gr.Slider(
                        minimum=0,
                        maximum=config["maximum"],
                        value=config["value"],
                        step=1,
                        label=(
                            f"Layer (max: {config['maximum']}, default: {config['value']})"
                            if config["interactive"]
                            else "Layer (not applicable for raw features)"
                        ),
                        interactive=config["interactive"]
                    )

                model_dropdown.change(
                    fn=update_layer_slider,
                    inputs=[model_dropdown],
                    outputs=[layer_slider]
                )

                process_btn = gr.Button("Process Audio Files", variant="primary")

            with gr.Column():
                output_file = gr.File(
                    label="Download Results (ZIP)",
                    type="filepath"
                )
                status_text = gr.Textbox(
                    label="Status",
                    lines=3,
                    max_lines=10
                )

        process_btn.click(
            fn=process_audio_files,
            inputs=[file_input, model_dropdown, layer_slider, alpha_slider],
            outputs=[output_file, status_text]
        )

    return demo

if __name__ == "__main__":
    demo = create_interface()
    demo.launch()