# MAPSS-measures / app.py
import gradio as gr
import zipfile
import shutil
from pathlib import Path
import json
import os
import traceback
import gc
import torch
import spaces
# Project modules
from engine import compute_mapss_measures
from models import get_model_config, cleanup_all_models
from config import DEFAULT_ALPHA
from utils import clear_gpu_memory
@spaces.GPU(duration=300)
def process_audio_files(zip_file, model_name, layer, alpha):
    """Process an uploaded ZIP file containing audio mixtures."""
    if zip_file is None:
        return None, "Please upload a ZIP file"
    try:
        # Use a fixed extraction path
        extract_path = Path("/tmp/mapss_extract")
        if extract_path.exists():
            shutil.rmtree(extract_path)
        extract_path.mkdir(parents=True)

        # Extract the ZIP (gr.File may hand back a plain path string or an
        # object that exposes the path via .name)
        zip_path = zip_file.name if hasattr(zip_file, "name") else str(zip_file)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)

        # Find the references and outputs directories
        refs_dir = None
        outs_dir = None
        for item in extract_path.iterdir():
            if item.is_dir():
                if item.name.lower() in ['references', 'refs', 'reference']:
                    refs_dir = item
                elif item.name.lower() in ['outputs', 'outs', 'output', 'separated']:
                    outs_dir = item

        # Check one level deeper if not found (e.g., the ZIP wraps everything
        # in a single top-level folder)
        if refs_dir is None or outs_dir is None:
            for item in extract_path.iterdir():
                if item.is_dir():
                    for subitem in item.iterdir():
                        if subitem.is_dir():
                            if subitem.name.lower() in ['references', 'refs', 'reference']:
                                refs_dir = subitem
                            elif subitem.name.lower() in ['outputs', 'outs', 'output', 'separated']:
                                outs_dir = subitem
        if refs_dir is None or outs_dir is None:
            return None, "Could not find 'references' and 'outputs' directories in the ZIP file"

        # Collect the audio files
        ref_files = sorted(refs_dir.glob("*.wav"))
        out_files = sorted(outs_dir.glob("*.wav"))
        if len(ref_files) == 0:
            return None, "No reference WAV files found"
        if len(out_files) == 0:
            return None, "No output WAV files found"

        # Build the single-mixture manifest expected by the engine
        manifest = [{
            "mixture_id": "uploaded_mixture",
            "references": [str(f) for f in ref_files],
            "systems": {
                "uploaded_system": [str(f) for f in out_files]
            }
        }]

        # Validate the model name
        allowed_models = set(get_model_config(0).keys())
        if model_name not in allowed_models:
            return None, f"Invalid model. Allowed: {', '.join(sorted(allowed_models))}"

        # Resolve the layer; raw waveform features have no transformer layers
        if model_name == "raw":
            layer_final = 0
        else:
            model_defaults = {
                "wavlm": 24, "wav2vec2": 24, "hubert": 24,
                "wavlm_base": 12, "wav2vec2_base": 12, "hubert_base": 12,
                "wav2vec2_xlsr": 24
            }
            layer_final = layer if layer is not None else model_defaults.get(model_name, 12)

        # Check GPU availability
        max_gpus = 1 if torch.cuda.is_available() else 0

        # Run the experiment
        results_dir = compute_mapss_measures(
            models=[model_name],
            mixtures=manifest,
            layer=layer_final,
            alpha=alpha,
            verbose=True,
            max_gpus=max_gpus,
            add_ci=False
        )

        # Create the output ZIP at a fixed location
        output_zip = Path("/tmp/mapss_results.zip")
        with zipfile.ZipFile(output_zip, 'w') as zipf:
            results_path = Path(results_dir)
            files_added = 0
            # Add every file from the results directory
            for file_path in results_path.rglob("*"):
                if file_path.is_file():
                    arcname = str(file_path.relative_to(results_path.parent))
                    zipf.write(file_path, arcname)
                    files_added += 1

        if output_zip.exists() and files_added > 0:
            return str(output_zip), f"Processing completed! Created ZIP with {files_added} files."
        else:
            return None, "Processing completed but no output files were generated. Check if embeddings were computed."
    except Exception as e:
        error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
        return None, error_msg
    finally:
        # Release model weights and GPU memory between requests
        cleanup_all_models()
        clear_gpu_memory()
        gc.collect()
def create_interface():
    with gr.Blocks(title="MAPSS - Manifold-based Assessment of Perceptual Source Separation") as demo:
        gr.Markdown("""
# MAPSS: Manifold-based Assessment of Perceptual Source Separation
Granular evaluation of speech and music source separation with the MAPSS measures:
- **Perceptual Matching (PM)**: Measures how closely an output perceptually aligns with its reference. Range: 0-1, higher is better.
- **Perceptual Separation (PS)**: Measures how well an output is separated from its interfering references. Range: 0-1, higher is better.
## Input Format
Upload a ZIP file containing:
```
your_mixture.zip
├── references/ # Original clean sources
│ ├── speaker1.wav
│ ├── speaker2.wav
│ └── ...
└── outputs/ # Separated outputs from your algorithm
├── separated1.wav
├── separated2.wav
└── ...
```
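If you prefer to build the archive programmatically, here is a minimal sketch (directory names as above; the helper name and file paths are illustrative):
```python
import zipfile
from pathlib import Path

def make_mixture_zip(refs_dir, outs_dir, zip_path="your_mixture.zip"):
    # Bundle reference and output WAVs into the layout shown above.
    with zipfile.ZipFile(zip_path, "w") as zf:
        for wav in sorted(Path(refs_dir).glob("*.wav")):
            zf.write(wav, f"references/{wav.name}")
        for wav in sorted(Path(outs_dir).glob("*.wav")):
            zf.write(wav, f"outputs/{wav.name}")
    return zip_path
```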
### Audio Requirements
- Format: .wav files
- Sample rate: Any (automatically resampled to 16kHz)
- Channels: Mono or stereo (converted to mono)
- Number of files: Equal number of references and outputs
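If your sources are not in WAV format yet, a minimal conversion sketch (assumes the `soundfile` package; file names are illustrative, and the 16kHz resampling happens server-side):
```python
import soundfile as sf

data, sr = sf.read("input.flac")
if data.ndim > 1:
    data = data.mean(axis=1)  # stereo -> mono
sf.write("speaker1.wav", data, sr)
```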
## Output Format
The tool generates a ZIP file containing:
- `ps_scores_{model}.csv`: PS scores for each source
- `pm_scores_{model}.csv`: PM scores for each source
- `params.json`: Parameters used
- `manifest_canonical.json`: File mapping and processing details
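For quick inspection of the downloaded results, a minimal sketch (assumes `pandas` is installed and the archive was saved as `mapss_results.zip`; the exact column layout depends on the MAPSS version, so check `df.columns` first):
```python
import zipfile
from pathlib import Path
import pandas as pd

with zipfile.ZipFile("mapss_results.zip") as zf:
    zf.extractall("results")

# Load every PS/PM score table that was produced.
for csv_path in sorted(Path("results").rglob("*_scores_*.csv")):
    df = pd.read_csv(csv_path)
    print(csv_path.name, df.shape)
```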
## Available Models
| Model | Description | Default Layer | Use Case |
|-------|-------------|---------------|----------|
| `raw` | Raw waveform features | N/A | Baseline comparison |
| `wavlm` | WavLM Large | 24 | Strong performance |
| `wav2vec2` | Wav2Vec2 Large | 24 | Best overall performance |
| `hubert` | HuBERT Large | 24 | |
| `wavlm_base` | WavLM Base | 12 | |
| `wav2vec2_base` | Wav2Vec2 Base | 12 | Faster, good quality |
| `hubert_base` | HuBERT Base | 12 | |
| `wav2vec2_xlsr` | Wav2Vec2 XLSR-53 | 24 | Multilingual |
## Parameters
- **Model**: Select the embedding model for feature extraction
- **Layer**: Which transformer layer to use (auto-selected by default)
- **Alpha**: Diffusion maps parameter (0.0-1.0, default: 1.0)
- 0.0 = No normalization
- 1.0 = Full normalization (recommended)
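
In standard diffusion maps (Coifman & Lafon, 2006), alpha controls how strongly the sampling density is factored out of the affinity kernel, `k_alpha(x, y) = k(x, y) / (q(x)^alpha * q(y)^alpha)` with `q` the kernel density estimate, so `alpha = 1` recovers the data geometry independently of how densely it was sampled; this matches the normalization behavior described above.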
## Citation
If you use MAPSS, please cite:
```bibtex
@article{Ivry2025MAPSS,
title = {MAPSS: Manifold-based Assessment of Perceptual Source Separation},
author = {Ivry, Amir and Cornell, Samuele and Watanabe, Shinji},
journal = {arXiv preprint arXiv:2509.09212},
year = {2025},
url = {https://arxiv.org/abs/2509.09212}
}
```
## Limitations
- Processing time scales with the number of sources, audio length, and model size
## License
Code: MIT License
Paper: CC-BY-4.0
## Support
For issues, questions, or contributions, please visit the [GitHub repository](https://github.com/amir-ivry/MAPSS-measures).
""")
with gr.Row():
with gr.Column():
file_input = gr.File(
label="Upload ZIP file with audio mixtures",
file_types=[".zip"],
type="filepath"
)
model_dropdown = gr.Dropdown(
choices=["raw", "wavlm", "wav2vec2", "hubert",
"wavlm_base", "wav2vec2_base", "hubert_base",
"wav2vec2_xlsr"],
value="wav2vec2_base",
label="Select embedding model"
)
layer_slider = gr.Slider(
minimum=0,
maximum=12,
step=1,
value=12,
label="Layer (automatically set to model default)",
interactive=True
)
alpha_slider = gr.Slider(
minimum=0.0,
maximum=1.0,
step=0.1,
value=DEFAULT_ALPHA,
label="Diffusion maps alpha parameter"
)
def update_layer_slider(model_name):
"""Update layer slider based on selected model"""
model_configs = {
"raw": {"maximum": 0, "value": 0, "interactive": False},
"wavlm": {"maximum": 24, "value": 24, "interactive": True},
"wav2vec2": {"maximum": 24, "value": 24, "interactive": True},
"hubert": {"maximum": 24, "value": 24, "interactive": True},
"wav2vec2_xlsr": {"maximum": 24, "value": 24, "interactive": True},
"wavlm_base": {"maximum": 12, "value": 12, "interactive": True},
"wav2vec2_base": {"maximum": 12, "value": 12, "interactive": True},
"hubert_base": {"maximum": 12, "value": 12, "interactive": True}
}
config = model_configs.get(model_name, {"maximum": 12, "value": 12, "interactive": True})
return gr.Slider(
minimum=0,
maximum=config["maximum"],
value=config["value"],
step=1,
label=f"Layer (max: {config['maximum']}, default: {config['value']})" if config["interactive"] else "Layer (not applicable for raw features)",
interactive=config["interactive"]
)
model_dropdown.change(
fn=update_layer_slider,
inputs=[model_dropdown],
outputs=[layer_slider]
)
process_btn = gr.Button("Process Audio Files", variant="primary")
with gr.Column():
output_file = gr.File(
label="Download Results (ZIP)",
type="filepath"
)
status_text = gr.Textbox(
label="Status",
lines=3,
max_lines=10
)
process_btn.click(
fn=process_audio_files,
inputs=[file_input, model_dropdown, layer_slider, alpha_slider],
outputs=[output_file, status_text]
)
return demo
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()