import difflib
import html
import json
from pathlib import Path

import gradio as gr
| |
|
| | |
| | with open("audio_samples/stt_comparison_results.json", "r") as f: |
| | stt_results = json.load(f) |
| |
|
| | |
| | original_transcript = stt_results["original"]["text"] |
| | enhanced_transcript = stt_results["enhanced"]["text"] |
| |
|
def generate_diff_html(text1, text2):
    """Generate HTML markup showing word-level differences between two texts.

    Words present only in ``text1`` are rendered red with strikethrough;
    words present only in ``text2`` are rendered with a green highlight.
    All transcript text is HTML-escaped so characters like ``<`` or ``&``
    cannot break (or inject into) the surrounding markup.

    Args:
        text1: The "before" text (e.g. the original transcript).
        text2: The "after" text (e.g. the enhanced transcript).

    Returns:
        An HTML string with ``<span>`` markup around the differing words.
    """
    words1 = text1.split(' ')
    words2 = text2.split(' ')

    # Compare at word granularity so highlights align with whole words.
    sm = difflib.SequenceMatcher(None, words1, words2)

    html_output = []
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        if tag == 'equal':
            html_output.append(html.escape(' '.join(words1[i1:i2])))
        elif tag == 'delete':
            deleted_text = html.escape(' '.join(words1[i1:i2]))
            html_output.append(f'<span style="background-color: #ffcccc; text-decoration: line-through;">{deleted_text}</span>')
        elif tag == 'insert':
            inserted_text = html.escape(' '.join(words2[j1:j2]))
            html_output.append(f'<span style="background-color: #ccffcc;">{inserted_text}</span>')
        elif tag == 'replace':
            deleted_text = html.escape(' '.join(words1[i1:i2]))
            inserted_text = html.escape(' '.join(words2[j1:j2]))
            html_output.append(f'<span style="background-color: #ffcccc; text-decoration: line-through;">{deleted_text}</span>')
            # Keep the removed and added spans visually separated.
            html_output.append(' ')
            html_output.append(f'<span style="background-color: #ccffcc;">{inserted_text}</span>')

        # Re-insert the separator that split(' ') consumed between segments.
        # Always appending one space keeps adjacent segments separated
        # (including an insert that follows the final equal run); the one
        # trailing space is removed by strip() below.
        html_output.append(' ')

    return ''.join(html_output).strip()
| |
|
| | |
# The transcripts are static, so compute the diff once at startup rather
# than per request.
diff_html = generate_diff_html(original_transcript, enhanced_transcript)

# Demo layout: intro text, side-by-side original vs. enhanced audio with
# their Whisper transcripts, then a highlighted transcript diff and notes.
with gr.Blocks(title="Baby Noise Cancellation Demo") as demo:
    gr.Markdown("""
    # Baby Noise Cancellation Demo

    This demo showcases the effectiveness of AI-powered noise removal for cleaning audio recordings
    with baby crying in the background. The goal is to produce clean enough audio for successful
    speech-to-text (STT) transcription.

    ## Use Case
    Parents using voice technology often face challenges when children start fussing during dictation.
    This demo tests whether deep learning models can remove baby noise while preserving speech quality
    for accurate STT transcription.
    """)

    with gr.Row():
        # Left column: the raw recording and its transcript.
        with gr.Column():
            gr.Markdown("### Original Audio (with baby crying)")
            original_audio = gr.Audio(
                value="audio_samples/original-note.mp3",
                label="Original Recording",
                type="filepath"
            )
            gr.Markdown("#### Whisper STT Transcript (Original)")
            original_text = gr.Textbox(
                value=original_transcript,
                label="Transcript",
                lines=10,
                max_lines=15
            )

        # Right column: the noise-cancelled recording and its transcript.
        with gr.Column():
            gr.Markdown("### DeepFilterNet2 Processed Audio")
            enhanced_audio = gr.Audio(
                value="audio_samples/enhanced_output.mp3",
                label="Noise-Cancelled Recording",
                type="filepath"
            )
            gr.Markdown("#### Whisper STT Transcript (Enhanced)")
            enhanced_text = gr.Textbox(
                value=enhanced_transcript,
                label="Transcript",
                lines=10,
                max_lines=15
            )

    gr.Markdown("""
    ## Results

    Compare the two audio samples and their transcripts:

    - **Left**: Original recording with baby crying in the background
    - **Right**: AI-processed audio with baby noise significantly reduced

    The transcripts show that both versions are successfully transcribed by Whisper,
    but the enhanced version provides clearer audio quality with reduced background noise.

    ### Transcript Differences

    Below is a word-level comparison of the two transcripts:
    - <span style="background-color: #ffcccc; text-decoration: line-through;">Red with strikethrough</span>: Text in original transcript only
    - <span style="background-color: #ccffcc;">Green highlight</span>: Text in enhanced transcript only
    """)

    # Render the pre-computed diff inside a styled container; pre-wrap
    # preserves the transcript's spacing while still wrapping long lines.
    gr.HTML(value=f'<div style="padding: 15px; border: 1px solid #ddd; border-radius: 5px; background-color: #f9f9f9; font-family: monospace; line-height: 1.8; white-space: pre-wrap;">{diff_html}</div>', label="Transcript Comparison")

    gr.Markdown("""
    ## Technology

    This demonstration uses **DeepFilterNet2** for audio noise removal, processing the audio to isolate
    and preserve speech while suppressing baby crying frequencies.

    - **DeepFilterNet2**: Official repository at [Rikorose/DeepFilterNet](https://github.com/Rikorose/DeepFilterNet)
    - **Processing Space**: Audio processed using [drewThomasson/DeepFilterNet2_no_limit](https://huggingface.co/spaces/drewThomasson/DeepFilterNet2_no_limit)

    ### Citation

    ```bibtex
    @inproceedings{schroeter2022deepfilternet2,
      title = {{DeepFilterNet2}: Towards Real-Time Speech Enhancement on Embedded Devices for Full-Band Audio},
      author = {Schröter, Hendrik and Escalante-B., Alberto N. and Rosenkranz, Tobias and Maier, Andreas},
      booktitle={17th International Workshop on Acoustic Signal Enhancement (IWAENC 2022)},
      year = {2022},
    }
    ```

    ## About

    **Audio recorded by**: Daniel Rosehill - October 28th, 2025

    Created by [Daniel Rosehill](https://danielrosehill.com) ([GitHub](https://github.com/danielrosehill)) to explore practical solutions for voice technology in real-world parenting scenarios.

    ---

    *This is a proof-of-concept demonstration showing AI-powered audio cleaning for practical
    voice technology applications.*
    """)

# Only start the web server when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()
| |
|