# NOTE: the three lines below are page/commit metadata accidentally captured from
# the hosting site (not code); kept here as a comment so the file parses:
#   danielrosehill's picture — "Add academic citation for DeepFilterNet2" (2657417)
import difflib
import html
import json
from pathlib import Path

import gradio as gr
# Load the precomputed Whisper STT comparison results from disk at import time.
# The key accesses below establish the expected shape:
# {"original": {"text": ...}, "enhanced": {"text": ...}}
with open("audio_samples/stt_comparison_results.json", "r") as f:
    stt_results = json.load(f)
# Transcript of the raw recording (baby noise present).
original_transcript = stt_results["original"]["text"]
# Transcript of the DeepFilterNet2-processed recording.
enhanced_transcript = stt_results["enhanced"]["text"]
def generate_diff_html(text1, text2):
    """Return an HTML fragment visualizing word-level differences between two texts.

    Words present only in ``text1`` are wrapped in a red strikethrough span;
    words present only in ``text2`` are wrapped in a green highlight span.
    All segment text is passed through ``html.escape`` so transcripts that
    contain ``<``, ``>`` or ``&`` cannot break or inject markup into the page.

    Args:
        text1: Baseline text (e.g. the original transcript).
        text2: Comparison text (e.g. the enhanced transcript).

    Returns:
        str: HTML markup with highlighted word-level insertions and deletions.
    """
    # Span templates — styling kept identical to the legend shown in the UI.
    deleted_span = '<span style="background-color: #ffcccc; text-decoration: line-through;">{}</span>'
    inserted_span = '<span style="background-color: #ccffcc;">{}</span>'
    # Split on single spaces so the diff operates on words, not characters.
    words1 = text1.split(' ')
    words2 = text2.split(' ')
    sm = difflib.SequenceMatcher(None, words1, words2)
    html_output = []
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        if tag == 'equal':
            html_output.append(html.escape(' '.join(words1[i1:i2])))
        elif tag == 'delete':
            html_output.append(deleted_span.format(html.escape(' '.join(words1[i1:i2]))))
        elif tag == 'insert':
            html_output.append(inserted_span.format(html.escape(' '.join(words2[j1:j2]))))
        elif tag == 'replace':
            html_output.append(deleted_span.format(html.escape(' '.join(words1[i1:i2]))))
            # Separate the two spans so the removed and replacement words do
            # not render jammed together.
            html_output.append(' ')
            html_output.append(inserted_span.format(html.escape(' '.join(words2[j1:j2]))))
        # Add a separating space between segments. The j2 check covers an
        # insert that follows a final equal run in words1; any trailing space
        # is removed by the strip() below.
        if tag != 'equal' or i2 < len(words1) or j2 < len(words2):
            html_output.append(' ')
    return ''.join(html_output).strip()
# Precompute the transcript diff once at startup; rendered via gr.HTML below.
diff_html = generate_diff_html(original_transcript, enhanced_transcript)
# Build the Gradio interface. Layout: intro text, side-by-side audio players
# with their Whisper transcripts (original left, enhanced right), the
# highlighted transcript diff, then technology notes and citation.
with gr.Blocks(title="Baby Noise Cancellation Demo") as demo:
    # Intro / use-case description.
    gr.Markdown("""
# Baby Noise Cancellation Demo
This demo showcases the effectiveness of AI-powered noise removal for cleaning audio recordings
with baby crying in the background. The goal is to produce clean enough audio for successful
speech-to-text (STT) transcription.
## Use Case
Parents using voice technology often face challenges when children start fussing during dictation.
This demo tests whether deep learning models can remove baby noise while preserving speech quality
for accurate STT transcription.
""")
    # Side-by-side comparison of the two recordings and their transcripts.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Original Audio (with baby crying)")
            original_audio = gr.Audio(
                value="audio_samples/original-note.mp3",
                label="Original Recording",
                type="filepath"
            )
            gr.Markdown("#### Whisper STT Transcript (Original)")
            original_text = gr.Textbox(
                value=original_transcript,
                label="Transcript",
                lines=10,
                max_lines=15
            )
        with gr.Column():
            gr.Markdown("### DeepFilterNet2 Processed Audio")
            enhanced_audio = gr.Audio(
                value="audio_samples/enhanced_output.mp3",
                label="Noise-Cancelled Recording",
                type="filepath"
            )
            gr.Markdown("#### Whisper STT Transcript (Enhanced)")
            enhanced_text = gr.Textbox(
                value=enhanced_transcript,
                label="Transcript",
                lines=10,
                max_lines=15
            )
    # Results summary plus legend for the diff colors below.
    # FIX: the diff is word-level (generate_diff_html splits on spaces), not
    # character-level as the text previously claimed.
    gr.Markdown("""
## Results
Compare the two audio samples and their transcripts:
- **Left**: Original recording with baby crying in the background
- **Right**: AI-processed audio with baby noise significantly reduced
The transcripts show that both versions are successfully transcribed by Whisper,
but the enhanced version provides clearer audio quality with reduced background noise.
### Transcript Differences
Below is a word-level comparison of the two transcripts:
- <span style="background-color: #ffcccc; text-decoration: line-through;">Red with strikethrough</span>: Text in original transcript only
- <span style="background-color: #ccffcc;">Green highlight</span>: Text in enhanced transcript only
""")
    # Pre-rendered diff (computed once at startup by generate_diff_html).
    gr.HTML(value=f'<div style="padding: 15px; border: 1px solid #ddd; border-radius: 5px; background-color: #f9f9f9; font-family: monospace; line-height: 1.8; white-space: pre-wrap;">{diff_html}</div>', label="Transcript Comparison")
    # Technology notes, academic citation, and attribution.
    gr.Markdown("""
## Technology
This demonstration uses **DeepFilterNet2** for audio noise removal, processing the audio to isolate
and preserve speech while suppressing baby crying frequencies.
- **DeepFilterNet2**: Official repository at [Rikorose/DeepFilterNet](https://github.com/Rikorose/DeepFilterNet)
- **Processing Space**: Audio processed using [drewThomasson/DeepFilterNet2_no_limit](https://huggingface.co/spaces/drewThomasson/DeepFilterNet2_no_limit)
### Citation
```bibtex
@inproceedings{schroeter2022deepfilternet2,
title = {{DeepFilterNet2}: Towards Real-Time Speech Enhancement on Embedded Devices for Full-Band Audio},
author = {Schröter, Hendrik and Escalante-B., Alberto N. and Rosenkranz, Tobias and Maier, Andreas},
booktitle={17th International Workshop on Acoustic Signal Enhancement (IWAENC 2022)},
year = {2022},
}
```
## About
**Audio recorded by**: Daniel Rosehill - October 28th, 2025
Created by [Daniel Rosehill](https://danielrosehill.com) ([GitHub](https://github.com/danielrosehill)) to explore practical solutions for voice technology in real-world parenting scenarios.
---
*This is a proof-of-concept demonstration showing AI-powered audio cleaning for practical
voice technology applications.*
""")
# Launch the Gradio server only when executed as a script (not on import).
if __name__ == "__main__":
    demo.launch()