import difflib
import html
import json
from pathlib import Path

import gradio as gr
| |
|
| | |
| | with open("audio_samples/stt_comparison_results.json", "r") as f: |
| | stt_results = json.load(f) |
| |
|
| | |
| | original_transcript = stt_results["original"]["text"] |
| | enhanced_transcript = stt_results["enhanced"]["text"] |
| |
|
def generate_diff_html(text1, text2):
    """Generate HTML markup showing word-level differences between two texts.

    Words present only in ``text1`` are rendered red with strikethrough;
    words present only in ``text2`` are rendered with a green highlight.
    All transcript text is HTML-escaped so characters like ``<`` or ``&``
    cannot break (or inject into) the surrounding markup.

    Args:
        text1: The "before" text (e.g. the original transcript).
        text2: The "after" text (e.g. the enhanced transcript).

    Returns:
        An HTML string with ``<span>`` markup around the differing words.
    """
    words1 = text1.split(' ')
    words2 = text2.split(' ')

    # Compare at word granularity so highlights align with whole words.
    sm = difflib.SequenceMatcher(None, words1, words2)

    html_output = []
    for tag, i1, i2, j1, j2 in sm.get_opcodes():
        if tag == 'equal':
            html_output.append(html.escape(' '.join(words1[i1:i2])))
        elif tag == 'delete':
            deleted_text = html.escape(' '.join(words1[i1:i2]))
            html_output.append(f'<span style="background-color: #ffcccc; text-decoration: line-through;">{deleted_text}</span>')
        elif tag == 'insert':
            inserted_text = html.escape(' '.join(words2[j1:j2]))
            html_output.append(f'<span style="background-color: #ccffcc;">{inserted_text}</span>')
        elif tag == 'replace':
            deleted_text = html.escape(' '.join(words1[i1:i2]))
            inserted_text = html.escape(' '.join(words2[j1:j2]))
            html_output.append(f'<span style="background-color: #ffcccc; text-decoration: line-through;">{deleted_text}</span>')
            # Keep the removed and added spans visually separated.
            html_output.append(' ')
            html_output.append(f'<span style="background-color: #ccffcc;">{inserted_text}</span>')

        # Re-insert the separator that split(' ') consumed between segments.
        # Always appending one space keeps adjacent segments separated
        # (including an insert that follows the final equal run); the one
        # trailing space is removed by strip() below.
        html_output.append(' ')

    return ''.join(html_output).strip()
| |
|
| | |
# The transcripts are static, so compute the diff once at startup rather
# than per request.
diff_html = generate_diff_html(original_transcript, enhanced_transcript)

# Demo layout: intro text, side-by-side original vs. enhanced audio with
# their Whisper transcripts, then a highlighted transcript diff and notes.
with gr.Blocks(title="Baby Noise Cancellation Demo") as demo:
    gr.Markdown("""
    # Baby Noise Cancellation Demo

    This demo showcases the effectiveness of AI-powered noise removal for cleaning audio recordings
    with baby crying in the background. The goal is to produce clean enough audio for successful
    speech-to-text (STT) transcription.

    ## Use Case
    Parents using voice technology often face challenges when children start fussing during dictation.
    This demo tests whether deep learning models can remove baby noise while preserving speech quality
    for accurate STT transcription.
    """)

    with gr.Row():
        # Left column: the raw recording and its transcript.
        with gr.Column():
            gr.Markdown("### Original Audio (with baby crying)")
            original_audio = gr.Audio(
                value="audio_samples/original-note.mp3",
                label="Original Recording",
                type="filepath"
            )
            gr.Markdown("#### Whisper STT Transcript (Original)")
            original_text = gr.Textbox(
                value=original_transcript,
                label="Transcript",
                lines=10,
                max_lines=15
            )

        # Right column: the noise-cancelled recording and its transcript.
        with gr.Column():
            gr.Markdown("### DeepFilterNet2 Processed Audio")
            enhanced_audio = gr.Audio(
                value="audio_samples/enhanced_output.mp3",
                label="Noise-Cancelled Recording",
                type="filepath"
            )
            gr.Markdown("#### Whisper STT Transcript (Enhanced)")
            enhanced_text = gr.Textbox(
                value=enhanced_transcript,
                label="Transcript",
                lines=10,
                max_lines=15
            )

    gr.Markdown("""
    ## Results

    Compare the two audio samples and their transcripts:

    - **Left**: Original recording with baby crying in the background
    - **Right**: AI-processed audio with baby noise significantly reduced

    The transcripts show that both versions are successfully transcribed by Whisper,
    but the enhanced version provides clearer audio quality with reduced background noise.

    ### Transcript Differences

    Below is a word-level comparison of the two transcripts:
    - <span style="background-color: #ffcccc; text-decoration: line-through;">Red with strikethrough</span>: Text in original transcript only
    - <span style="background-color: #ccffcc;">Green highlight</span>: Text in enhanced transcript only
    """)

    # Render the pre-computed diff inside a styled container; pre-wrap
    # preserves the transcript's spacing while still wrapping long lines.
    gr.HTML(value=f'<div style="padding: 15px; border: 1px solid #ddd; border-radius: 5px; background-color: #f9f9f9; font-family: monospace; line-height: 1.8; white-space: pre-wrap;">{diff_html}</div>', label="Transcript Comparison")

    gr.Markdown("""
    ## Technology

    This demonstration uses **DeepFilterNet2** for audio noise removal, processing the audio to isolate
    and preserve speech while suppressing baby crying frequencies.

    - **DeepFilterNet2**: Official repository at [Rikorose/DeepFilterNet](https://github.com/Rikorose/DeepFilterNet)
    - **Processing Space**: Audio processed using [drewThomasson/DeepFilterNet2_no_limit](https://huggingface.co/spaces/drewThomasson/DeepFilterNet2_no_limit)

    ### Citation

    ```bibtex
    @inproceedings{schroeter2022deepfilternet2,
      title = {{DeepFilterNet2}: Towards Real-Time Speech Enhancement on Embedded Devices for Full-Band Audio},
      author = {Schröter, Hendrik and Escalante-B., Alberto N. and Rosenkranz, Tobias and Maier, Andreas},
      booktitle={17th International Workshop on Acoustic Signal Enhancement (IWAENC 2022)},
      year = {2022},
    }
    ```

    ## About

    **Audio recorded by**: Daniel Rosehill - October 28th, 2025

    Created by [Daniel Rosehill](https://danielrosehill.com) ([GitHub](https://github.com/danielrosehill)) to explore practical solutions for voice technology in real-world parenting scenarios.

    ---

    *This is a proof-of-concept demonstration showing AI-powered audio cleaning for practical
    voice technology applications.*
    """)

# Only start the web server when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()
| |
|