seniruk/sinscribe-sinhala-stt
Updated • 14 • 1
How to use seniruk/whisper-small-si with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("automatic-speech-recognition", model="seniruk/whisper-small-si")

# Load model directly
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

processor = AutoProcessor.from_pretrained("seniruk/whisper-small-si")
model = AutoModelForSpeechSeq2Seq.from_pretrained("seniruk/whisper-small-si")
I'm an AI undergraduate and an AI enthusiast, working on machine learning projects and open-source contributions.
I enjoy exploring AI pipelines, natural language processing, and building tools that make development easier.
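This model is a fine-tuned version of openai/whisper-small on the Sinhala CSV + FLACs dataset. It achieves the following results on the evaluation set (final logged checkpoint, step 18000, in the training table below): validation loss 0.0829, WER 29.1387.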
Can be used for Sinhala speech-to-text conversion. For the best results, give the model audio with low background noise.
Trained on the custom dataset - seniruk/sinscribe-sinhala-stt
Trained on the final dataset above for 2 epochs, which took 41:00:59 (hours:minutes:seconds) on a device with the spec below.
| Training Loss | Epoch | Step | Validation Loss | Wer |
|---|---|---|---|---|
| 0.1871 | 0.1102 | 1000 | 0.1834 | 51.9170 |
| 0.1429 | 0.2204 | 2000 | 0.1517 | 44.7541 |
| 0.1345 | 0.3307 | 3000 | 0.1336 | 41.0627 |
| 0.1183 | 0.4409 | 4000 | 0.1237 | 38.6625 |
| 0.114 | 0.5511 | 5000 | 0.1151 | 36.9654 |
| 0.1056 | 0.6613 | 6000 | 0.1080 | 35.2670 |
| 0.0968 | 0.7715 | 7000 | 0.1037 | 34.4457 |
| 0.1011 | 0.8817 | 8000 | 0.0986 | 33.2741 |
| 0.0971 | 0.9920 | 9000 | 0.0961 | 32.7147 |
| 0.0713 | 1.1022 | 10000 | 0.0947 | 32.0250 |
| 0.0706 | 1.2124 | 11000 | 0.0940 | 32.0766 |
| 0.0691 | 1.3226 | 12000 | 0.0907 | 31.2485 |
| 0.0684 | 1.4328 | 13000 | 0.0893 | 30.9512 |
| 0.0718 | 1.5430 | 14000 | 0.0875 | 30.3592 |
| 0.0642 | 1.6533 | 15000 | 0.0859 | 30.0388 |
| 0.0667 | 1.7635 | 16000 | 0.0842 | 29.5840 |
| 0.0667 | 1.8737 | 17000 | 0.0835 | 29.3193 |
| 0.0677 | 1.9839 | 18000 | 0.0829 | 29.1387 |
import torch
import torchaudio
from transformers import pipeline

# Transformers pipelines take a CUDA device index, or -1 to run on the CPU.
device = 0 if torch.cuda.is_available() else -1

# Speech-recognition pipeline backed by the fine-tuned Sinhala Whisper checkpoint.
pipe = pipeline(
    "automatic-speech-recognition",
    model="seniruk/whisper-small-si",
    device=device,
)
def transcribe(audio_path):
    """Transcribe the audio file at *audio_path* to text.

    Args:
        audio_path: Path to an audio file that ``torchaudio.load`` can read,
            or ``None`` when no audio was provided.

    Returns:
        The ``"text"`` entry of the pipeline output, or a short notice when
        ``audio_path`` is ``None``.
    """
    if audio_path is None:
        return "No audio received. Please record something."

    waveform, sample_rate = torchaudio.load(audio_path)

    # Average the channels so multi-channel audio becomes a single channel.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Drop only the channel axis; a bare squeeze() would also collapse a
    # one-sample clip into a 0-d array.
    array = waveform.squeeze(0).numpy()

    result = pipe({"array": array, "sampling_rate": sample_rate})
    return result["text"]
if __name__ == "__main__":
    # Example usage: transcribe a local file and print the recognised text.
    # Guarded so that importing this module does not trigger inference.
    result = transcribe("audio.wav")
    print(result)
import torchaudio
from transformers import pipeline
import gradio as gr
import torch
# === Setup ===
# Use the first CUDA device when one is available; otherwise pass -1.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# Upper bound, in seconds, on the length of an audio clip.
MAX_DURATION_SECONDS = 30

# Load fine-tuned Whisper pipeline
pipe = pipeline("automatic-speech-recognition", model="seniruk/whisper-small-si", device=device)
def transcribe(audio_path):
    """Transcribe a recorded or uploaded audio file and return display text.

    Args:
        audio_path: Path to the audio file, or ``None`` when nothing was
            recorded or uploaded.

    Returns:
        The transcription text, or a human-readable message when no audio
        was supplied, the clip is longer than ``MAX_DURATION_SECONDS``, or
        loading/inference raised an exception.
    """
    # This guard cannot raise, so it stays outside the try block.
    if audio_path is None:
        return "No audio received. Please record or upload a file."

    try:
        # Load and prepare audio
        waveform, sample_rate = torchaudio.load(audio_path)

        # Convert to mono by averaging the channels
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # Duration check: number of samples divided by samples per second
        duration = waveform.shape[1] / sample_rate
        if duration > MAX_DURATION_SECONDS:
            return (
                f"Audio too long ({duration:.1f}s). "
                f"Please use a clip shorter than {MAX_DURATION_SECONDS}s."
            )

        # Drop only the channel axis; a bare squeeze() would also collapse a
        # one-sample clip into a 0-d array.
        array = waveform.squeeze(0).numpy()

        # Run inference
        result = pipe({"array": array, "sampling_rate": sample_rate})
        return result.get("text", "No transcription returned.")
    except Exception as e:
        # The return value is shown to the user, so report the failure as
        # text instead of letting the exception propagate.
        return f"Error during transcription: {e}"
# === Gradio Interface ===
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(
        sources=["microphone", "upload"],
        type="filepath",
        label="Record or Upload Audio",
    ),
    outputs=gr.Textbox(label="Transcription"),
    title="Whisper Small Sinhala (GPU/CPU)",
    # The limit is taken from MAX_DURATION_SECONDS so the text shown to users
    # cannot drift from the check performed in transcribe().
    description=(
        "Sinhala speech-to-text demo using a fine-tuned Whisper Small model (Sinscribe). "
        f"Supports microphone recording or file upload (max {MAX_DURATION_SECONDS} seconds)."
    ),
)
iface.launch()
Base model
openai/whisper-small