aayushraina
/

bpe-hindi

Text Generation

text-processing

text2text-generation

Model card Files Files and versions

bpe-hindi / app.py

aayushraina's picture

Upload 4 files

30325e1 verified over 1 year ago

history blame contribute delete

4.22 kB

	import gradio as gr
	from huggingface_hub import snapshot_download
	from hindi_bpe import HindiBPE, preprocess_hindi_text
	import pandas as pd
	import plotly.express as px
	import os

	# Fetch the tokenizer's JSON files from the Hub on first run only; an
	# existing local "tokenizer" path is taken as proof they are already there.
	tokenizer_present = os.path.exists("tokenizer")
	if not tokenizer_present:
	    snapshot_download(repo_id="aayushraina/bpe-hindi", local_dir="tokenizer", allow_patterns=["*.json"])

	class TokenizerDemo:
	    """Callbacks that connect the Hindi BPE tokenizer to the Gradio UI.

	    The tokenizer is loaded once, at construction time, from the local
	    "tokenizer" directory via ``HindiBPE.load_tokenizer``.
	    """

	    def __init__(self):
	        self.tokenizer = HindiBPE.load_tokenizer("tokenizer")

	    def tokenize_text(self, text: str) -> tuple:
	        """Tokenize ``text`` and build a plot and summary statistics.

	        Args:
	            text: Raw user input from the text box.

	        Returns:
	            A 3-tuple ``(tokenized_text, figure, stats)``:
	            the tokens joined by single spaces, a Plotly scatter figure of
	            token length against position, and a newline-separated
	            statistics string. When the input is empty or whitespace-only,
	            or when the tokenizer yields no tokens, the first element is
	            ``""``, the figure is ``None`` and the third element is a
	            short message for the user.
	        """
	        # Treat whitespace-only input like empty input: it cannot produce
	        # meaningful tokens and would otherwise reach the divisions below.
	        if not text or not text.strip():
	            return "", None, "Please enter some text"

	        # Preprocess, then tokenize.
	        text = preprocess_hindi_text(text)
	        tokens = self.tokenizer.encode(text)

	        # Preprocessing may strip everything; without tokens the averages
	        # below would divide by zero.
	        if not tokens:
	            return "", None, "No tokens were produced for this text"

	        # Compute the per-token lengths once and reuse them for the plot
	        # and for the statistics.
	        token_lengths = [len(token) for token in tokens]
	        total_length = sum(token_lengths)

	        # Create visualization.
	        token_df = pd.DataFrame({
	            'Token': tokens,
	            'Length': token_lengths,
	        })
	        fig = px.scatter(
	            token_df,
	            x=range(len(tokens)),
	            y='Length',
	            hover_data=['Token'],
	            title='Token Lengths in Sequence',
	        )

	        # Calculate statistics. "Compression Ratio" is the preprocessed text
	        # length divided by the summed token lengths; it is reported as 0.0
	        # if every token is empty, to avoid dividing by zero.
	        stats = {
	            'Total Tokens': len(tokens),
	            'Unique Tokens': len(set(tokens)),
	            'Average Token Length': total_length / len(tokens),
	            'Compression Ratio': len(text) / total_length if total_length else 0.0,
	        }

	        # Floats are shown with two decimals, everything else as-is.
	        stats_str = "\n".join(
	            f"{k}: {v:.2f}" if isinstance(v, float) else f"{k}: {v}"
	            for k, v in stats.items()
	        )

	        return (
	            " ".join(tokens),  # Tokenized text
	            fig,               # Visualization
	            stats_str,         # Statistics
	        )

	    def decode_tokens(self, tokens_text: str) -> str:
	        """Decode whitespace-separated tokens back to text.

	        Args:
	            tokens_text: Tokens separated by whitespace, as produced by
	                ``tokenize_text``.

	        Returns:
	            The result of ``self.tokenizer.decode`` on the split tokens, or
	            a prompt message when there are no tokens to decode (empty,
	            ``None`` or whitespace-only input).
	        """
	        tokens = tokens_text.split() if tokens_text else []
	        if not tokens:
	            return "Please tokenize some text first"
	        return self.tokenizer.decode(tokens)

	# Build the Gradio UI: one shared TokenizerDemo instance backs both buttons.
	demo = TokenizerDemo()

	with gr.Blocks(title="Hindi BPE Tokenizer") as interface:
	    gr.Markdown("""
	# Hindi BPE Tokenizer Demo

	This demo showcases a Byte Pair Encoding (BPE) tokenizer specifically trained for Hindi text.
	Enter Hindi text to see how it gets tokenized and analyze the token distribution.

	[View model on Hugging Face](https://huggingface.co/aayushraina/bpe-hindi)
	""")

	    # Two side-by-side columns: raw input on the left, tokens on the right.
	    with gr.Row():
	        with gr.Column():
	            input_text = gr.Textbox(label="Input Hindi Text", placeholder="हिंदी में टेक्स्ट दर्ज करें...", lines=5)
	            tokenize_btn = gr.Button("Tokenize")

	        with gr.Column():
	            tokens_output = gr.Textbox(label="Tokenized Output", lines=5)
	            decode_btn = gr.Button("Decode")

	    # Result widgets, in the order they are created.
	    original_output = gr.Textbox(label="Decoded Text", lines=5)
	    stats_output = gr.Textbox(label="Tokenization Statistics", lines=4)
	    plot_output = gr.Plot(label="Token Length Distribution")

	    # "Tokenize" fills the token box, the plot and the statistics box.
	    tokenize_btn.click(fn=demo.tokenize_text, inputs=input_text, outputs=[tokens_output, plot_output, stats_output])

	    # "Decode" reads the token box and writes the decoded text.
	    decode_btn.click(fn=demo.decode_tokens, inputs=tokens_output, outputs=original_output)

	    # Sample sentences offered for the input box.
	    gr.Examples(
	        examples=[
	            ["हिंदी भाषा बहुत सुंदर है।"],
	            ["भारत एक विशाल देश है। यहाँ की संस्कृति बहुत पुरानी है।"],
	            ["मैं हिंदी में प्रोग्रामिंग सीख रहा हूं।"],
	        ],
	        inputs=input_text,
	    )

	# Start serving the app.
	interface.launch()