# scripts/tokenizer_trainer.py
import json
from pathlib import Path
import sentencepiece as spm
import logging
import shutil
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
class TokenizerTrainer:
def __init__(self):
self.data_dir = Path('data/raw')
self.output_dir = Path('outputs/tokenizer')
self.output_dir.mkdir(parents=True, exist_ok=True)
# Tokenizer configuration
self.vocab_size = 32000
self.character_coverage = 0.9999
self.model_type = "unigram"
        # Extra symbols that SentencePiece should keep as whole pieces.
        # <s>, </s>, <pad> and <unk> are reserved as control pieces via
        # bos_id/eos_id/pad_id/unk_id during training, so they are not repeated
        # here (SentencePiece rejects user-defined symbols that duplicate them).
        self.special_tokens = [
            "[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "<mask>",
            "২০", "১০", "৫০", "১৫", "২৫",  # Common Bengali numerals
            "def", "class", "return", "if", "else", "for", "while",  # Code keywords
            "print", "input", "import", "from", "try", "except",
            "#", "//", "/*", "*/", "'''", '"""'  # Code comment markers
        ]
def prepare_training_data(self) -> str:
"""Prepare text data for tokenizer training"""
logger.info("Preparing training data for tokenizer")
# Load processed data
try:
with open(self.data_dir / 'processed_data.json', 'r', encoding='utf-8') as f:
data = json.load(f)
except FileNotFoundError:
logger.error("Processed data file not found. Run data collection first.")
raise
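        # Expected input schema (inferred from the loop below): a JSON list of
        # records that each carry a "text" field, e.g. [{"text": "..."}, ...]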
# Create temporary file for training
train_file = self.output_dir / 'train.txt'
        with open(train_file, 'w', encoding='utf-8') as f:
            for item in data:
                text = item.get('text', '')
                # Write one sentence per line: split on newlines first so code
                # samples are not emitted as a single very long line, then on
                # the Bengali full stop (danda)
                for line in text.split('\n'):
                    for sentence in line.split('।'):
                        sentence = sentence.strip()
                        if sentence:  # Skip empty sentences
                            f.write(sentence + '\n')
logger.info("Training data prepared successfully")
return str(train_file)
def train_tokenizer(self, train_file: str):
"""Train the SentencePiece tokenizer"""
logger.info("Starting tokenizer training")
# Prepare model prefix
model_prefix = self.output_dir / "bengali_code"
        # Training parameters, passed as keyword arguments to SentencePiece
        params = {
            "input": train_file,
            "model_prefix": str(model_prefix),
            "vocab_size": self.vocab_size,
            "character_coverage": self.character_coverage,
            "model_type": self.model_type,
            "pad_id": 0,
            "unk_id": 1,
            "bos_id": 2,
            "eos_id": 3,
            "user_defined_symbols": ",".join(self.special_tokens),
            "max_sentence_length": 4192,
            "input_sentence_size": 5000000,
            "shuffle_input_sentence": "true",
            "normalization_rule_name": "identity"  # Preserve original text
        }
        try:
            # Train the tokenizer; keyword arguments map directly to spm_train flags
            spm.SentencePieceTrainer.train(**params)
logger.info("Tokenizer training completed successfully")
# Create config files for HuggingFace compatibility
self.create_huggingface_files(model_prefix)
except Exception as e:
logger.error(f"Failed to train tokenizer: {str(e)}")
raise
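    def verify_tokenizer(self, model_prefix: Path):
        """Optional sanity check (illustrative sketch, not part of the original pipeline).

        Loads the freshly trained SentencePiece model and logs how a mixed
        Bengali/code sample is segmented. It can be called manually after
        train_tokenizer if desired.
        """
        sp = spm.SentencePieceProcessor(model_file=f"{model_prefix}.model")
        sample = "def add(a, b): return a + b  # যোগফল ফেরত দেয়"
        logger.info(f"Sample segmentation: {sp.encode(sample, out_type=str)}")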
def create_huggingface_files(self, model_prefix: Path):
"""Create additional files needed for HuggingFace compatibility"""
logger.info("Creating HuggingFace compatibility files")
# Create tokenizer config
tokenizer_config = {
"model_max_length": 2048,
"padding_side": "right",
"truncation_side": "right",
"bos_token": "<s>",
"eos_token": "</s>",
"unk_token": "<unk>",
"pad_token": "<pad>",
"mask_token": "<mask>",
"model_type": self.model_type,
"vocab_size": self.vocab_size
}
with open(self.output_dir / "tokenizer_config.json", 'w', encoding='utf-8') as f:
json.dump(tokenizer_config, f, ensure_ascii=False, indent=2)
# Create special tokens map
special_tokens_map = {
"bos_token": "<s>",
"eos_token": "</s>",
"unk_token": "<unk>",
"pad_token": "<pad>",
"mask_token": "<mask>"
}
with open(self.output_dir / "special_tokens_map.json", 'w', encoding='utf-8') as f:
json.dump(special_tokens_map, f, ensure_ascii=False, indent=2)
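        # Optionally place the trained SentencePiece model next to the JSON
        # configs. The target name "tokenizer.model" is an assumption about the
        # downstream loader; adjust or drop this step as needed.
        model_file = Path(f"{model_prefix}.model")
        if model_file.exists():
            shutil.copy(model_file, self.output_dir / "tokenizer.model")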
logger.info("HuggingFace compatibility files created successfully")
def train(self):
"""Main method to train the tokenizer"""
try:
# Prepare training data
train_file = self.prepare_training_data()
# Train tokenizer
self.train_tokenizer(train_file)
# Clean up temporary files
if Path(train_file).exists():
Path(train_file).unlink()
logger.info("Tokenizer training pipeline completed successfully")
except Exception as e:
logger.error(f"Tokenizer training pipeline failed: {str(e)}")
raise
if __name__ == "__main__":
trainer = TokenizerTrainer()
trainer.train()
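    # Optional quick check after a successful run (illustrative only; assumes
    # the trained model ended up at outputs/tokenizer/bengali_code.model):
    #   sp = spm.SentencePieceProcessor(model_file="outputs/tokenizer/bengali_code.model")
    #   print(sp.encode("def add(a, b): return a + b", out_type=str))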