import json
import logging
from pathlib import Path

import sentencepiece as spm

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class TokenizerTrainer:
    def __init__(self):
        self.data_dir = Path('data/raw')
        self.output_dir = Path('outputs/tokenizer')
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # SentencePiece training configuration.
        self.vocab_size = 32000
        self.character_coverage = 0.9999
        self.model_type = "unigram"

        # Tokens that should survive as single pieces: common special tokens,
        # Bengali numerals, Python keywords, and comment delimiters.
        self.special_tokens = [
            "[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]",
            "<s>", "</s>", "<pad>", "<unk>", "<mask>",
            "২০", "১০", "৫০", "১৫", "২৫",
            "def", "class", "return", "if", "else", "for", "while",
            "print", "input", "import", "from", "try", "except",
            "#", "//", "/*", "*/", "'''", '"""'
        ]

    def prepare_training_data(self) -> str:
        """Prepare text data for tokenizer training"""
        logger.info("Preparing training data for tokenizer")

        try:
            with open(self.data_dir / 'processed_data.json', 'r', encoding='utf-8') as f:
                data = json.load(f)
        except FileNotFoundError:
            logger.error("Processed data file not found. Run data collection first.")
            raise

        # Write one sentence per line, the input format SentencePiece expects.
        train_file = self.output_dir / 'train.txt'
        with open(train_file, 'w', encoding='utf-8') as f:
            for item in data:
                text = item['text']

                # Split on the Bengali danda ('।'), the sentence-ending punctuation mark.
                sentences = text.split('।')
                for sentence in sentences:
                    sentence = sentence.strip()
                    if sentence:
                        f.write(sentence + '\n')

        logger.info("Training data prepared successfully")
        return str(train_file)

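    # Expected shape of data/raw/processed_data.json, inferred from the loop above
    # (a minimal illustration of the assumed schema, not a guaranteed format):
    #
    #     [
    #       {"text": "প্রথম বাক্য। দ্বিতীয় বাক্য।"},
    #       {"text": "def add(a, b):\n    return a + b"}
    #     ]
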
    def train_tokenizer(self, train_file: str):
        """Train the SentencePiece tokenizer"""
        logger.info("Starting tokenizer training")

        model_prefix = self.output_dir / "bengali_code"

        # <pad>/<unk>/<s>/</s> are already reserved through pad_id/unk_id/bos_id/eos_id,
        # so they must not be repeated in user_defined_symbols.
        reserved = {"<pad>", "<unk>", "<s>", "</s>"}
        user_symbols = [t for t in self.special_tokens if t not in reserved]

        params = {
            "--input": train_file,
            "--model_prefix": str(model_prefix),
            "--vocab_size": str(self.vocab_size),
            "--character_coverage": str(self.character_coverage),
            "--model_type": self.model_type,
            "--pad_id": 0,
            "--unk_id": 1,
            "--bos_id": 2,
            "--eos_id": 3,
            "--user_defined_symbols": ",".join(user_symbols),
            "--max_sentence_length": "4192",
            "--input_sentence_size": "5000000",
            "--shuffle_input_sentence": "true",
            "--normalization_rule_name": "identity"
        }

        # SentencePiece parses command-line style "--key=value" flags, so join each
        # option with '=' (integer values are stringified by the f-string).
        args = " ".join(f"{key}={value}" for key, value in params.items())

        try:
            spm.SentencePieceTrainer.train(args)
            logger.info("Tokenizer training completed successfully")

            self.create_huggingface_files(model_prefix)

        except Exception as e:
            logger.error(f"Failed to train tokenizer: {str(e)}")
            raise

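    # For reference, SentencePieceTrainer also accepts keyword arguments directly,
    # which avoids building a flag string. A sketch roughly equivalent to the call above:
    #
    #     spm.SentencePieceTrainer.train(
    #         input=train_file, model_prefix=str(model_prefix),
    #         vocab_size=32000, model_type="unigram", character_coverage=0.9999,
    #         pad_id=0, unk_id=1, bos_id=2, eos_id=3,
    #     )
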
    def create_huggingface_files(self, model_prefix: Path):
        """Create additional files needed for HuggingFace compatibility"""
        logger.info("Creating HuggingFace compatibility files")

        tokenizer_config = {
            "model_max_length": 2048,
            "padding_side": "right",
            "truncation_side": "right",
            "bos_token": "<s>",
            "eos_token": "</s>",
            "unk_token": "<unk>",
            "pad_token": "<pad>",
            "mask_token": "<mask>",
            "model_type": self.model_type,
            "vocab_size": self.vocab_size
        }

        with open(self.output_dir / "tokenizer_config.json", 'w', encoding='utf-8') as f:
            json.dump(tokenizer_config, f, ensure_ascii=False, indent=2)

        special_tokens_map = {
            "bos_token": "<s>",
            "eos_token": "</s>",
            "unk_token": "<unk>",
            "pad_token": "<pad>",
            "mask_token": "<mask>"
        }

        with open(self.output_dir / "special_tokens_map.json", 'w', encoding='utf-8') as f:
            json.dump(special_tokens_map, f, ensure_ascii=False, indent=2)

        logger.info("HuggingFace compatibility files created successfully")

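    # Note: these two JSON files alone do not make the tokenizer loadable through
    # transformers.AutoTokenizer; the trained bengali_code.model still has to be
    # wrapped by a SentencePiece-backed tokenizer class. A hedged sketch, assuming
    # the `transformers` package is installed and T5Tokenizer suits this unigram model:
    #
    #     from transformers import T5Tokenizer
    #     tok = T5Tokenizer("outputs/tokenizer/bengali_code.model", extra_ids=0)
    #     print(tok.tokenize("print('hello')"))
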
    def train(self):
        """Main method to train the tokenizer"""
        try:
            train_file = self.prepare_training_data()

            self.train_tokenizer(train_file)

            # The intermediate sentence-per-line file is only needed during training.
            if Path(train_file).exists():
                Path(train_file).unlink()

            logger.info("Tokenizer training pipeline completed successfully")

        except Exception as e:
            logger.error(f"Tokenizer training pipeline failed: {str(e)}")
            raise


if __name__ == "__main__":
    trainer = TokenizerTrainer()
    trainer.train()
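
# Quick smoke test for the trained tokenizer (a minimal sketch; the path assumes the
# default output_dir and model_prefix used above):
#
#     import sentencepiece as spm
#     sp = spm.SentencePieceProcessor(model_file='outputs/tokenizer/bengali_code.model')
#     print(sp.encode('print("হ্যালো")', out_type=str))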