import re

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Optional 4-bit quantization config (currently disabled):
# nf4_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_compute_dtype=torch.bfloat16
# )

model_name = "raphael-lesmana/mamba2_370_latin3"

# Tokenizer: prepend BOS, but let generation produce EOS itself.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    add_bos_token=True,
    add_eos_token=False
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='auto',
    # quantization_config=nf4_config,
    use_cache=False,
)
model.to('cuda')


@spaces.GPU  # ZeroGPU: a GPU is attached for the duration of each call
def generate_response(prompt):
    # Wrap the user input in the Latin -> English prompt format used for fine-tuning.
    prompt = f"Latin: {prompt}\nEnglish: "
    encoded_input = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
    model_inputs = encoded_input.to('cuda')
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=100,
        min_new_tokens=1,
        do_sample=False,  # greedy decoding
        pad_token_id=tokenizer.eos_token_id
    )
    # Strip the prompt, the "English: " marker, the end-of-text token, and extra whitespace.
    decoded_output = tokenizer.batch_decode(generated_ids)
    output = decoded_output[0].replace(prompt, "")
    output = re.sub(r"English: ", "", output)
    output = re.sub(r"<\|endoftext\|>", "", output)
    output = re.sub(r"\s+", " ", output)
    return output


demo = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(label="Latin Input", max_length=400),
    outputs=gr.Textbox(label="English Output"),
    submit_btn="Translate",
    flagging_mode="never"
)
demo.launch()