# Minimal Gradio demo: the persian_llama_7b LoRA adapter applied to Llama-2-7b
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer
from peft import PeftModel
import gradio as gr
# Load the tokenizer from the adapter repo; its vocabulary differs from the
# base model's, which is why the embeddings are resized below
tokenizer = LlamaTokenizer.from_pretrained("mostafaamiri/persian_llama_7b")
base_model = LlamaForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",
    load_in_8bit=False,
)
# Match the embedding matrix to the extended tokenizer vocabulary
base_model.resize_token_embeddings(len(tokenizer))
# Attach the LoRA adapter weights on top of the base model
model = PeftModel.from_pretrained(
    base_model,
    "mostafaamiri/persian_llama_7b",
)
prompt_input = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n\n{instruction}\n\n### Response:\n\n"
)
def generate_prompt(instruction, input_text=None):
    # Fold an optional input into the instruction, then fill the template
    if input_text:
        instruction = instruction + "\n" + input_text
    return prompt_input.format_map({"instruction": instruction})
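# For example, a hypothetical call generate_prompt("Summarize:", "some text")
# yields the template with "Summarize:\nsome text" in the Instruction slot.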
# Sampling parameters passed straight through to model.generate()
config = dict(
    temperature=0.2,
    top_k=40,
    top_p=0.9,
    do_sample=True,
    num_beams=1,
    repetition_penalty=1.2,
    max_new_tokens=300,
)
def launch_model(text):
    prompt = generate_prompt(text)  # pass the raw string, not a list
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, **config)
    # Decode only the newly generated tokens, so the echoed prompt never
    # has to be stripped back out of the output text
    generated = outputs[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(generated, skip_special_tokens=True)
iface = gr.Interface(fn=launch_model, inputs="text", outputs="text")
iface.launch()
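# Note: iface.launch(share=True) would additionally expose a temporary public
# URL, which is handy when running in a hosted notebook environment.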