import streamlit as st
from annotated_text import annotated_text
import torch
from transformers import pipeline
from transformers import AutoModelForTokenClassification, AutoTokenizer
import spacy
from spacy.matcher import PhraseMatcher
import pysbd
import json

st.set_page_config(layout="wide")

model = AutoModelForTokenClassification.from_pretrained("./models/lusa_prepo", use_safetensors=True)
tokenizer = AutoTokenizer.from_pretrained("./models/lusa_prepo", model_max_length=512)
tagger = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy='first')  # aggregation_strategy='max'

nlp = spacy.load("en_core_web_sm")
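# Portuguese contractions (preposition + article/pronoun) and how each one is
# split, so that every piece can receive its own token-level label.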
tokenization_contractions = {
    "no": ["n", "o"],
    "na": ["n", "a"],
    "nos": ["n", "os"],
    "nas": ["n", "as"],
    "ao": ["a", "o"],
    # "à": ["a", "a"],
    "aos": ["a", "os"],
    # "às": ["a", "as"],
    "do": ["d", "o"],
    "da": ["d", "a"],
    "dos": ["d", "os"],
    "das": ["d", "as"],
    "pelo": ["pel", "o"],
    "pela": ["pel", "a"],
    "pelos": ["pel", "os"],
    "pelas": ["pel", "as"],
    "dum": ["d", "um"],
    "duma": ["d", "uma"],
    "duns": ["d", "uns"],
    "dumas": ["d", "umas"],
    "num": ["n", "um"],
    "numa": ["n", "uma"],
    "nuns": ["n", "uns"],
    "numas": ["n", "umas"],
    "dele": ["d", "ele"],
    "dela": ["d", "ela"],
    "deles": ["d", "eles"],
    "delas": ["d", "elas"],
    "deste": ["d", "este"],
    "desta": ["d", "esta"],
    "destes": ["d", "estes"],
    "destas": ["d", "estas"],
    "desse": ["d", "esse"],
    "dessa": ["d", "essa"],
    "desses": ["d", "esses"],
    "dessas": ["d", "essas"],
    "daquele": ["d", "aquele"],
    "daquela": ["d", "aquela"],
    "daqueles": ["d", "aqueles"],
    "daquelas": ["d", "aquelas"],
}
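# Split matched contractions into two tokens with spaCy's retokenizer, tagging
# the first piece as an adposition (ADP) and the second as a determiner (DET).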
def tokenize_contractions(doc, tokenization_contractions):
    words = tokenization_contractions.keys()  # contractions to be split
    splits = tokenization_contractions
    matcher = PhraseMatcher(nlp.vocab)
    patterns = [nlp.make_doc(text) for text in words]
    matcher.add("Terminology", patterns)
    matches = matcher(doc)
    with doc.retokenize() as retokenizer:
        for match_id, start, end in matches:
            heads = [(doc[start], 1), doc[start]]
            attrs = {"POS": ["ADP", "DET"], "DEP": ["pobj", "compound"]}
            orths = splits[doc[start:end].text]
            retokenizer.split(doc[start], orths=orths, heads=heads, attrs=attrs)
    return doc
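# Merge WordPiece subtokens (marked with a leading "##") back into whole words,
# keeping the label predicted for the first subtoken of each word.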
def aggregate_subwords(input_tokens, labels):
    new_inputs = []
    new_labels = []
    current_word = ""
    current_label = ""
    for i, token in enumerate(input_tokens):
        label = labels[i]
        # Handle subwords
        if token.startswith('##'):
            current_word += token[2:]
        else:
            # Finish previous word
            if current_word:
                new_inputs.append(current_word)
                new_labels.append(current_label)
            # Start new word
            current_word = token
            current_label = label
    new_inputs.append(current_word)
    new_labels.append(current_label)
    return new_inputs, new_labels
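# Annotate a single input line: spaCy tokenization plus contraction splitting,
# token classification with the fine-tuned model, and BIO post-processing into
# (text, tag, type) triples.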
def annotateTriggers(line):
    line = line.strip()
    doc = nlp(line)
    doc = tokenize_contractions(doc, tokenization_contractions)
    tokens = [token.text for token in doc]
    inputs = tokenizer(tokens, is_split_into_words=True, return_tensors="pt")
    input_tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    with torch.no_grad():
        logits = model(**inputs).logits
    predictions = torch.argmax(logits, dim=2)
    predicted_token_class = [model.config.id2label[t.item()] for t in predictions[0]]
    input_tokens, predicted_token_class = aggregate_subwords(input_tokens, predicted_token_class)
    # drop the special [CLS] and [SEP] tokens
    input_tokens = input_tokens[1:-1]
    predicted_token_class = predicted_token_class[1:-1]
    print(input_tokens)
    print(predicted_token_class)
    print(len(input_tokens), len(predicted_token_class))
    token_labels = []
    current_entity = ''
    for i, label in enumerate(predicted_token_class):
        token = input_tokens[i]
        if label == 'O':
            token_labels.append((token, 'O', ''))
            current_entity = ''
        elif label.startswith('B-'):
            current_entity = label[2:]
            token_labels.append((token, 'B', current_entity))
        elif label.startswith('I-'):
            if current_entity == '':
                #raise ValueError(f"Invalid label sequence: {predicted_token_class}")
                continue
            token_labels[-1] = (token_labels[-1][0] + f" {token}", 'I', current_entity)
        else:
            raise ValueError(f"Invalid label: {label}")
    return token_labels
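# Merge consecutive B-/I- predictions into single entity spans. Currently unused:
# the call in annotateEvents is commented out.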
def joinEntities(entities):
    joined_entities = []
    i = 0
    while i < len(entities):
        curr_entity = entities[i]
        if curr_entity['entity'][0] == 'B':
            label = curr_entity['entity'][2:]
            j = i + 1
            while j < len(entities) and entities[j]['entity'][0] == 'I':
                j += 1
            joined_entity = {
                'entity': label,
                'score': max(e['score'] for e in entities[i:j]),
                'index': min(e['index'] for e in entities[i:j]),
                'word': ' '.join(e['word'] for e in entities[i:j]),
                'start': entities[i]['start'],
                'end': entities[j-1]['end']
            }
            joined_entities.append(joined_entity)
            i = j - 1
        i += 1
    return joined_entities
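# Sentence segmentation (pysbd) used to build a context window of sentences
# around each detected trigger.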
seg = pysbd.Segmenter(language="es", clean=False)

def sent_tokenize(text):
    return seg.segment(text)
def getSentenceIndex(lines, span):
    # return the index of the sentence that contains character offset `span`
    i = 1
    total = len(lines[0])
    while total < span:
        total += len(lines[i])
        i = i + 1
    return i - 1
def generateContext(text, window, span):
    lines = sent_tokenize(text)
    index = getSentenceIndex(lines, span)
    text = " ".join(lines[max(0, index - window):index + window + 1])
    return text
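# Run the NER pipeline over the full text and package each detected trigger with
# its type, score and a sentence-level context window.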
def annotateEvents(text, squad, window):
    text = text.strip()
    ner_results = tagger(text)
    #print(ner_results)
    #ner_results = joinEntities(ner_results)
    #exit()
    for result in ner_results:
        # strip the BIO prefix ("B-"/"I-") from the label, if present
        label = result["entity_group"]
        if label.startswith("B-") or label.startswith("I-"):
            label = label[2:]
        result["entity"] = label
    events = []
    for trigger in ner_results:
        tipo = trigger["entity_group"]
        context = generateContext(text, window, trigger["start"])
        event = {
            "trigger": trigger["word"],
            "type": tipo,
            "score": trigger["score"],
            "context": context,
        }
        events.append(event)
    return events
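# Streamlit UI: example sentences, free-text input, annotated trigger display and
# a per-trigger summary line.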
| #"A Joana foi atacada pelo João nas ruas do Porto, com uma faca." | |
| st.title('Identify Events') | |
| options = ["Naquele ano o rei morreu na batalha em Almograve. A rainha casou com o irmão dele.","O presidente da Federação Haitiana de Futebol, Yves Jean-Bart, foi banido para sempre de toda a atividade ligada ao futebol, por ter sido considerado culpado de abuso sexual sistemático de jogadoras, anunciou hoje a FIFA.", | |
| "O navio 'Figaro', no qual viajavam 30 tripulantes - 16 angolanos, cinco espanhóis, cinco senegaleses, três peruanos e um do Gana - acionou por telefone o alarme de incêndio a bordo.", "A Polícia Judiciária (PJ) está a investigar o aparecimento de ossadas que foram hoje avistadas pelo proprietário de um terreno na freguesia de Meadela, em Viana do Castelo, disse à Lusa fonte daquela força policial."] | |
| option = st.selectbox( | |
| 'Select examples', | |
| options) | |
| #option = options [index] | |
| line = st.text_area("Insert Text",option) | |
| st.button('Run') | |
| window = 1 | |
| if line != "": | |
| st.header("Triggers:") | |
| triggerss = annotateTriggers(line) | |
| annotated_text(*[word[0]+" " if word[1] == 'O' else (word[0]+" ",word[2]) for word in triggerss ]) | |
| eventos_1 = annotateEvents(line,1,window) | |
| eventos_2 = annotateEvents(line,2,window) | |
| for mention1, mention2 in zip(eventos_1,eventos_2): | |
| st.text(f"| Trigger: {mention1['trigger']:20} | Type: {mention1['type']:10} | Score: {str(round(mention1['score'],3)):5} |") | |
| st.markdown("""---""") | |