Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| from PIL import Image | |
| import os | |
| import easyocr | |
| import numpy as np | |
| import fitz # PyMuPDF | |
| import io | |
| from pdf2image import convert_from_bytes | |
| #from st_btn_group import st_btn_group | |
| #from streamlit_option_menu import option_menu | |
| import docx | |
| from docx.shared import Pt | |
| from io import BytesIO | |
| #import streamlit.components.v1 as components | |
| import base64 | |
| #def downloadTxt(): | |
def generateTxtLink(result):
    """Build an HTML download link that carries the OCR text as a .txt file.

    Args:
        result: Iterable of OCR paragraphs; the text of each paragraph is
            read from index 1 of the item.

    Returns:
        An ``<a class='button'>`` tag (as a string) whose ``href`` is a
        base64-encoded ``data:text/plain`` URI and whose ``download``
        attribute is ``document.txt``.
    """
    # One paragraph per line, each terminated by a newline. Joined in a
    # single pass instead of repeated string concatenation.
    result_txt = "".join(para[1] + "\n" for para in result)
    result_b64 = base64.b64encode(result_txt.encode()).decode('utf-8')
    return (
        "<a class='button' href='data:text/plain;base64,"
        + result_b64
        + "' download='document.txt'>TXT</a>"
    )
def generateMultiPageTxtLink(result):
    """Build an HTML download link that carries multi-page text as a .txt file.

    Args:
        result: Iterable of strings, one per page.

    Returns:
        An ``<a class='button'>`` tag (as a string) whose ``href`` is a
        base64-encoded ``data:text/plain`` URI and whose ``download``
        attribute is ``document.txt``.
    """
    # Each page's text is followed by a newline. Joined in a single pass
    # instead of repeated string concatenation.
    result_txt = "".join(page_text + "\n" for page_text in result)
    result_b64 = base64.b64encode(result_txt.encode()).decode('utf-8')
    return (
        "<a class='button' href='data:text/plain;base64,"
        + result_b64
        + "' download='document.txt'>TXT</a>"
    )
def generateDocLink(result):
    """Build an HTML download link that carries the OCR text as a .docx file.

    Args:
        result: Iterable of OCR paragraphs; the text of each paragraph is
            read from index 1 of the item and becomes one document paragraph.

    Returns:
        An ``<a class='button'>`` tag (as a string) whose ``href`` is a
        base64-encoded data URI of the generated document and whose
        ``download`` attribute is ``document.docx``.
    """
    doc = docx.Document()
    for para in result:
        doc.add_paragraph(para[1])

    # Serialize the document into memory so it can be embedded in the link.
    target_stream = BytesIO()
    doc.save(target_stream)
    base64_doc = base64.b64encode(target_stream.getvalue()).decode('utf-8')

    # The payload is a Word document, so declare the DOCX media type rather
    # than application/pdf.
    return (
        "<a class='button' href='data:application/"
        "vnd.openxmlformats-officedocument.wordprocessingml.document;base64,"
        + base64_doc
        + "' download='document.docx'>DOCX</a>"
    )
def generateMultiPageDocLink(pages_result):
    """Build an HTML download link that carries multi-page text as a .docx file.

    Args:
        pages_result: Iterable of strings, one per page. Each string is split
            on ``"\\n"`` and every piece becomes one document paragraph.

    Returns:
        An ``<a class='button'>`` tag (as a string) whose ``href`` is a
        base64-encoded data URI of the generated document and whose
        ``download`` attribute is ``document.docx``.
    """
    doc = docx.Document()
    for index, page in enumerate(pages_result):
        # Separate pages with a break, but do not append one after the last
        # page, which would leave a trailing blank page in the document.
        if index:
            doc.add_page_break()
        for para in page.split("\n"):
            doc.add_paragraph(para)

    # Serialize the document into memory so it can be embedded in the link.
    target_stream = BytesIO()
    doc.save(target_stream)
    base64_doc = base64.b64encode(target_stream.getvalue()).decode('utf-8')

    # The payload is a Word document, so declare the DOCX media type rather
    # than application/pdf.
    return (
        "<a class='button' href='data:application/"
        "vnd.openxmlformats-officedocument.wordprocessingml.document;base64,"
        + base64_doc
        + "' download='document.docx'>DOCX</a>"
    )
def generateButtonGroup(result):
    """Return the TXT and DOCX download links for a single-image OCR result.

    Args:
        result: OCR paragraphs, passed unchanged to ``generateTxtLink`` and
            ``generateDocLink``.

    Returns:
        The two HTML links joined by a newline, TXT first.
    """
    links = (generateTxtLink(result), generateDocLink(result))
    return "\n".join(links)
def generateButtonGroupForPDF(pages_result):
    """Return the TXT and DOCX download links for a multi-page OCR result.

    Args:
        pages_result: Per-page text strings, passed unchanged to
            ``generateMultiPageTxtLink`` and ``generateMultiPageDocLink``.

    Returns:
        The two HTML links joined by a newline, TXT first.
    """
    txt_anchor = generateMultiPageTxtLink(pages_result)
    docx_anchor = generateMultiPageDocLink(pages_result)
    return f"{txt_anchor}\n{docx_anchor}"
def local_css(file_name):
    """Inject the contents of a local CSS file into the Streamlit page.

    Args:
        file_name: Path of the stylesheet to read.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Read as UTF-8 explicitly so the result does not depend on the host's
    # default locale encoding.
    with open(file_name, encoding="utf-8") as f:
        css = f.read()
    st.markdown(f'<style>{css}</style>', unsafe_allow_html=True)
# Working directories used by the app.
models_dir = "./models"
output_dir = "./output"
dirs = [models_dir, output_dir]
for d in dirs:
    # Create every listed directory. Previously the loop checked and created
    # only output_dir on each iteration, so models_dir was never created.
    os.makedirs(d, exist_ok=True)
font_path = models_dir + "/Ubuntu-Regular.ttf"
# Streamlit re-executes this script on every user interaction, so a bare
# module-level constructor call would rebuild the OCR model on each rerun.
# st.cache_resource keeps a single Reader instance for the server process.
# show_spinner=False keeps this call from rendering anything before
# st.set_page_config runs later in the script.
@st.cache_resource(show_spinner=False)
def _load_reader():
    """Create the easyocr Reader used by the whole app."""
    return easyocr.Reader(
        ['en'],
        gpu=True,
        recog_network='best_norm_ED',
        detect_network="craft",
        user_network_directory=models_dir,
        model_storage_directory=models_dir,
    )


reader = _load_reader()
# Page setup: wide layout and the browser tab title.
st.set_page_config(layout="wide",page_title="Қазақша OCR, суреттегі текстті тану")
# Inject the app stylesheet from app.css.
local_css("app.css")
#st.markdown("<a class='button' href='lenta.ru'>DOCX жүктеп ал</a>",unsafe_allow_html=True)
# main title
st.title("Сурет немесе пдф файлдан текст алу")
# subtitle
#st.markdown("## Qazaq OCR")
# Upload widget restricted to PNG/JPEG images and PDF files.
# NOTE(review): help="aaa" looks like placeholder text - confirm the intended tooltip.
uploaded_file = st.file_uploader("Өз файлыңызды осында жүктеңіз ('png', 'jpeg', 'jpg', 'pdf')",help="aaa", type=['png', 'jpeg', 'jpg', 'pdf'])
# Two side-by-side columns: col1 shows the uploaded image / page previews,
# col2 shows progress, download links and the recognized text.
col1, col2 = st.columns(2)
| #def process_page(page): | |
| # image_matrix = fitz.Matrix(fitz.Identity) | |
| # pixmap = page.get_pixmap(matrix=image_matrix, dpi=300) | |
| # image_data = pixmap.samples# This is a bytes object | |
| # image = Image.from("RGB",(pixmap.width, pixmap.height),image_data) | |
| # image = Image.from("RGB", (pixmap.width, pixmap.height), image_data) | |
| # result = reader.readtext(np.array(image),paragraph=True) | |
| # return image, result | |
import time
# Upper bound on the number of PDF pages that process_pdf renders and OCRs.
max_page = 3
def recognize_page_image(image):
    """Run OCR on one page image and measure how long it took.

    Args:
        image: The page image; it is converted with ``np.array`` before
            being passed to the module-level ``reader``.

    Returns:
        A ``(result, seconds)`` tuple: the value returned by
        ``reader.readtext(..., paragraph=True)`` and the elapsed time in
        seconds as a float.
    """
    # perf_counter is a monotonic clock, so the measured interval is not
    # affected by system clock adjustments.
    start = time.perf_counter()
    result = reader.readtext(np.array(image), paragraph=True)
    return result, time.perf_counter() - start
def process_pdf(uploaded_file):
    """Render, OCR and display the first ``max_page`` pages of an uploaded PDF.

    For every processed page a downscaled preview is shown in its own tab in
    ``col1`` and the recognized text is shown in an expander in ``col2``.
    When all pages are done, TXT/DOCX download links are written above the
    expanders.

    Args:
        uploaded_file: The uploaded PDF as a file-like object providing
            ``getvalue()`` (the object returned by ``st.file_uploader``).
    """
    # Open the PDF straight from the uploaded bytes. This avoids a shared
    # on-disk temp file (which concurrent sessions would overwrite) and makes
    # the function depend only on its argument. getvalue() returns the whole
    # buffer regardless of the current read position.
    pdf_bytes = uploaded_file.getvalue()
    pages_result = []

    # The context manager closes the document even if OCR raises.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as pdf_document:
        total_pages = len(pdf_document)
        page_count = min(total_pages, max_page)

        progress_bar = col2.progress(0, text="Жүктеліп жатыр")
        # Reserved above the expanders; filled with download links at the end.
        button_group = col2.container()
        tabs = col1.tabs([f"Бет {page_num+1}" for page_num in range(page_count)])

        for page_num in range(page_count):
            page = pdf_document.load_page(page_num)
            # Render at 300 dpi; no matrix is passed because get_pixmap
            # ignores it when dpi is given.
            pixmap = page.get_pixmap(dpi=300)
            image = Image.frombytes("RGB", (pixmap.width, pixmap.height), pixmap.samples)

            # Show a 1/10-scale preview; OCR runs on the full-size render.
            preview = image.resize((max(1, pixmap.width // 10), max(1, pixmap.height // 10)))
            tabs[page_num].image(preview)

            result, time_elapsed = recognize_page_image(image)
            result_text = "\n\n".join(item[1] for item in result)
            pages_result.append(result_text)

            # A page without recognized text yields an empty result, so
            # guard the first-paragraph preview used in the expander title.
            title_preview = result[0][1][:100] if result else ""
            expander = col2.expander(f'{title_preview} ... **:orange[{time_elapsed:.3f} секундта таңылды]**')
            # Show the whole page text, not only its first paragraph.
            expander.write(result_text)

            progress_bar.progress((page_num + 1) / page_count, text=f'Жүктеліп жатыр {page_num+1}/{page_count}')

    button_group_html = generateButtonGroupForPDF(pages_result)
    button_group.write(button_group_html, unsafe_allow_html=True)
    progress_bar.progress(0.99, text=f'{page_count} бет жүктелді')


# Entry point: dispatch on the uploaded file's MIME type.
if uploaded_file is not None:
    if uploaded_file.type == "application/pdf":
        placeholder = col2.empty()
        with placeholder, st.spinner('PDF өңделуде ...'):
            process_pdf(uploaded_file)
    else:
        placeholder = col2.empty()
        with placeholder, st.spinner('Сурет өңделуде ...'):
            # Normalize to RGB so palette, CMYK or alpha images reach the
            # OCR model as real colour values rather than palette indices
            # or an extra channel.
            image = Image.open(uploaded_file).convert("RGB")
            col1.image(image)
            result = reader.readtext(np.array(image), paragraph=True)
        result_text = "\n\n".join(item[1] for item in result)
        button_group_html = generateButtonGroup(result)
        col2.write(button_group_html, unsafe_allow_html=True)
        col2.markdown(result_text)