ali1001 commited on
Commit
2f8cbf8
·
verified ·
1 Parent(s): 6ca13fa

Upload 12 files

Browse files
app.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import html
import os

import streamlit as st

from modules.pdf_loader import load_pdf
from modules.vectorstore import create_vectorstore
from modules.llm_model import load_llm_pipeline
from modules.qa_chain import create_qa_chain

# Set Hugging Face token from Streamlit secrets (empty string if unset).
os.environ["HUGGINGFACEHUB_API_TOKEN"] = st.secrets.get("HF_TOKEN", "")

st.set_page_config(page_title="Smart Business Report Assistant", layout="centered")
st.title("📊 Smart Business Report Assistant")

uploaded_files = st.file_uploader(
    "📎 Upload one or more PDF reports/invoices",
    type=["pdf"],
    accept_multiple_files=True,
)

if uploaded_files:
    # Streamlit reruns this whole script on every widget interaction,
    # including each question typed below.  Rebuilding the vectorstore and
    # reloading the LLM on every rerun is extremely slow, so cache the QA
    # chain in session_state, keyed by the uploaded files' names and sizes.
    upload_key = tuple((f.name, f.size) for f in uploaded_files)
    if st.session_state.get("upload_key") != upload_key:
        with st.spinner("🔄 Processing PDFs..."):
            all_docs = []
            for file in uploaded_files:
                all_docs.extend(load_pdf(file))

            vectorstore = create_vectorstore(all_docs)
            llm = load_llm_pipeline()
            st.session_state["qa_chain"] = create_qa_chain(llm, vectorstore)
            st.session_state["upload_key"] = upload_key

    qa_chain = st.session_state["qa_chain"]

    st.success("✅ Ready! Ask your questions below.")
    query = st.text_input("❓ Ask a question about the uploaded PDF(s)")

    if query:
        with st.spinner("💬 Thinking..."):
            try:
                result = qa_chain.invoke({"query": query})
                answer = result.get("result", "❌ No answer found. Try a different question.")
            except Exception as e:
                answer = f"⚠️ Error: {str(e)}"

        st.markdown("### 💡 Answer")
        # Escape the model output before embedding it in raw HTML:
        # with unsafe_allow_html=True, any markup in the LLM answer (or in
        # an exception message) would otherwise be rendered/injected.
        st.markdown(
            f"""
            <div style='background-color: #1e1e1e; padding: 12px; border-radius: 8px; color: white; font-size: 16px;'>
            {html.escape(answer)}
            </div>
            """,
            unsafe_allow_html=True,
        )
modules/__init__.py ADDED
File without changes
modules/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (171 Bytes). View file
 
modules/__pycache__/llm_model.cpython-313.pyc ADDED
Binary file (853 Bytes). View file
 
modules/__pycache__/pdf_loader.cpython-313.pyc ADDED
Binary file (1.07 kB). View file
 
modules/__pycache__/qa_chain.cpython-313.pyc ADDED
Binary file (1.1 kB). View file
 
modules/__pycache__/vectorstore.cpython-313.pyc ADDED
Binary file (567 Bytes). View file
 
modules/llm_model.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
# langchain_community is the current home of HuggingFacePipeline; the old
# langchain.llms path is deprecated, and the rest of this project already
# imports from langchain_community.
from langchain_community.llms import HuggingFacePipeline


def load_llm_pipeline():
    """Load the instruction-tuned seq2seq model and wrap it for LangChain.

    Returns:
        HuggingFacePipeline: a LangChain LLM backed by a local
        text2text-generation pipeline for "declare-lab/flan-alpaca-large".
    """
    model_id = "declare-lab/flan-alpaca-large"  # instruction-tuned; better formatting
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
    pipe = pipeline(
        "text2text-generation",
        model=model,
        tokenizer=tokenizer,
        # max_new_tokens bounds only the *generated* text; the original
        # max_length=512 also counted the prompt, silently truncating
        # answers when the stuffed retrieval context was long.
        max_new_tokens=512,
        do_sample=True,
        temperature=0.5,
    )
    return HuggingFacePipeline(pipeline=pipe)
modules/pdf_loader.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tempfile
import os


def load_pdf(uploaded_file):
    """Persist a Streamlit upload to disk, parse it, and split it into chunks.

    PyPDFLoader only accepts a filesystem path, so the in-memory upload is
    first written to a temporary .pdf file; the file is always removed once
    parsing is done.
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as handle:
        handle.write(uploaded_file.read())
        pdf_path = handle.name

    try:
        pages = PyPDFLoader(pdf_path).load()
        chunker = RecursiveCharacterTextSplitter(
            chunk_size=1000,     # roughly 200-300 tokens per chunk
            chunk_overlap=200,   # overlap preserves context across boundaries
        )
        return chunker.split_documents(pages)
    finally:
        # Always delete the temp file, even if PDF parsing raised.
        if os.path.exists(pdf_path):
            os.remove(pdf_path)
modules/qa_chain.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate


def create_qa_chain(llm, vectorstore):
    """Build a RetrievalQA chain that answers questions from *vectorstore*.

    The "stuff" chain type concatenates every retrieved chunk into a single
    prompt before calling *llm*.
    """
    template = """
    You are an AI assistant helping users analyze multiple PDFs (such as resumes, reports, invoices).
    When answering questions, always speak from the user's perspective — say "your resume", not "my resume".

    Be concise, polite, and answer in bullet points or short structured text.

    Context:
    {context}

    Question:
    {question}

    Answer:
    """

    qa_prompt = PromptTemplate(
        template=template,
        input_variables=["context", "question"],
    )

    return RetrievalQA.from_chain_type(
        llm=llm,
        retriever=vectorstore.as_retriever(),
        chain_type="stuff",
        chain_type_kwargs={"prompt": qa_prompt},
    )
modules/vectorstore.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceInstructEmbeddings


def create_vectorstore(pages):
    """Embed *pages* with the instructor-base model and index them in FAISS.

    NOTE(review): HuggingFaceInstructEmbeddings needs the InstructorEmbedding
    and sentence-transformers packages at runtime — confirm both are listed
    in requirements.txt.
    """
    embedder = HuggingFaceInstructEmbeddings(model_name="hkunlp/instructor-base")
    return FAISS.from_documents(pages, embedder)
requirements.txt CHANGED
@@ -1,3 +1,8 @@
1
- altair
2
- pandas
3
- streamlit
 
 
 
 
 
 
1
streamlit
langchain
langchain-community
faiss-cpu
transformers
torch
huggingface-hub
pypdf
InstructorEmbedding
sentence-transformers