kn29 committed on
Commit fef6ed9 · verified · 1 Parent(s): d5bcbe3

Update rag.py

Files changed (1)
  1. rag.py +159 -497
rag.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import torch
2
  import numpy as np
3
  from transformers import AutoTokenizer, AutoModel
@@ -13,14 +14,22 @@ import networkx as nx
13
  from collections import defaultdict
14
  import spacy
15
  from rank_bm25 import BM25Okapi
16
 
17
  # Global model instances (shared across sessions)
18
  _SHARED_MODEL = None
19
  _SHARED_TOKENIZER = None
20
  _SHARED_NLP_MODEL = None
21
  _DEVICE = None
 
22
 
23
- # Legal knowledge base (shared constants)
24
  LEGAL_CONCEPTS = {
25
  'liability': ['negligence', 'strict liability', 'vicarious liability', 'product liability'],
26
  'contract': ['breach', 'consideration', 'offer', 'acceptance', 'damages', 'specific performance'],
@@ -39,7 +48,7 @@ QUERY_PATTERNS = {
39
 
40
  def initialize_models(model_id: str, groq_api_key: str = None):
41
  """Initialize shared models (call once at startup)"""
42
- global _SHARED_MODEL, _SHARED_TOKENIZER, _SHARED_NLP_MODEL, _DEVICE
43
 
44
  try:
45
  nltk.download('punkt', quiet=True)
@@ -48,21 +57,24 @@ def initialize_models(model_id: str, groq_api_key: str = None):
48
  pass
49
 
50
  _DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
51
- print(f"Using device: {_DEVICE}")
52
 
53
- print(f"Loading model: {model_id}")
54
  _SHARED_TOKENIZER = AutoTokenizer.from_pretrained(model_id)
55
  _SHARED_MODEL = AutoModel.from_pretrained(model_id).to(_DEVICE)
56
  _SHARED_MODEL.eval()
57
 
58
  try:
59
  _SHARED_NLP_MODEL = spacy.load("en_core_web_sm")
60
  except:
61
- print("SpaCy model not found, using basic NER")
62
  _SHARED_NLP_MODEL = None
63
 
64
- class SessionRAG:
65
- """Session-specific RAG instance"""
66
 
67
  def __init__(self, session_id: str, groq_api_key: str = None):
68
  self.session_id = session_id
@@ -71,495 +83,209 @@ class SessionRAG:
71
  # Session-specific indices and data
72
  self.dense_index = None
73
  self.bm25_index = None
74
- self.concept_graph = None
75
  self.token_to_chunks = None
76
  self.chunks_data = []
77
 
78
  # Verify shared models are initialized
79
  if _SHARED_MODEL is None or _SHARED_TOKENIZER is None:
80
  raise ValueError("Models not initialized. Call initialize_models() first.")
81
 
82
- def create_embedding(self, text: str) -> np.ndarray:
83
- """Create dense embedding for text"""
84
- inputs = _SHARED_TOKENIZER(text, padding=True, truncation=True,
85
- max_length=512, return_tensors='pt').to(_DEVICE)
86
-
87
- with torch.no_grad():
88
- outputs = _SHARED_MODEL(**inputs)
89
- attention_mask = inputs['attention_mask']
90
- token_embeddings = outputs.last_hidden_state
91
- input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
92
- embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
93
-
94
- # Normalize embeddings
95
- embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
96
-
97
- return embeddings.cpu().numpy()[0]
98
-
99
  def load_existing_session_data(self, chunks_from_db: List[Dict[str, Any]]):
100
- """Load pre-existing chunks with embeddings from database"""
101
- print(f"Loading existing session data for {self.session_id}: {len(chunks_from_db)} chunks...")
 
102
 
103
- # Process chunks from MongoDB format
104
- self.chunks_data = self.process_db_chunks(chunks_from_db)
105
 
106
- # Rebuild indices from existing embeddings (don't recreate embeddings)
107
- self.rebuild_indices_from_existing_embeddings()
108
 
109
- print(f"Session {self.session_id} loaded with existing embeddings!")
 
110
 
111
- def rebuild_indices_from_existing_embeddings(self):
112
- """Rebuild search indices using existing embeddings from database"""
113
  if not self.chunks_data:
114
  raise ValueError("No chunks data available")
115
 
116
- print(f"Rebuilding indices from existing embeddings...")
 
117
 
118
- # Extract existing embeddings
119
  embeddings = []
120
  for chunk in self.chunks_data:
121
- if 'embedding' in chunk and chunk['embedding'] is not None:
122
- embeddings.append(chunk['embedding'])
123
- else:
124
  raise ValueError(f"Missing embedding for chunk {chunk.get('id', 'unknown')}")
125
 
126
- # Build FAISS index from existing embeddings
127
- embeddings_matrix = np.vstack(embeddings)
128
  self.dense_index = faiss.IndexFlatIP(embeddings_matrix.shape[1])
129
- self.dense_index.add(embeddings_matrix.astype('float32'))
130
 
131
- # Build other indices
132
  tokenized_corpus = [chunk['text'].lower().split() for chunk in self.chunks_data]
133
  self.bm25_index = BM25Okapi(tokenized_corpus)
134
 
135
- # 3. ColBERT-style token index
136
  self.token_to_chunks = defaultdict(set)
137
  for i, chunk in enumerate(self.chunks_data):
138
  tokens = chunk['text'].lower().split()
139
  for token in tokens:
140
  self.token_to_chunks[token].add(i)
141
 
142
- # 4. Legal concept graph
143
- self.concept_graph = nx.Graph()
144
- for i, chunk in enumerate(self.chunks_data):
145
- self.concept_graph.add_node(i, text=chunk['text'][:200], importance=chunk['importance_score'])
146
-
147
- for j, other_chunk in enumerate(self.chunks_data[i+1:], i+1):
148
- shared_entities = set(e['text'] for e in chunk['entities']) & \
149
- set(e['text'] for e in other_chunk['entities'])
150
- if shared_entities:
151
- self.concept_graph.add_edge(i, j, weight=len(shared_entities))
152
-
153
- print(f"All indices rebuilt from existing embeddings for session {self.session_id}!")
154
 
155
- def process_db_chunks(self, chunks_from_db: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
156
- """Convert MongoDB chunk format to internal format"""
157
- processed_chunks = []
158
- for chunk in chunks_from_db:
159
- # Convert embedding from list to numpy array if needed
160
- embedding = chunk.get('embedding')
161
- if embedding and isinstance(embedding, list):
162
- embedding = np.array(embedding)
163
-
164
- processed_chunk = {
165
- 'id': chunk.get('chunk_id', chunk.get('id')),
166
- 'text': chunk.get('content', chunk.get('text', '')),
167
- 'title': chunk.get('title', 'Document'),
168
- 'section_type': chunk.get('section_type', 'general'),
169
- 'importance_score': chunk.get('importance_score', 1.0),
170
- 'entities': chunk.get('entities', []),
171
- 'embedding': embedding
172
- }
173
- processed_chunks.append(processed_chunk)
174
 
175
- return processed_chunks
176
-
177
- def extract_legal_entities(self, text: str) -> List[Dict[str, Any]]:
178
- """Extract legal entities from text"""
179
- entities = []
180
-
181
- if _SHARED_NLP_MODEL:
182
- doc = _SHARED_NLP_MODEL(text[:5000]) # Limit for performance
183
- for ent in doc.ents:
184
- if ent.label_ in ['PERSON', 'ORG', 'LAW', 'GPE']:
185
- entities.append({
186
- 'text': ent.text,
187
- 'type': ent.label_,
188
- 'importance': 1.0
189
- })
190
-
191
- # Legal citations
192
- citation_pattern = r'\b\d+\s+[A-Z][a-z]+\.?\s+\d+\b'
193
- for match in re.finditer(citation_pattern, text):
194
- entities.append({
195
- 'text': match.group(),
196
- 'type': 'case_citation',
197
- 'importance': 2.0
198
- })
199
-
200
- # Statute references
201
- statute_pattern = r'§\s*\d+[\.\d]*|\bSection\s+\d+'
202
- for match in re.finditer(statute_pattern, text):
203
- entities.append({
204
- 'text': match.group(),
205
- 'type': 'statute',
206
- 'importance': 1.5
207
- })
208
-
209
- return entities
210
 
211
- def analyze_query(self, query: str) -> Dict[str, Any]:
212
- """Analyze query to understand intent"""
213
  query_lower = query.lower()
214
 
215
- # Classify query type
216
  query_type = 'general'
217
  for qtype, patterns in QUERY_PATTERNS.items():
218
  if any(pattern in query_lower for pattern in patterns):
219
  query_type = qtype
220
  break
221
 
222
- # Extract entities
223
- entities = self.extract_legal_entities(query)
224
-
225
- # Extract key concepts
226
  key_concepts = []
227
  for concept_category, concepts in LEGAL_CONCEPTS.items():
228
  for concept in concepts:
229
  if concept in query_lower:
230
  key_concepts.append(concept)
231
 
232
- # Generate expanded queries
233
  expanded_queries = [query]
234
-
235
- # Concept expansion
236
  if key_concepts:
237
- expanded_queries.append(f"{query} {' '.join(key_concepts[:3])}")
238
-
239
- # Type-based expansion
240
- if query_type == 'precedent':
241
- expanded_queries.append(f"legal precedent case law {query}")
242
- elif query_type == 'statute_interpretation':
243
- expanded_queries.append(f"statutory interpretation meaning {query}")
244
-
245
- # HyDE - Hypothetical document generation
246
- if self.groq_client:
247
- hyde_doc = self.generate_hypothetical_document(query)
248
- if hyde_doc:
249
- expanded_queries.append(hyde_doc)
250
 
251
  return {
252
  'original_query': query,
253
  'query_type': query_type,
254
- 'entities': entities,
255
  'key_concepts': key_concepts,
256
- 'expanded_queries': expanded_queries[:4] # Limit to 4 queries
257
  }
258
 
259
- def generate_hypothetical_document(self, query: str) -> Optional[str]:
260
- """Generate hypothetical answer document (HyDE technique)"""
261
- if not self.groq_client:
262
- return None
263
-
264
- try:
265
- prompt = f"""Generate a brief hypothetical legal document excerpt that would answer this question: {query}
266
-
267
- Write it as if it's from an actual legal case or statute. Be specific and use legal language.
268
- Keep it under 100 words."""
269
-
270
- response = self.groq_client.chat.completions.create(
271
- messages=[
272
- {"role": "system", "content": "You are a legal expert generating hypothetical legal text."},
273
- {"role": "user", "content": prompt}
274
- ],
275
- model="llama-3.1-8b-instant",
276
- temperature=0.3,
277
- max_tokens=150
278
- )
279
-
280
- return response.choices[0].message.content
281
- except:
282
- return None
283
-
284
- def chunk_text_hierarchical(self, text: str, title: str = "") -> List[Dict[str, Any]]:
285
- """Create hierarchical chunks with legal structure awareness"""
286
- chunks = []
287
-
288
- # Clean text
289
- text = re.sub(r'\s+', ' ', text)
290
-
291
- # Identify legal sections
292
- section_patterns = [
293
- (r'(?i)\bFACTS?\b[:\s]', 'facts'),
294
- (r'(?i)\bHOLDING\b[:\s]', 'holding'),
295
- (r'(?i)\bREASONING\b[:\s]', 'reasoning'),
296
- (r'(?i)\bDISSENT\b[:\s]', 'dissent'),
297
- (r'(?i)\bCONCLUSION\b[:\s]', 'conclusion')
298
- ]
299
-
300
- sections = []
301
- for pattern, section_type in section_patterns:
302
- matches = list(re.finditer(pattern, text))
303
- for match in matches:
304
- sections.append((match.start(), section_type))
305
-
306
- sections.sort(key=lambda x: x[0])
307
-
308
- # Split into sentences
309
- import nltk
310
- try:
311
- sentences = nltk.sent_tokenize(text)
312
- except:
313
- sentences = text.split('. ')
314
-
315
- # Create chunks
316
- current_section = 'introduction'
317
- section_sentences = []
318
- chunk_size = 500 # words
319
-
320
- for sent in sentences:
321
- # Check section type
322
- sent_pos = text.find(sent)
323
- for pos, stype in sections:
324
- if sent_pos >= pos:
325
- current_section = stype
326
-
327
- section_sentences.append(sent)
328
-
329
- # Create chunk when we have enough content
330
- chunk_text = ' '.join(section_sentences)
331
- if len(chunk_text.split()) >= chunk_size or len(section_sentences) >= 10:
332
- chunk_id = hashlib.md5(f"{title}_{len(chunks)}_{chunk_text[:50]}".encode()).hexdigest()[:12]
333
-
334
- # Calculate importance
335
- importance = 1.0
336
- section_weights = {
337
- 'holding': 2.0, 'conclusion': 1.8, 'reasoning': 1.5,
338
- 'facts': 1.2, 'dissent': 0.8
339
- }
340
- importance *= section_weights.get(current_section, 1.0)
341
-
342
- # Entity importance
343
- entities = self.extract_legal_entities(chunk_text)
344
- if entities:
345
- entity_score = sum(e['importance'] for e in entities) / len(entities)
346
- importance *= (1 + entity_score * 0.5)
347
-
348
- chunks.append({
349
- 'id': chunk_id,
350
- 'text': chunk_text,
351
- 'title': title,
352
- 'section_type': current_section,
353
- 'importance_score': importance,
354
- 'entities': entities,
355
- 'embedding': None # Will be filled during indexing
356
- })
357
-
358
- section_sentences = []
359
-
360
- # Add remaining sentences
361
- if section_sentences:
362
- chunk_text = ' '.join(section_sentences)
363
- chunk_id = hashlib.md5(f"{title}_{len(chunks)}_{chunk_text[:50]}".encode()).hexdigest()[:12]
364
- chunks.append({
365
- 'id': chunk_id,
366
- 'text': chunk_text,
367
- 'title': title,
368
- 'section_type': current_section,
369
- 'importance_score': 1.0,
370
- 'entities': self.extract_legal_entities(chunk_text),
371
- 'embedding': None
372
- })
373
-
374
- return chunks
375
-
376
- def build_all_indices(self, chunks: List[Dict[str, Any]]):
377
- """Build all retrieval indices for this session"""
378
- self.chunks_data = chunks
379
- print(f"Building indices for session {self.session_id}: {len(chunks)} chunks...")
380
-
381
- # 1. Dense embeddings + FAISS index
382
- print("Building FAISS index...")
383
- embeddings = []
384
- for chunk in tqdm(chunks, desc="Creating embeddings"):
385
- embedding = self.create_embedding(chunk['text'])
386
- chunk['embedding'] = embedding
387
- embeddings.append(embedding)
388
-
389
- embeddings_matrix = np.vstack(embeddings)
390
- self.dense_index = faiss.IndexFlatIP(embeddings_matrix.shape[1]) # Inner product for normalized vectors
391
- self.dense_index.add(embeddings_matrix.astype('float32'))
392
-
393
- # 2. BM25 index for sparse retrieval
394
- print("Building BM25 index...")
395
- tokenized_corpus = [chunk['text'].lower().split() for chunk in chunks]
396
- self.bm25_index = BM25Okapi(tokenized_corpus)
397
-
398
- # 3. ColBERT-style token index
399
- print("Building ColBERT token index...")
400
- self.token_to_chunks = defaultdict(set)
401
- for i, chunk in enumerate(chunks):
402
- # Simple tokenization for token-level matching
403
- tokens = chunk['text'].lower().split()
404
- for token in tokens:
405
- self.token_to_chunks[token].add(i)
406
-
407
- # 4. Legal concept graph
408
- print("Building legal concept graph...")
409
- self.concept_graph = nx.Graph()
410
-
411
- for i, chunk in enumerate(chunks):
412
- self.concept_graph.add_node(i, text=chunk['text'][:200], importance=chunk['importance_score'])
413
-
414
- # Add edges between chunks with shared entities
415
- for j, other_chunk in enumerate(chunks[i+1:], i+1):
416
- shared_entities = set(e['text'] for e in chunk['entities']) & \
417
- set(e['text'] for e in other_chunk['entities'])
418
- if shared_entities:
419
- self.concept_graph.add_edge(i, j, weight=len(shared_entities))
420
-
421
- print(f"All indices built successfully for session {self.session_id}!")
422
-
423
- def multi_stage_retrieval(self, query_analysis: Dict[str, Any], top_k: int = 10) -> List[Tuple[Dict[str, Any], float]]:
424
- """Perform multi-stage retrieval combining all techniques"""
425
  candidates = {}
426
 
427
- print(f"Performing multi-stage retrieval for session {self.session_id}...")
428
 
429
- # Stage 1: Dense retrieval with expanded queries
430
- print("Stage 1: Dense retrieval...")
431
- for query in query_analysis['expanded_queries'][:3]:
432
- query_emb = self.create_embedding(query)
433
- scores, indices = self.dense_index.search(
434
- query_emb.reshape(1, -1).astype('float32'),
435
- top_k * 2
436
- )
437
 
438
- for idx, score in zip(indices[0], scores[0]):
439
  if idx < len(self.chunks_data):
440
- chunk_id = self.chunks_data[idx]['id']
441
- if chunk_id not in candidates:
442
- candidates[chunk_id] = {'chunk': self.chunks_data[idx], 'scores': {}}
443
- candidates[chunk_id]['scores']['dense'] = float(score)
444
-
445
- # Stage 2: Sparse retrieval (BM25)
446
- print("Stage 2: Sparse retrieval...")
447
- query_tokens = query_analysis['original_query'].lower().split()
448
- bm25_scores = self.bm25_index.get_scores(query_tokens)
449
- top_bm25_indices = np.argsort(bm25_scores)[-top_k*2:][::-1]
450
-
451
- for idx in top_bm25_indices:
452
- if idx < len(self.chunks_data):
453
- chunk_id = self.chunks_data[idx]['id']
454
- if chunk_id not in candidates:
455
- candidates[chunk_id] = {'chunk': self.chunks_data[idx], 'scores': {}}
456
- candidates[chunk_id]['scores']['bm25'] = float(bm25_scores[idx])
457
-
458
- # Stage 3: Entity-based retrieval
459
- print("Stage 3: Entity-based retrieval...")
460
- for entity in query_analysis['entities']:
461
- for chunk in self.chunks_data:
462
- chunk_entity_texts = [e['text'].lower() for e in chunk['entities']]
463
- if entity['text'].lower() in chunk_entity_texts:
464
  chunk_id = chunk['id']
465
  if chunk_id not in candidates:
466
- candidates[chunk_id] = {'chunk': chunk, 'scores': {}}
467
- candidates[chunk_id]['scores']['entity'] = \
468
- candidates[chunk_id]['scores'].get('entity', 0) + entity['importance']
469
-
470
- # Stage 4: Graph-based retrieval
471
- print("Stage 4: Graph-based retrieval...")
472
- if candidates and self.concept_graph:
473
- seed_chunks = []
474
- for chunk_id, data in list(candidates.items())[:5]:
475
- for i, chunk in enumerate(self.chunks_data):
476
- if chunk['id'] == chunk_id:
477
- seed_chunks.append(i)
478
- break
479
-
480
- for seed_idx in seed_chunks:
481
- if seed_idx in self.concept_graph:
482
- neighbors = list(self.concept_graph.neighbors(seed_idx))[:3]
483
- for neighbor_idx in neighbors:
484
- if neighbor_idx < len(self.chunks_data):
485
- chunk = self.chunks_data[neighbor_idx]
486
- chunk_id = chunk['id']
487
- if chunk_id not in candidates:
488
- candidates[chunk_id] = {'chunk': chunk, 'scores': {}}
489
- candidates[chunk_id]['scores']['graph'] = 0.5
490
-
491
- # Combine scores
492
- print("Combining scores...")
493
- weights = {'dense': 0.35, 'bm25': 0.25, 'entity': 0.25, 'graph': 0.15}
494
- final_scores = []
495
-
496
- for chunk_id, data in candidates.items():
497
- chunk = data['chunk']
498
- scores = data['scores']
499
-
500
- final_score = 0
501
- for method, weight in weights.items():
502
- if method in scores:
503
- # Normalize scores
504
- if method == 'dense':
505
- normalized = (scores[method] + 1) / 2 # [-1, 1] to [0, 1]
506
- elif method == 'bm25':
507
- normalized = min(scores[method] / 10, 1)
508
- elif method == 'entity':
509
- normalized = min(scores[method] / 3, 1)
510
  else:
511
- normalized = scores[method]
512
-
513
- final_score += weight * normalized
514
-
515
- # Boost by importance and section relevance
516
- final_score *= chunk['importance_score']
517
-
518
- if query_analysis['query_type'] == 'precedent' and chunk['section_type'] == 'holding':
519
- final_score *= 1.5
520
- elif query_analysis['query_type'] == 'factual' and chunk['section_type'] == 'facts':
521
- final_score *= 1.5
522
-
523
- final_scores.append((chunk, final_score))
524
 
525
- # Sort and return top-k
 
526
  final_scores.sort(key=lambda x: x[1], reverse=True)
 
527
  return final_scores[:top_k]
528
 
529
- def generate_answer_with_reasoning(self, query: str, retrieved_chunks: List[Tuple[Dict[str, Any], float]]) -> Dict[str, Any]:
530
- """Generate answer with legal reasoning"""
531
  if not self.groq_client:
532
  return {'error': 'Groq client not initialized'}
533
 
534
- # Prepare context
535
  context_parts = []
536
- for i, (chunk, score) in enumerate(retrieved_chunks, 1):
537
- entities = ', '.join([e['text'] for e in chunk['entities'][:3]])
538
  context_parts.append(f"""
539
- Document {i} [{chunk['title']}] - Relevance: {score:.2f}
540
- Section: {chunk['section_type']}
541
- Key Entities: {entities}
542
- Content: {chunk['text'][:800]}
543
- """)
544
 
545
  context = "\n---\n".join(context_parts)
546
 
547
- system_prompt = """You are an expert legal analyst. Provide thorough legal analysis using the IRAC method:
548
- 1. ISSUE: Identify the legal issue(s)
549
- 2. RULE: State the applicable legal rules/precedents
550
- 3. APPLICATION: Apply the rules to the facts
551
- 4. CONCLUSION: Provide a clear conclusion
552
-
553
- CRITICAL: Base ALL responses on the provided document excerpts only. Quote directly when making claims.
554
- If information is not in the excerpts, state "This information is not provided in the available documents."
555
- """
556
 
557
  user_prompt = f"""Query: {query}
558
 
559
- Retrieved Legal Documents:
560
- {context}
561
 
562
- Please provide a comprehensive legal analysis using IRAC method. Cite the documents when making claims."""
563
 
564
  try:
565
  response = self.groq_client.chat.completions.create(
@@ -569,7 +295,7 @@ class SessionRAG:
569
  ],
570
  model="llama-3.1-8b-instant",
571
  temperature=0.1,
572
- max_tokens=1000
573
  )
574
 
575
  answer = response.choices[0].message.content
@@ -587,45 +313,28 @@ class SessionRAG:
587
  'title': chunk['title'],
588
  'section': chunk['section_type'],
589
  'relevance_score': float(score),
590
- 'excerpt': chunk['text'][:200] + '...',
591
- 'entities': [e['text'] for e in chunk['entities'][:5]]
592
  }
593
- for chunk, score in retrieved_chunks
594
  ]
595
  }
596
 
597
  except Exception as e:
598
- return {
599
- 'error': f'Error generating answer: {str(e)}',
600
- 'sources': [{'chunk': c['text'][:200], 'score': s} for c, s in retrieved_chunks[:3]]
601
- }
602
-
603
- def process_documents(self, documents: List[Dict[str, str]]) -> Dict[str, Any]:
604
- """Process documents and build indices for this session"""
605
- all_chunks = []
606
-
607
- for doc in documents:
608
- chunks = self.chunk_text_hierarchical(doc['text'], doc.get('title', 'Document'))
609
- all_chunks.extend(chunks)
610
-
611
- self.build_all_indices(all_chunks)
612
-
613
- return {
614
- 'success': True,
615
- 'chunk_count': len(all_chunks),
616
- 'message': f'Processed {len(documents)} documents into {len(all_chunks)} chunks for session {self.session_id}'
617
- }
618
 
619
  def query_documents(self, query: str, top_k: int = 5) -> Dict[str, Any]:
620
- """Main query function - takes query, returns answer with sources"""
621
  if not self.chunks_data:
622
- return {'error': f'No documents indexed for session {self.session_id}. Call process_documents first.'}
623
 
624
- # Analyze query
625
- query_analysis = self.analyze_query(query)
626
 
627
- # Multi-stage retrieval
628
- retrieved_chunks = self.multi_stage_retrieval(query_analysis, top_k)
629
 
630
  if not retrieved_chunks:
631
  return {
@@ -634,59 +343,12 @@ class SessionRAG:
634
  }
635
 
636
  # Generate answer
637
- result = self.generate_answer_with_reasoning(query, retrieved_chunks)
638
  result['query_analysis'] = query_analysis
 
639
 
 
640
  return result
641
 
642
- def search_chunks_simple(self, query: str, top_k: int = 3) -> List[Dict[str, Any]]:
643
- """Simple search function for compatibility"""
644
- if not self.chunks_data:
645
- return []
646
-
647
- query_analysis = self.analyze_query(query)
648
- retrieved_chunks = self.multi_stage_retrieval(query_analysis, top_k)
649
-
650
- results = []
651
- for chunk, score in retrieved_chunks:
652
- results.append({
653
- 'chunk': {
654
- 'id': chunk['id'],
655
- 'text': chunk['text'],
656
- 'title': chunk['title']
657
- },
658
- 'score': score
659
- })
660
-
661
- return results
662
-
663
- def generate_conservative_answer(self, query: str, context_chunks: List[Dict[str, Any]]) -> str:
664
- """Generate conservative answer - for compatibility"""
665
- if not context_chunks:
666
- return "No relevant information found."
667
-
668
- # Convert format
669
- retrieved_chunks = [(chunk['chunk'], chunk['score']) for chunk in context_chunks]
670
- result = self.generate_answer_with_reasoning(query, retrieved_chunks)
671
-
672
- if 'error' in result:
673
- return result['error']
674
-
675
- return result.get('answer', 'Unable to generate answer.')
676
-
677
- # Backward compatibility functions (deprecated - use SessionRAG instead)
678
- def process_documents(documents: List[Dict[str, str]]) -> Dict[str, Any]:
679
- """Deprecated: Use SessionRAG.process_documents() instead"""
680
- raise NotImplementedError("Global functions are deprecated. Use SessionRAG class instead.")
681
-
682
- def query_documents(query: str, top_k: int = 5) -> Dict[str, Any]:
683
- """Deprecated: Use SessionRAG.query_documents() instead"""
684
- raise NotImplementedError("Global functions are deprecated. Use SessionRAG class instead.")
685
-
686
- def search_chunks_simple(query: str, top_k: int = 3) -> List[Dict[str, Any]]:
687
- """Deprecated: Use SessionRAG.search_chunks_simple() instead"""
688
- raise NotImplementedError("Global functions are deprecated. Use SessionRAG class instead.")
689
-
690
- def generate_conservative_answer(query: str, context_chunks: List[Dict[str, Any]]) -> str:
691
- """Deprecated: Use SessionRAG.generate_conservative_answer() instead"""
692
- raise NotImplementedError("Global functions are deprecated. Use SessionRAG class instead.")
 
1
+ # rag_optimized.py - Performance-Optimized RAG System
2
  import torch
3
  import numpy as np
4
  from transformers import AutoTokenizer, AutoModel
 
14
  from collections import defaultdict
15
  import spacy
16
  from rank_bm25 import BM25Okapi
17
+ import asyncio
18
+ import time
19
+ from concurrent.futures import ThreadPoolExecutor
20
+ import logging
21
+
22
+ # Configure logging
23
+ logger = logging.getLogger(__name__)
24
 
25
  # Global model instances (shared across sessions)
26
  _SHARED_MODEL = None
27
  _SHARED_TOKENIZER = None
28
  _SHARED_NLP_MODEL = None
29
  _DEVICE = None
30
+ _THREAD_POOL = None
31
 
32
+ # Legal knowledge base (optimized)
33
  LEGAL_CONCEPTS = {
34
  'liability': ['negligence', 'strict liability', 'vicarious liability', 'product liability'],
35
  'contract': ['breach', 'consideration', 'offer', 'acceptance', 'damages', 'specific performance'],
 
48
 
49
  def initialize_models(model_id: str, groq_api_key: str = None):
50
  """Initialize shared models (call once at startup)"""
51
+ global _SHARED_MODEL, _SHARED_TOKENIZER, _SHARED_NLP_MODEL, _DEVICE, _THREAD_POOL
52
 
53
  try:
54
  nltk.download('punkt', quiet=True)
 
57
  pass
58
 
59
  _DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
60
+ logger.info(f"Using device: {_DEVICE}")
61
 
62
+ logger.info(f"Loading model: {model_id}")
63
  _SHARED_TOKENIZER = AutoTokenizer.from_pretrained(model_id)
64
  _SHARED_MODEL = AutoModel.from_pretrained(model_id).to(_DEVICE)
65
  _SHARED_MODEL.eval()
66
 
67
+ # Initialize thread pool for CPU-bound operations
68
+ _THREAD_POOL = ThreadPoolExecutor(max_workers=4)
69
+
70
  try:
71
  _SHARED_NLP_MODEL = spacy.load("en_core_web_sm")
72
  except:
73
+ logger.warning("SpaCy model not found, using basic NER")
74
  _SHARED_NLP_MODEL = None
75
 
76
+ class OptimizedSessionRAG:
77
+ """High-performance session-specific RAG instance that loads pre-computed embeddings"""
78
 
79
  def __init__(self, session_id: str, groq_api_key: str = None):
80
  self.session_id = session_id
 
83
  # Session-specific indices and data
84
  self.dense_index = None
85
  self.bm25_index = None
 
86
  self.token_to_chunks = None
87
  self.chunks_data = []
88
 
89
+ # Performance tracking
90
+ self.load_time = None
91
+ self.index_build_time = None
92
+
93
  # Verify shared models are initialized
94
  if _SHARED_MODEL is None or _SHARED_TOKENIZER is None:
95
  raise ValueError("Models not initialized. Call initialize_models() first.")
96
 
97
  def load_existing_session_data(self, chunks_from_db: List[Dict[str, Any]]):
98
+ """OPTIMIZED: Load pre-existing chunks with embeddings from database - NO EMBEDDING CREATION"""
99
+ start_time = time.time()
100
+ logger.info(f"Loading existing session data for {self.session_id}: {len(chunks_from_db)} chunks...")
101
 
102
+ # Process chunks from MongoDB format - DIRECT LOADING, NO EMBEDDING COMPUTATION
103
+ self.chunks_data = self._process_db_chunks_fast(chunks_from_db)
104
 
105
+ # Rebuild indices from existing embeddings ONLY
106
+ self._rebuild_indices_from_precomputed_embeddings()
107
 
108
+ self.load_time = time.time() - start_time
109
+ logger.info(f"Session {self.session_id} loaded in {self.load_time:.2f}s with PRE-COMPUTED embeddings!")
110
 
111
+ def _process_db_chunks_fast(self, chunks_from_db: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
112
+ """FAST: Convert MongoDB chunk format to internal format without any computation"""
113
+ processed_chunks = []
114
+
115
+ for chunk in chunks_from_db:
116
+ # Convert embedding from list to numpy array if needed - NO COMPUTATION
117
+ embedding = chunk.get('embedding')
118
+ if embedding is None:
119
+ raise ValueError(f"Missing embedding for chunk {chunk.get('chunk_id', 'unknown')}")
120
+
121
+ if isinstance(embedding, list):
122
+ embedding = np.array(embedding, dtype=np.float32)
123
+
124
+ processed_chunk = {
125
+ 'id': chunk.get('chunk_id', chunk.get('id')),
126
+ 'text': chunk.get('content', chunk.get('text', '')),
127
+ 'title': chunk.get('title', 'Document'),
128
+ 'section_type': chunk.get('section_type', 'general'),
129
+ 'importance_score': chunk.get('importance_score', 1.0),
130
+ 'entities': chunk.get('entities', []),
131
+ 'embedding': embedding # PRE-COMPUTED, NO CREATION
132
+ }
133
+ processed_chunks.append(processed_chunk)
134
+
135
+ return processed_chunks
136
+
137
+ def _rebuild_indices_from_precomputed_embeddings(self):
138
+ """OPTIMIZED: Rebuild search indices using ONLY pre-computed embeddings from database"""
139
  if not self.chunks_data:
140
  raise ValueError("No chunks data available")
141
 
142
+ start_time = time.time()
143
+ logger.info(f"Rebuilding indices from {len(self.chunks_data)} pre-computed embeddings...")
144
 
145
+ # 1. Build FAISS index from existing embeddings - NO EMBEDDING COMPUTATION
146
  embeddings = []
147
  for chunk in self.chunks_data:
148
+ if chunk['embedding'] is None:
149
  raise ValueError(f"Missing embedding for chunk {chunk.get('id', 'unknown')}")
150
+ embeddings.append(chunk['embedding'])
151
+
152
+ # Stack embeddings efficiently
153
+ embeddings_matrix = np.vstack(embeddings).astype('float32')
154
+ logger.info(f"Built embeddings matrix: {embeddings_matrix.shape}")
155
 
156
+ # Build FAISS index
 
157
  self.dense_index = faiss.IndexFlatIP(embeddings_matrix.shape[1])
158
+ self.dense_index.add(embeddings_matrix)
159
 
160
+ # 2. Build BM25 index efficiently
161
  tokenized_corpus = [chunk['text'].lower().split() for chunk in self.chunks_data]
162
  self.bm25_index = BM25Okapi(tokenized_corpus)
163
 
164
+ # 3. Build token-to-chunk mapping efficiently
165
  self.token_to_chunks = defaultdict(set)
166
  for i, chunk in enumerate(self.chunks_data):
167
  tokens = chunk['text'].lower().split()
168
  for token in tokens:
169
  self.token_to_chunks[token].add(i)
170
 
171
+ self.index_build_time = time.time() - start_time
172
+ logger.info(f"All indices rebuilt in {self.index_build_time:.2f}s from pre-computed embeddings!")
173
 
174
+ def create_embedding(self, text: str) -> np.ndarray:
175
+ """Create embedding for query (ONLY used for new queries, not document loading)"""
176
+ inputs = _SHARED_TOKENIZER(text, padding=True, truncation=True,
177
+ max_length=512, return_tensors='pt').to(_DEVICE)
178
 
179
+ with torch.no_grad():
180
+ outputs = _SHARED_MODEL(**inputs)
181
+ attention_mask = inputs['attention_mask']
182
+ token_embeddings = outputs.last_hidden_state
183
+ input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
184
+ embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
185
+
186
+ # Normalize embeddings
187
+ embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
188
+
189
+ return embeddings.cpu().numpy()[0].astype('float32')
190
 
191
+ def analyze_query_fast(self, query: str) -> Dict[str, Any]:
192
+ """FAST query analysis - minimal processing"""
193
  query_lower = query.lower()
194
 
195
+ # Quick query type classification
196
  query_type = 'general'
197
  for qtype, patterns in QUERY_PATTERNS.items():
198
  if any(pattern in query_lower for pattern in patterns):
199
  query_type = qtype
200
  break
201
 
202
+ # Extract key concepts quickly
203
  key_concepts = []
204
  for concept_category, concepts in LEGAL_CONCEPTS.items():
205
  for concept in concepts:
206
  if concept in query_lower:
207
  key_concepts.append(concept)
208
 
209
+ # Simple query expansion
210
  expanded_queries = [query]
211
  if key_concepts:
212
+ expanded_queries.append(f"{query} {' '.join(key_concepts[:2])}")
213
 
214
  return {
215
  'original_query': query,
216
  'query_type': query_type,
 
217
  'key_concepts': key_concepts,
218
+ 'expanded_queries': expanded_queries[:2] # Limit to 2 for speed
219
  }
220
 
221
+ def fast_retrieval(self, query_analysis: Dict[str, Any], top_k: int = 10) -> List[Tuple[Dict[str, Any], float]]:
222
+ """OPTIMIZED: Fast multi-stage retrieval with minimal overhead"""
223
  candidates = {}
224
 
225
+ # Stage 1: Dense retrieval with primary query only
226
+ query = query_analysis['original_query']
227
+ query_emb = self.create_embedding(query)
228
+ scores, indices = self.dense_index.search(
229
+ query_emb.reshape(1, -1),
230
+ min(top_k * 2, len(self.chunks_data))
231
+ )
232
 
233
+ for idx, score in zip(indices[0], scores[0]):
234
+ if idx < len(self.chunks_data):
235
+ chunk = self.chunks_data[idx]
236
+ chunk_id = chunk['id']
237
+ candidates[chunk_id] = {
238
+ 'chunk': chunk,
239
+ 'score': float(score) * chunk['importance_score']
240
+ }
241
+
242
+ # Stage 2: BM25 boost for top candidates
243
+ if len(candidates) < top_k:
244
+ query_tokens = query.lower().split()
245
+ bm25_scores = self.bm25_index.get_scores(query_tokens)
246
+ top_bm25_indices = np.argsort(bm25_scores)[-top_k:][::-1]
247
 
248
+ for idx in top_bm25_indices:
249
  if idx < len(self.chunks_data):
250
+ chunk = self.chunks_data[idx]
251
  chunk_id = chunk['id']
252
  if chunk_id not in candidates:
253
+ candidates[chunk_id] = {
254
+ 'chunk': chunk,
255
+ 'score': float(bm25_scores[idx]) * 0.3 # Lower weight for BM25
256
+ }
257
  else:
258
+ candidates[chunk_id]['score'] += float(bm25_scores[idx]) * 0.2
259
 
260
+ # Convert to list and sort
261
+ final_scores = [(data['chunk'], data['score']) for data in candidates.values()]
262
  final_scores.sort(key=lambda x: x[1], reverse=True)
263
+
264
  return final_scores[:top_k]
265
 
266
+ def generate_fast_answer(self, query: str, retrieved_chunks: List[Tuple[Dict[str, Any], float]]) -> Dict[str, Any]:
267
+ """Generate answer with minimal overhead"""
268
  if not self.groq_client:
269
  return {'error': 'Groq client not initialized'}
270
 
271
+ # Prepare context efficiently
272
  context_parts = []
273
+ for i, (chunk, score) in enumerate(retrieved_chunks[:3], 1): # Limit to top 3 for speed
 
274
  context_parts.append(f"""
275
+ Document {i} - Relevance: {score:.2f}
276
+ {chunk['text'][:600]}
277
+ """)
278
 
279
  context = "\n---\n".join(context_parts)
280
 
281
+ system_prompt = """You are a legal AI assistant. Provide concise, accurate answers based ONLY on the provided documents. If information isn't in the documents, state that clearly."""
282
 
283
  user_prompt = f"""Query: {query}
284
 
285
+ Documents:
286
+ {context}
287
 
288
+ Provide a clear, concise answer based on the documents."""
289
 
290
  try:
291
  response = self.groq_client.chat.completions.create(
 
295
  ],
296
  model="llama-3.1-8b-instant",
297
  temperature=0.1,
298
+ max_tokens=500 # Limit for speed
299
  )
300
 
301
  answer = response.choices[0].message.content
 
313
  'title': chunk['title'],
314
  'section': chunk['section_type'],
315
  'relevance_score': float(score),
316
+ 'text_preview': chunk['text'][:200] + '...',
317
+ 'entities': [e['text'] for e in chunk['entities'][:3]]
318
  }
319
+ for chunk, score in retrieved_chunks[:5]
320
  ]
321
  }
322
 
323
  except Exception as e:
324
+ return {'error': f'Error generating answer: {str(e)}'}
325
 
326
  def query_documents(self, query: str, top_k: int = 5) -> Dict[str, Any]:
327
+ """OPTIMIZED: Main query function with minimal processing time"""
328
  if not self.chunks_data:
329
+ return {'error': f'No documents indexed for session {self.session_id}'}
330
 
331
+ start_time = time.time()
 
332
 
333
+ # Fast query analysis
334
+ query_analysis = self.analyze_query_fast(query)
335
+
336
+ # Fast retrieval
337
+ retrieved_chunks = self.fast_retrieval(query_analysis, top_k)
338
 
339
  if not retrieved_chunks:
340
  return {
 
343
  }
344
 
345
  # Generate answer
346
+ result = self.generate_fast_answer(query, retrieved_chunks)
347
  result['query_analysis'] = query_analysis
348
+ result['processing_time'] = time.time() - start_time
349
 
350
+ logger.info(f"Query processed in {result['processing_time']:.2f}s")
351
  return result
352
 
353
+ # For backward compatibility - replace SessionRAG with OptimizedSessionRAG
354
+ SessionRAG = OptimizedSessionRAG
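
A minimal usage sketch of the optimized flow, assuming rag.py is importable and that chunks stored in the database already carry pre-computed embeddings in the shape read by _process_db_chunks_fast; the model id, API key, session id, and example chunk values below are placeholders, not part of the committed code.

  # usage_sketch.py - illustrative only; model id, key, and stored chunk shape are assumed
  from rag import initialize_models, OptimizedSessionRAG

  # Load the shared embedding model, tokenizer, and thread pool once per process
  initialize_models("sentence-transformers/all-MiniLM-L6-v2", groq_api_key="YOUR_GROQ_KEY")

  # Chunks as they might come back from the database, each with a pre-computed embedding
  chunks_from_db = [
      {
          "chunk_id": "abc123",
          "content": "The court held that the defendant was negligent ...",
          "title": "Smith v. Jones",
          "section_type": "holding",
          "importance_score": 2.0,
          "entities": [],
          "embedding": [0.01] * 384,  # placeholder vector; must match the model's dimension
      },
  ]

  rag = OptimizedSessionRAG(session_id="session-1", groq_api_key="YOUR_GROQ_KEY")
  rag.load_existing_session_data(chunks_from_db)   # builds FAISS and BM25 indices from stored vectors
  result = rag.query_documents("What did the court hold on negligence?", top_k=5)
  print(result.get("answer"), result.get("processing_time"))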