Ashhar committed
Commit · 6d149f9
1 Parent(s): 4380c2b
support multiple tables/views
app.py CHANGED
@@ -3,7 +3,6 @@ import os
 import pandas as pd
 from typing import Literal, TypedDict
 from sqlalchemy import create_engine, inspect, text
-import json
 from transformers import AutoTokenizer
 from utils import pprint
 import time
@@ -34,7 +33,7 @@ ModelConfig = TypedDict("ModelConfig", {
 })
 
 MODEL_CONFIG: dict[ModelType, ModelConfig] = {
-    "CLAUDE": {
+    "CLAUDE_HAIKU": {
         "client": anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY")),
         "model": "claude-3-5-haiku-20241022",
         # "model": "claude-3-5-sonnet-20241022",
@@ -42,6 +41,14 @@ MODEL_CONFIG: dict[ModelType, ModelConfig] = {
         "max_context": 40000,
         "tokenizer": AutoTokenizer.from_pretrained("Xenova/claude-tokenizer")
     },
+    "CLAUDE_SONNET": {
+        "client": anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY")),
+        # "model": "claude-3-5-haiku-20241022",
+        # "model": "claude-3-5-sonnet-20241022",
+        "model": "claude-3-5-sonnet-20240620",
+        "max_context": 40000,
+        "tokenizer": AutoTokenizer.from_pretrained("Xenova/claude-tokenizer")
+    },
     "GPT_4o": {
         "client": OpenAI(api_key=os.environ.get("OPENAI_API_KEY")),
         "model": "gpt-4o",
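For orientation, the shape each MODEL_CONFIG entry must satisfy can be sketched from the keys this diff exercises: client, model, max_context, tokenizer, plus an optional tools_model read later via .get(). The sketch below is an illustrative reconstruction, not the TypedDict actually declared earlier in app.py:

# Illustrative sketch only; the real declaration lives earlier in app.py.
from typing import Any, Literal, TypedDict

ModelType = Literal["CLAUDE_HAIKU", "CLAUDE_SONNET", "GPT_4o"]  # other members elided

class ModelConfig(TypedDict, total=False):
    client: Any        # anthropic.Anthropic(...) or OpenAI(...) instance
    model: str         # provider model id, e.g. "claude-3-5-sonnet-20240620"
    max_context: int   # token budget checked before a prompt is sent
    tokenizer: Any     # tokenizer backing __countTokens
    tools_model: str   # optional override, read via .get("tools_model")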
@@ -111,7 +118,7 @@ TOOLS_MODEL = MODEL_CONFIG[modelType].get("tools_model") or MODEL
 MAX_CONTEXT = MODEL_CONFIG[modelType]["max_context"]
 tokenizer = MODEL_CONFIG[modelType]["tokenizer"]
 
-isClaudeModel = modelType == "CLAUDE"
+isClaudeModel = modelType.startswith("CLAUDE")
 isDeepSeekModel = modelType.startswith("DEEPSEEK")
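The move from an equality test on a single model key to a prefix test is what lets both Claude entries share one code path; a trivial illustration (loop values assumed):

# Illustration: the prefix test now covers both Claude entries.
for mt in ("CLAUDE_HAIKU", "CLAUDE_SONNET", "GPT_4o"):
    print(mt, mt.startswith("CLAUDE"))  # True, True, False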
@@ -211,7 +218,7 @@ def get_table_schema(table_name):
 
 def get_sample_data(table_name):
     if not st.session_state.engine:
-        return None
+        return pd.DataFrame()  # Return empty DataFrame instead of None
 
     query = f"SELECT * FROM {table_name} ORDER BY 1 DESC LIMIT 3"
     try:
@@ -219,8 +226,8 @@ def get_sample_data(table_name):
         df = pd.read_sql(query, conn)
         return df
     except Exception as e:
-        st.error(f"Error fetching sample data: {str(e)}")
-        return
+        st.error(f"Error fetching sample data for {table_name}: {str(e)}")
+        return pd.DataFrame()  # Return empty DataFrame on error
 
 
 def clean_sql_response(response: str) -> str:
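Returning an empty DataFrame instead of None gives every call site one uniform .empty check in place of a None guard; a minimal sketch of the pattern (the table name is hypothetical):

# Sketch: uniform handling at the call site, no None guard needed.
sample = get_sample_data("orders")  # "orders" is a made-up table name
if not sample.empty:
    st.dataframe(sample, use_container_width=True, hide_index=True)
else:
    st.warning("No sample data available")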
@@ -254,28 +261,57 @@ def execute_query(query):
 
 
 def generate_sql_query(user_query):
-    […]
-    {json.dumps(st.session_state.table_schema, indent=2)}
-    […]
-    {
+    # Build context for all selected tables
+    tables_context = []
+    for table_name, table_type in st.session_state.selected_tables.items():
+        # Format schema in markdown
+        schema_info = st.session_state.table_schemas[table_name]
+
+        # Build markdown formatted schema
+        schema_md = [f"\n\n### {table_type}: {table_name}"]
+
+        # Add table comment if exists
+        if schema_info.get("table_comment"):
+            schema_md.append(f"> {schema_info['table_comment']}")
+
+        # Add column details
+        schema_md.append("\n**Columns:**")
+        for col_name, col_info in schema_info["columns"].items():
+            col_type = col_info["type"]
+            col_comment = col_info.get("comment")
+
+            # Format column with type and optional comment
+            if col_comment:
+                schema_md.append(f"- `{col_name}` ({col_type}) - {col_comment}")
+            else:
+                schema_md.append(f"- `{col_name}` ({col_type})")
+
+        # Add sample data
+        schema_md.append("\n**Sample Data:**")
+        schema_md.append(st.session_state.sample_data[table_name].to_markdown(index=False))
+
+        # Join all parts with newlines
+        tables_context.append("\n".join(schema_md))
 
+    prompt = f"""You are a SQL expert. Generate a valid PostgreSQL query based on the following context and user query.
+
+<AVAILABLE_OBJECTS>
+{chr(10).join(tables_context)}
 
 Important:
 1. Only return the SQL query, nothing else
 2. The query should be valid PostgreSQL syntax
 3. Do not include any explanations or comments
 4. Make sure to handle NULL values appropriately
-5. […]
+5. If joining tables, use appropriate join conditions based on the schema
+6. Use table names with appropriate qualifiers to avoid ambiguity
 
 User Query: {user_query}
 """
 
     prompt_tokens = __countTokens(prompt)
+    print("\n")
+    pprint(f"[{MODEL}] Prompt tokens for SQL generation: {prompt_tokens}")
 
     # Debug prompt in a Streamlit expander for better organization
     # Check if running locally based on Streamlit's origin header
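Two notes on the new prompt builder. The chr(10) is the standard f-string workaround: before Python 3.12, an f-string expression could not contain a backslash, so chr(10) stands in for "\n". And for a hypothetical table, the loop above would render a context block roughly like the following (all names, comments, and values invented for illustration):

### Table: orders
> Customer orders placed through the storefront

**Columns:**
- `id` (INTEGER) - primary key
- `customer_id` (INTEGER) - references customers.id
- `total` (NUMERIC)

**Sample Data:**
|   id |   customer_id |   total |
|-----:|--------------:|--------:|
|    3 |             7 |    42.5 |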
@@ -340,56 +376,98 @@ if st.session_state.connection_string:
     db_objects = [(table, 'Table') for table in tables] + [(view, 'View') for view in views]
     db_objects.sort(key=lambda x: x[0])  # Sort alphabetically by name
 
-    […]
+    # Extract just the names for the multiselect
+    object_names = [obj[0] for obj in db_objects]
+
+    # Default to 'lsq_leads' if present
+    default_selections = ['lsq_leads'] if 'lsq_leads' in object_names else []
+
+    # Create multiselect for table/view selection
+    selected_objects = st.multiselect(
+        "Select tables/views",
+        options=object_names,
+        default=default_selections,
+        help="You can select multiple tables/views to query across them"
+    )
+
+    # Display selected object types
+    if selected_objects:
+        st.write("Selected objects:")
+        for obj in selected_objects:
+            obj_type = next(obj_type for obj_name, obj_type in db_objects if obj_name == obj)
+            st.write(f"- {obj}: {obj_type}")
 
     # Create containers for schema and data
     schema_container = st.container()
     data_container = st.container()
 
-    […]
+    # Initialize or reset session state for selected objects
+    if selected_objects:
+        # Always ensure dictionaries exist in session state
+        if not isinstance(st.session_state.get("selected_tables"), dict):
+            st.session_state.selected_tables = {}
+        if not isinstance(st.session_state.get("table_schemas"), dict):
+            st.session_state.table_schemas = {}
+        if not isinstance(st.session_state.get("sample_data"), dict):
+            st.session_state.sample_data = {}
+
+        # Clear previous data for tables that are no longer selected
+        current_tables = set(selected_objects)
+        previous_tables = set(st.session_state.selected_tables.keys())
+        removed_tables = previous_tables - current_tables
+
+        for table in removed_tables:
+            if table in st.session_state.selected_tables:
+                del st.session_state.selected_tables[table]
+            if table in st.session_state.table_schemas:
+                del st.session_state.table_schemas[table]
+            if table in st.session_state.sample_data:
+                del st.session_state.sample_data[table]
 
+        # Update session state with new selections
+        for obj in selected_objects:
+            # Update selected tables
+            st.session_state.selected_tables[obj] = next(
+                obj_type for obj_name, obj_type in db_objects if obj_name == obj
+            )
+
+            # Fetch and store schema
+            schema = get_table_schema(obj)
+            if schema:
+                st.session_state.table_schemas[obj] = schema
+
+            # Fetch and store sample data
+            sample_data = get_sample_data(obj)
+            if not sample_data.empty:
+                st.session_state.sample_data[obj] = sample_data
 
+        # Display schema and sample data for each selected object
         with schema_container:
-            […]
+            st.subheader("Table/View Schemas")
+            for obj in selected_objects:
+                if obj in st.session_state.table_schemas:
+                    st.write(f"**{obj} Schema:**")
+                    st.json(st.session_state.table_schemas[obj])
+                    st.write("---")
+                else:
+                    st.warning(f"Could not fetch schema for {obj}")
 
         with data_container:
-            […]
+            st.subheader("Sample Data")
+            for obj in selected_objects:
+                if obj in st.session_state.sample_data and not st.session_state.sample_data[obj].empty:
+                    st.write(f"**{obj} (Last 3 rows):**")
+                    st.dataframe(
+                        st.session_state.sample_data[obj],
+                        use_container_width=True,
+                        hide_index=True
+                    )
+                    st.write("---")
+                else:
+                    st.warning(f"No sample data available for {obj}")
 
 # Query Input Section
-if st.session_state.[…]
+if st.session_state.get("selected_tables"):
     st.header("3. Query Input")
     user_query = st.text_area("Enter your query in plain English")
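The session-state block above follows a general reconcile pattern: evict cached entries for deselected items, then (re)fetch and store entries for the current selection. A condensed, framework-free sketch of the same idea (function and variable names are illustrative):

# Sketch of the reconcile pattern, stripped of Streamlit specifics.
def sync_cache(cache: dict, selected: list, fetch) -> None:
    for name in set(cache) - set(selected):
        del cache[name]              # drop entries for deselected items
    for name in selected:
        cache[name] = fetch(name)    # (re)fetch entries for current selection

schemas = {}
sync_cache(schemas, ["orders", "customers"], fetch=lambda t: {"table": t})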