Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@
developer-portal
supervisely-dev-portal-db
chromadb
__pycache__
__pycache__
.python-version
18 changes: 16 additions & 2 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,28 @@
"version": "0.2.0",
"configurations": [
{
"name": "Streamlit",
"name": "Supervisely",
"type": "python",
"request": "launch",
"module": "streamlit",
"console": "integratedTerminal",
"args": [
"run",
"chat_app.py"
"docsrag/app/chat_app.py"
],
"env": {
"STREAMLIT_SERVER_PORT": "8501"
},
},
{
"name": "App v1",
"type": "python",
"request": "launch",
"module": "streamlit",
"console": "integratedTerminal",
"args": [
"run",
"docsrag/app/chat_app_v1.py"
],
"env": {
"STREAMLIT_SERVER_PORT": "8501"
Expand Down
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
1. Install poetry
2. Install packages: `poetry install`
3. Activate venv: `poetry shell`
47 changes: 0 additions & 47 deletions chat_app_v1.py

This file was deleted.

26 changes: 20 additions & 6 deletions chat_app.py → docsrag/app/chat_app.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,21 @@
import streamlit as st
from dotenv import load_dotenv
import json
import os
from src.chromadb import read_db, get_retriever
from src.chain import get_chain, retrieve_docs, generate_response, generate_response_stream

import streamlit as st
from dotenv import load_dotenv

from docsrag.app.utils import upd_sqlite_version
from docsrag.rag.chain import (
generate_response,
generate_response_stream,
get_chain,
retrieve_docs,
)
from docsrag.rag.chromadb import get_retriever, read_db

# Uncomment if troubles with sqlite3 version
# More info: https://docs.trychroma.com/troubleshooting#sqlite
upd_sqlite_version()


def get_sources(docs):
Expand All @@ -30,8 +42,10 @@ def get_sources(docs):
load_dotenv(".env")
with open("config.json") as f:
config = json.load(f)

st.session_state.config = config
st.session_state.vectorstore = read_db("supervisely-dev-portal-db")

with open("models.json") as f:
models = json.load(f)
st.session_state.models = models
Expand Down Expand Up @@ -85,9 +99,9 @@ def get_sources(docs):
# Add a blinking cursor to simulate typing
message_placeholder.markdown(response + "▌")
message_placeholder.markdown(response)

response = references + "\n\n" + response
# Add user message to chat history
st.session_state.messages.append({"role": "user", "content": prompt})
# Add assistant response to chat history
st.session_state.messages.append({"role": "assistant", "content": response})
st.session_state.messages.append({"role": "assistant", "content": response})
94 changes: 94 additions & 0 deletions docsrag/app/chat_app_v1.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import streamlit as st
import time
import hashlib

from pathlib import Path
from streamlit_tree_select import tree_select

from docsrag.rag.docs_reader import docs_tree, copy_files_to_folder_if_not_exists
from docsrag.app.utils import prepare_vectorstore, hash_from_str_list, upd_sqlite_version


# Uncomment if troubles with sqlite3 version
# More info: https://docs.trychroma.com/troubleshooting#sqlite
upd_sqlite_version()

def get_bot_response(user_input):
    """Placeholder chatbot reply: pause briefly, then echo the user's input.

    Stands in for the real retrieval logic until it is wired up.
    """
    time.sleep(1)  # simulate processing latency
    return f"Echoing '{user_input}'"

def send_message():
    """Handle a chat submit: record the user's message and the bot's reply."""
    text = st.session_state.input
    if not text:
        return  # nothing typed — ignore the submit
    history = st.session_state.chat_history
    # The user's message first, then the bot response right after it.
    history.append(f"{text}")
    history.append(get_bot_response(text))
    # st.chat_input clears itself and triggers a rerun, so neither of these
    # is needed here:
    # st.session_state.input = ""
    # st.experimental_rerun()


def load_files_tree(url: str):
    """Fetch the docs file tree for *url* and publish it to session state.

    The repository name (last URL segment) becomes the single root node's
    label, and the tree is checked out under ``./<repo-name>``.
    """
    repo = url.rsplit("/", 1)[-1]
    local_path = f"./{repo}"
    children = docs_tree(url, local_path)
    root = {"label": repo, "value": local_path, "children": children}
    st.session_state.nodes = [root]


# NOTE(review): this reset runs on every Streamlit rerun, so the file tree is
# rebuilt from scratch each time — confirm that is intended.
st.session_state.nodes = []

st.title("Retrieval Chatbot")

# Chat history is stored in a session state to persist over reruns
if 'chat_history' not in st.session_state:
    st.session_state.chat_history = []

# Layout: Chat display area and Input area
expandable_settings = st.expander("Settings")
chat_container = st.container()
input_container = st.container()


# prepare vectorstore
with expandable_settings:
    url_input = st.text_input("Enter docs URL", placeholder="https://github.com/tiangolo/fastapi")
    if url_input:
        load_files_tree(url_input)
        # Root label is the repo name derived from the URL.
        name = st.session_state.nodes[0]["label"]

    # Let the user tick files to index; submitting the form starts the upload.
    with st.form("select_data"):
        selected_data = tree_select(st.session_state.nodes)
        load_new_data_button = st.form_submit_button("Upload")

    if load_new_data_button:
        # Only markdown files are indexed.
        text_files = [p for p in selected_data['checked'] if p.endswith(".md")]
        # Content-address the vectorstore directory by the selected file set,
        # so an identical selection reuses an existing store.
        cur_hash = hash_from_str_list(text_files)
        # NOTE(review): `name` is only bound when url_input is non-empty —
        # pressing Upload without a URL would raise NameError here; verify.
        hash_name = f"{name}_{cur_hash}"
        raw_data_path = Path(".") / hash_name / "raw_data"
        # TODO: replace the spinners with a real progress bar
        with st.spinner("Preparing files..."):
            copy_files_to_folder_if_not_exists(
                src_files=text_files, dst=raw_data_path
            )
        with st.spinner("Preparing vectorstore..."):
            vectorstore = prepare_vectorstore(hash_name, raw_data_path)

# Chat display area
with chat_container:
    # History alternates user / bot entries, starting with the user.
    for i, message in enumerate(st.session_state.chat_history):
        if i % 2 == 0:
            st.info(message, icon="👤")
        else:
            st.success(message, icon="🤖")


# Input area - with user input and send button
# with input_container:
user_input = st.chat_input(placeholder="Type your message here...", key="input", on_submit=send_message)
30 changes: 30 additions & 0 deletions docsrag/app/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import os
from pathlib import Path
import sys
import hashlib
from importlib import import_module
from typing import List, Optional
from langchain.vectorstores.chroma import Chroma

from docsrag.rag.docs_reader import read_docs
from docsrag.rag.chromadb import read_db, build_db


def upd_sqlite_version() -> None:
    """Register ``pysqlite3`` under the stdlib ``sqlite3`` module name.

    chromadb requires a newer SQLite than some systems ship; after this
    swap, every subsequent ``import sqlite3`` resolves to pysqlite3's
    bundled, newer build.
    """
    module = import_module("pysqlite3")
    sys.modules.pop("pysqlite3")
    sys.modules["sqlite3"] = module


def prepare_vectorstore(name: str, docs_path: Optional[str] = None, device: Optional[int] = None) -> Chroma:
    """Load the persisted Chroma vectorstore *name*, building it first if needed.

    Args:
        name: Directory name of the vectorstore, relative to the CWD.
        docs_path: Folder with the raw documents; required only when the
            store does not exist yet (callers also pass ``Path`` objects).
        device: Optional GPU index forwarded to the embedding model;
            ``None`` keeps embeddings on CPU.

    Returns:
        The ready-to-query ``Chroma`` vectorstore.

    Raises:
        ValueError: If the store is missing and *docs_path* was not given.
    """
    # A persisted Chroma store always contains this sqlite file.
    if (Path(".") / name / "chroma.sqlite3").exists():
        return read_db(persist_directory=name, device=device)
    if docs_path is None:
        # Fixed grammar in the error message ("doesn't exists" -> "doesn't exist").
        raise ValueError(f"docs_path can't be None: vectorstore {name} doesn't exist.")

    splits = read_docs(docs_path)
    return build_db(splits, name, device)


def hash_from_str_list(strings: List[str]) -> str:
    """Return a SHA-256 hex digest uniquely identifying an ordered string list.

    Each element is length-prefixed before hashing so element boundaries are
    part of the digest. Plain concatenation (the previous behavior) made
    e.g. ``["ab", "c"]`` and ``["a", "bc"]`` collide, which could silently
    reuse the wrong content-addressed vectorstore directory.
    """
    digest = hashlib.sha256()
    for s in strings:
        data = s.encode()
        # 8-byte big-endian length prefix keeps element boundaries unambiguous.
        digest.update(len(data).to_bytes(8, "big"))
        digest.update(data)
    return digest.hexdigest()
14 changes: 7 additions & 7 deletions src/chain.py → docsrag/rag/chain.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
from operator import itemgetter
from typing import Sequence

from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.document import Document
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable.passthrough import RunnableAssign
from langchain_openai import ChatOpenAI


# After the retriever fetches documents, this
Expand Down Expand Up @@ -45,12 +45,12 @@ def get_chain(retriever, model="gpt-3.5-turbo-1106", temperature=0.7):
Here are texts from Supervisely SDK documentation:
{context}
Answer questions based on the provided texts. If provided texts are not relevant, mention it to the user and answer based on your knowledge.
"""
""",
),
(
"human",
"""The question is:
{question}"""
{question}""",
),
]
)
Expand All @@ -67,9 +67,9 @@ def get_chain(retriever, model="gpt-3.5-turbo-1106", temperature=0.7):
chain = (
RunnableAssign(
{
"context": (itemgetter("question") | retriever | format_docs).with_config(
run_name="FormatDocs"
)
"context": (
itemgetter("question") | retriever | format_docs
).with_config(run_name="FormatDocs")
}
)
# The "RunnableAssign" above returns a dict with keys
Expand All @@ -93,4 +93,4 @@ def generate_response(chain, prompt):

def generate_response_stream(chain, prompt):
    """Stream the chain's answer for *prompt*, yielding it chunk by chunk."""
    return chain.stream({"question": prompt})
23 changes: 17 additions & 6 deletions src/chromadb.py → docsrag/rag/chromadb.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,17 @@
from langchain.embeddings import HuggingFaceEmbeddings
from typing import Optional
from langchain.vectorstores.chroma import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings


def build_db(docs, persist_directory):
def build_db(docs, persist_directory, device: Optional[int] = None) -> Chroma:
model_kwargs={}

if device is not None:
model_kwargs["device"] = device

embeddings = HuggingFaceEmbeddings(
model_name="thenlper/gte-base",
model_kwargs={"device": 0}, # Comment out to use CPU
model_kwargs=model_kwargs,
)
vectorstore = Chroma(
embedding_function=embeddings,
Expand All @@ -15,10 +21,15 @@ def build_db(docs, persist_directory):
return vectorstore


def read_db(persist_directory):
def read_db(persist_directory, device: Optional[int] = None) -> Chroma:
model_kwargs={}

if device is not None:
model_kwargs["device"] = device

embeddings = HuggingFaceEmbeddings(
model_name="thenlper/gte-base",
model_kwargs={"device": 0}, # Comment out to use CPU
model_kwargs=model_kwargs,
)
vectorstore = Chroma(
embedding_function=embeddings,
Expand All @@ -29,4 +40,4 @@ def read_db(persist_directory):

def get_retriever(vectorstore, k):
    """Wrap *vectorstore* as a retriever returning the top *k* similar documents."""
    return vectorstore.as_retriever(search_kwargs={"k": k})
Loading