mirror of
https://github.com/RYDE-WORK/Langchain-Chatchat.git
synced 2026-01-19 13:23:16 +08:00
Merge branch 'dev' into master
This commit is contained in:
commit
4157201c2c
@ -52,10 +52,11 @@ OpenAI GPT API 的调用,并将在后续持续扩充对各类模型及模型 A
|
||||
|
||||
🚩 本项目未涉及微调、训练过程,但可利用微调或训练对本项目效果进行优化。
|
||||
|
||||
🌐 [AutoDL 镜像](https://www.codewithgpu.com/i/chatchat-space/Langchain-Chatchat/Langchain-Chatchat) 中 `v0.2.10`
|
||||
🌐 [AutoDL 镜像](https://www.codewithgpu.com/i/chatchat-space/Langchain-Chatchat/Langchain-Chatchat) 中 `v14`
|
||||
|
||||
版本所使用代码已更新至本项目 `v0.2.10` 版本。
|
||||
|
||||
🐳 [Docker 镜像](registry.cn-beijing.aliyuncs.com/chatchat/chatchat:0.2.6) 已经更新到 ```0.2.7``` 版本。
|
||||
🐳 [Docker 镜像](registry.cn-beijing.aliyuncs.com/chatchat/chatchat:0.2.7) 已经更新到 ```0.2.7``` 版本。
|
||||
|
||||
🌲 一行命令运行 Docker :
|
||||
|
||||
@ -197,4 +198,4 @@ $ python startup.py -a
|
||||
|
||||
<img src="img/official_wechat_mp_account.png" alt="二维码" width="300" />
|
||||
|
||||
🎉 Langchain-Chatchat 项目官方公众号,欢迎扫码关注。
|
||||
🎉 Langchain-Chatchat 项目官方公众号,欢迎扫码关注。
|
||||
@ -61,7 +61,7 @@ The main process analysis from the aspect of document process:
|
||||
🚩 Training and fine-tuning are not involved in this project, but one can still improve performance by doing
|
||||
these.
|
||||
|
||||
🌐 [AutoDL image](https://www.codewithgpu.com/i/chatchat-space/Langchain-Chatchat/Langchain-Chatchat) is supported, and in v13 the codes are update to v0.2.9.
|
||||
🌐 [AutoDL image](https://www.codewithgpu.com/i/chatchat-space/Langchain-Chatchat/Langchain-Chatchat) is supported, and in v14 the code is updated to v0.2.10.
|
||||
|
||||
🐳 [Docker image](registry.cn-beijing.aliyuncs.com/chatchat/chatchat:0.2.7) is updated to 0.2.7
|
||||
|
||||
@ -191,7 +191,7 @@ please refer to the [Wiki](https://github.com/chatchat-space/Langchain-Chatchat/
|
||||
|
||||
### WeChat Group
|
||||
|
||||
<img src="img/qr_code_87.jpg" alt="二维码" width="300" height="300" />
|
||||
<img src="img/qr_code_88.jpg" alt="二维码" width="300" height="300" />
|
||||
|
||||
### WeChat Official Account
|
||||
|
||||
|
||||
@ -151,7 +151,7 @@ $ python startup.py -a
|
||||
|
||||
### WeChat グループ
|
||||
|
||||
<img src="img/qr_code_67.jpg" alt="二维码" width="300" height="300" />
|
||||
<img src="img/qr_code_88.jpg" alt="二维码" width="300" height="300" />
|
||||
|
||||
### WeChat 公式アカウント
|
||||
|
||||
|
||||
@ -3,7 +3,7 @@ import os
|
||||
# 默认使用的知识库
|
||||
DEFAULT_KNOWLEDGE_BASE = "samples"
|
||||
|
||||
# 默认向量库/全文检索引擎类型。可选:faiss, milvus(离线) & zilliz(在线), pgvector,全文检索引擎es
|
||||
# 默认向量库/全文检索引擎类型。可选:faiss, milvus(离线) & zilliz(在线), pgvector, 全文检索引擎es, chromadb
|
||||
DEFAULT_VS_TYPE = "faiss"
|
||||
|
||||
# 缓存向量库数量(针对FAISS)
|
||||
@ -110,7 +110,8 @@ kbs_config = {
|
||||
"milvus_kwargs":{
|
||||
"search_params":{"metric_type": "L2"}, #在此处增加search_params
|
||||
"index_params":{"metric_type": "L2","index_type": "HNSW"} # 在此处增加index_params
|
||||
}
|
||||
},
|
||||
"chromadb": {}
|
||||
}
|
||||
|
||||
# TextSplitter配置项,如果你不明白其中的含义,就不要修改。
|
||||
|
||||
@ -47,6 +47,7 @@ llama-index==0.9.35
|
||||
# pymilvus==2.3.4
|
||||
# psycopg2==2.9.9
|
||||
# pgvector==0.2.4
|
||||
# chromadb==0.4.13
|
||||
#flash-attn==2.4.2 # For Orion-14B-Chat and Qwen-14B-Chat
|
||||
#autoawq==0.1.8 # For Int4
|
||||
#rapidocr_paddle[gpu]==1.3.11 # gpu accelleration for ocr of pdf and image files
|
||||
@ -63,4 +64,4 @@ streamlit-modal==0.1.0
|
||||
streamlit-aggrid==0.3.4.post3
|
||||
httpx==0.26.0
|
||||
watchdog==3.0.0
|
||||
jwt==1.3.1
|
||||
pyjwt==2.8.0
|
||||
|
||||
@ -38,6 +38,7 @@ transformers_stream_generator==0.0.4
|
||||
vllm==0.2.7; sys_platform == "linux"
|
||||
httpx==0.26.0
|
||||
llama-index==0.9.35
|
||||
pyjwt==2.8.0
|
||||
|
||||
# jq==1.6.0
|
||||
# beautifulsoup4~=4.12.2
|
||||
|
||||
@ -20,6 +20,7 @@ requests~=2.31.0
|
||||
pathlib~=1.0.1
|
||||
pytest~=7.4.3
|
||||
llama-index==0.9.35
|
||||
pyjwt==2.8.0
|
||||
|
||||
dashscope==1.13.6
|
||||
arxiv~=2.1.0
|
||||
@ -30,4 +31,5 @@ watchdog~=3.0.0
|
||||
# volcengine>=1.0.119
|
||||
# pymilvus>=2.3.4
|
||||
# psycopg2==2.9.9
|
||||
# pgvector>=0.2.4
|
||||
# pgvector>=0.2.4
|
||||
# chromadb==0.4.13
|
||||
|
||||
@ -47,6 +47,7 @@ class SupportedVSType:
|
||||
ZILLIZ = 'zilliz'
|
||||
PG = 'pg'
|
||||
ES = 'es'
|
||||
CHROMADB = 'chromadb'
|
||||
|
||||
|
||||
class KBService(ABC):
|
||||
@ -319,6 +320,9 @@ class KBServiceFactory:
|
||||
elif SupportedVSType.ES == vector_store_type:
|
||||
from server.knowledge_base.kb_service.es_kb_service import ESKBService
|
||||
return ESKBService(kb_name, embed_model=embed_model)
|
||||
elif SupportedVSType.CHROMADB == vector_store_type:
|
||||
from server.knowledge_base.kb_service.chromadb_kb_service import ChromaKBService
|
||||
return ChromaKBService(kb_name, embed_model=embed_model)
|
||||
elif SupportedVSType.DEFAULT == vector_store_type: # kb_exists of default kbservice is False, to make validation easier.
|
||||
from server.knowledge_base.kb_service.default_kb_service import DefaultKBService
|
||||
return DefaultKBService(kb_name)
|
||||
|
||||
107
server/knowledge_base/kb_service/chromadb_kb_service.py
Normal file
107
server/knowledge_base/kb_service/chromadb_kb_service.py
Normal file
@ -0,0 +1,107 @@
|
||||
import uuid
|
||||
from typing import Any, Dict, List, Tuple
|
||||
|
||||
import chromadb
|
||||
from chromadb.api.types import (GetResult, QueryResult)
|
||||
from langchain.docstore.document import Document
|
||||
|
||||
from configs import SCORE_THRESHOLD
|
||||
from server.knowledge_base.kb_service.base import (EmbeddingsFunAdapter,
|
||||
KBService, SupportedVSType)
|
||||
from server.knowledge_base.utils import KnowledgeFile, get_kb_path, get_vs_path
|
||||
|
||||
|
||||
def _get_result_to_documents(get_result: GetResult) -> List[Document]:
    """Convert a ChromaDB ``GetResult`` into a list of langchain ``Document``s.

    Returns an empty list when the result contains no documents; a missing
    metadata list is replaced by empty dicts, one per document.
    """
    texts = get_result['documents']
    if not texts:
        return []

    metadatas = get_result['metadatas'] or [{}] * len(texts)

    return [
        Document(page_content=text, metadata=meta)
        for text, meta in zip(texts, metadatas)
    ]
|
||||
|
||||
def _results_to_docs_and_scores(results: Any) -> List[Tuple[Document, float]]:
    """Pair each hit of a ChromaDB query with its distance score.

    Adapted from ``langchain_community.vectorstores.chroma``.
    """
    docs_and_scores = []
    # TODO: Chroma can do batch querying; we shouldn't hard code to the
    # 1st result only.
    hits = zip(
        results["documents"][0],
        results["metadatas"][0],
        results["distances"][0],
    )
    for text, metadata, distance in hits:
        doc = Document(page_content=text, metadata=metadata or {})
        docs_and_scores.append((doc, distance))
    return docs_and_scores
|
||||
|
||||
|
||||
class ChromaKBService(KBService):
    """KBService backed by a local, persistent ChromaDB collection.

    One knowledge base maps to one ChromaDB collection named after
    ``kb_name``; the collection is persisted under the vector-store path
    derived from the KB name and the embedding model.
    """

    vs_path: str  # on-disk directory backing the persistent Chroma client
    kb_path: str  # root directory of this knowledge base

    client = None      # chromadb.PersistentClient, assigned in do_init()
    collection = None  # chromadb collection for this KB, assigned in do_init()

    def vs_type(self) -> str:
        """Return the vector-store type identifier for this service."""
        return SupportedVSType.CHROMADB

    def get_vs_path(self) -> str:
        """Return the vector-store directory for this KB / embedding model."""
        return get_vs_path(self.kb_name, self.embed_model)

    def get_kb_path(self) -> str:
        """Return the root directory of this knowledge base."""
        return get_kb_path(self.kb_name)

    def do_init(self) -> None:
        """Open the persistent client and get-or-create the collection."""
        self.kb_path = self.get_kb_path()
        self.vs_path = self.get_vs_path()
        self.client = chromadb.PersistentClient(path=self.vs_path)
        self.collection = self.client.get_or_create_collection(self.kb_name)

    def do_create_kb(self) -> None:
        # In ChromaDB, creating a KB is equivalent to creating a collection.
        self.collection = self.client.get_or_create_collection(self.kb_name)

    def do_drop_kb(self):
        """Drop the backing collection; a missing collection is not an error."""
        # Dropping a KB is equivalent to deleting a collection in ChromaDB.
        try:
            self.client.delete_collection(self.kb_name)
        except ValueError as e:
            # chromadb signals a missing collection with this exact message;
            # treat it as "already dropped" and re-raise anything else.
            if not str(e) == f"Collection {self.kb_name} does not exist.":
                raise e

    def do_search(self, query: str, top_k: int, score_threshold: float = SCORE_THRESHOLD) -> List[Tuple[Document, float]]:
        """Embed ``query`` and return up to ``top_k`` (Document, distance) pairs.

        NOTE(review): ``score_threshold`` is accepted but never applied here —
        results are returned regardless of distance. Confirm whether callers
        expect threshold filtering as in the other KB services.
        """
        embed_func = EmbeddingsFunAdapter(self.embed_model)
        embeddings = embed_func.embed_query(query)
        query_result: QueryResult = self.collection.query(query_embeddings=embeddings, n_results=top_k)
        return _results_to_docs_and_scores(query_result)

    def do_add_doc(self, docs: List[Document], **kwargs) -> List[Dict]:
        """Embed ``docs`` and insert them into the collection.

        Returns one ``{"id": ..., "metadata": ...}`` dict per inserted doc.
        """
        data = self._docs_to_embeddings(docs)  # texts / embeddings / metadatas
        ids = [str(uuid.uuid1()) for _ in range(len(data["texts"]))]
        if ids:
            # Single batched add() instead of one call per document
            # (also removes the leftover debug print of the raw payload).
            self.collection.add(ids=ids,
                                embeddings=data["embeddings"],
                                metadatas=data["metadatas"],
                                documents=data["texts"])
        return [{"id": _id, "metadata": metadata}
                for _id, metadata in zip(ids, data["metadatas"])]

    def get_doc_by_ids(self, ids: List[str]) -> List[Document]:
        """Fetch documents by their ChromaDB ids."""
        get_result: GetResult = self.collection.get(ids=ids)
        return _get_result_to_documents(get_result)

    def del_doc_by_ids(self, ids: List[str]) -> bool:
        """Delete documents by id; always reports success."""
        self.collection.delete(ids=ids)
        return True

    def do_clear_vs(self):
        # Clearing the vector store is equivalent to dropping the collection;
        # it is recreated lazily by the next do_init()/do_create_kb().
        self.do_drop_kb()

    def do_delete_doc(self, kb_file: KnowledgeFile, **kwargs):
        """Delete every chunk whose ``source`` metadata matches ``kb_file``'s path."""
        return self.collection.delete(where={"source": kb_file.filepath})
|
||||
Loading…
x
Reference in New Issue
Block a user