From 042a70c09a727a4f334e3e9004be86f17de5c35b Mon Sep 17 00:00:00 2001
From: zR <2448370773@qq.com>
Date: Tue, 30 Jan 2024 14:18:14 +0800
Subject: [PATCH] Updated the chromadb print symbols
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 configs/kb_config.py.example                          |  2 +-
 requirements.txt                                      |  5 +++--
 requirements_api.txt                                  |  4 +++-
 requirements_lite.txt                                 |  1 +
 .../knowledge_base/kb_service/chromadb_kb_service.py  | 12 +++++-------
 .../knowledge_base/kb_service/milvus_kb_service.py    |  2 ++
 6 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/configs/kb_config.py.example b/configs/kb_config.py.example
index 5cd84e97..00a12991 100644
--- a/configs/kb_config.py.example
+++ b/configs/kb_config.py.example
@@ -3,7 +3,7 @@ import os
 # 默认使用的知识库
 DEFAULT_KNOWLEDGE_BASE = "samples"
 
-# 默认向量库/全文检索引擎类型。可选:faiss, milvus(离线) & zilliz(在线), pgvector, 全文检索引擎es, chromadb
+# 默认向量库/全文检索引擎类型。可选:faiss, milvus(离线) & zilliz(在线), pgvector, chromadb 全文检索引擎es
 DEFAULT_VS_TYPE = "faiss"
 
 # 缓存向量库数量(针对FAISS)
diff --git a/requirements.txt b/requirements.txt
index 57de4038..e3b036a6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -44,10 +44,11 @@ llama-index==0.9.35
 # dashscope==1.13.6 # qwen
 # volcengine==1.0.119 # fangzhou
 # uncomment libs if you want to use corresponding vector store
-# pymilvus==2.3.4
+# pymilvus==2.3.6
 # psycopg2==2.9.9
-# pgvector==0.2.4
+# pgvector>=0.2.4
 # chromadb==0.4.13
+
 #flash-attn==2.4.2 # For Orion-14B-Chat and Qwen-14B-Chat
 #autoawq==0.1.8 # For Int4
 #rapidocr_paddle[gpu]==1.3.11 # gpu accelleration for ocr of pdf and image files
diff --git a/requirements_api.txt b/requirements_api.txt
index d495ee25..2a785204 100644
--- a/requirements_api.txt
+++ b/requirements_api.txt
@@ -50,9 +50,11 @@ pyjwt==2.8.0
 # metaphor-python~=0.1.23
 # volcengine>=1.0.119
 
-# pymilvus>=2.3.4
+# pymilvus==2.3.6
 # psycopg2==2.9.9
 # pgvector>=0.2.4
+# chromadb==0.4.13
+
 #flash-attn==2.4.2 # For Orion-14B-Chat and Qwen-14B-Chat
 #autoawq==0.1.8 # For Int4
 #rapidocr_paddle[gpu]==1.3.11 # gpu accelleration for ocr of pdf and image files
\ No newline at end of file
diff --git a/requirements_lite.txt b/requirements_lite.txt
index 8cd427d4..87e48f48 100644
--- a/requirements_lite.txt
+++ b/requirements_lite.txt
@@ -33,6 +33,7 @@ watchdog~=3.0.0
 # psycopg2==2.9.9
 # pgvector>=0.2.4
 # chromadb==0.4.13
+
 # jq==1.6.0
 # beautifulsoup4~=4.12.2
 # pysrt~=1.1.2
diff --git a/server/knowledge_base/kb_service/chromadb_kb_service.py b/server/knowledge_base/kb_service/chromadb_kb_service.py
index 160c4c70..5e1d746c 100644
--- a/server/knowledge_base/kb_service/chromadb_kb_service.py
+++ b/server/knowledge_base/kb_service/chromadb_kb_service.py
@@ -15,7 +15,7 @@ def _get_result_to_documents(get_result: GetResult) -> List[Document]:
     if not get_result['documents']:
         return []
 
-    _metadatas = get_result['metadatas'] if get_result['metadatas'] else [{}] * len(get_result['documents'])
+    _metadatas = get_result['metadatas'] if get_result['metadatas'] else [{}] * len(get_result['documents'])
 
     document_list = []
     for page_content, metadata in zip(get_result['documents'], _metadatas):
@@ -23,13 +23,13 @@ def _get_result_to_documents(get_result: GetResult) -> List[Document]:
 
     return document_list
 
+
 def _results_to_docs_and_scores(results: Any) -> List[Tuple[Document, float]]:
     """
     from langchain_community.vectorstores.chroma import Chroma
     """
     return [
         # TODO: Chroma can do batch querying,
-        # we shouldn't hard code to the 1st result
         (Document(page_content=result[0], metadata=result[1] or {}), result[2])
         for result in zip(
             results["documents"][0],
@@ -40,14 +40,12 @@ def _results_to_docs_and_scores(results: Any) -> List[Tuple[Document, float]]:
 
 
 class ChromaKBService(KBService):
-
     vs_path: str
     kb_path: str
 
     client = None
     collection = None
 
-
     def vs_type(self) -> str:
         return SupportedVSType.CHROMADB
 
@@ -75,16 +73,16 @@ class ChromaKBService(KBService):
             if not str(e) == f"Collection {self.kb_name} does not exist.":
                 raise e
 
-    def do_search(self, query: str, top_k: int, score_threshold: float = SCORE_THRESHOLD) -> List[Tuple[Document, float]]:
+    def do_search(self, query: str, top_k: int, score_threshold: float = SCORE_THRESHOLD) -> List[
+        Tuple[Document, float]]:
         embed_func = EmbeddingsFunAdapter(self.embed_model)
         embeddings = embed_func.embed_query(query)
         query_result: QueryResult = self.collection.query(query_embeddings=embeddings, n_results=top_k)
-        return _results_to_docs_and_scores(query_result)
+        return _results_to_docs_and_scores(query_result)
 
     def do_add_doc(self, docs: List[Document], **kwargs) -> List[Dict]:
         doc_infos = []
         data = self._docs_to_embeddings(docs)
-        print(data)
         ids = [str(uuid.uuid1()) for _ in range(len(data["texts"]))]
         for _id, text, embedding, metadata in zip(ids, data["texts"], data["embeddings"], data["metadatas"]):
             self.collection.add(ids=_id, embeddings=embedding, metadatas=metadata, documents=text)
diff --git a/server/knowledge_base/kb_service/milvus_kb_service.py b/server/knowledge_base/kb_service/milvus_kb_service.py
index 43b616e2..ffe31ee0 100644
--- a/server/knowledge_base/kb_service/milvus_kb_service.py
+++ b/server/knowledge_base/kb_service/milvus_kb_service.py
@@ -2,6 +2,7 @@ from typing import List, Dict, Optional
 
 from langchain.schema import Document
 from langchain.vectorstores.milvus import Milvus
+import os
 
 from configs import kbs_config
 
@@ -85,6 +86,7 @@ class MilvusKBService(KBService):
     def do_delete_doc(self, kb_file: KnowledgeFile, **kwargs):
         if self.milvus.col:
             filepath = kb_file.filepath.replace('\\', '\\\\')
+            filename = os.path.basename(filepath)
             delete_list = [item.get("pk") for item in
                            self.milvus.col.query(expr=f'source == "{filepath}"', output_fields=["pk"])]
             self.milvus.col.delete(expr=f'pk in {delete_list}')