diff --git a/README.md b/README.md index be8898b2..78ec0691 100644 --- a/README.md +++ b/README.md @@ -52,10 +52,11 @@ OpenAI GPT API 的调用,并将在后续持续扩充对各类模型及模型 A 🚩 本项目未涉及微调、训练过程,但可利用微调或训练对本项目效果进行优化。 -🌐 [AutoDL 镜像](https://www.codewithgpu.com/i/chatchat-space/Langchain-Chatchat/Langchain-Chatchat) 中 `v0.2.10` +🌐 [AutoDL 镜像](https://www.codewithgpu.com/i/chatchat-space/Langchain-Chatchat/Langchain-Chatchat) 中 `v14` + 版本所使用代码已更新至本项目 `v0.2.10` 版本。 -🐳 [Docker 镜像](registry.cn-beijing.aliyuncs.com/chatchat/chatchat:0.2.6) 已经更新到 ```0.2.7``` 版本。 +🐳 [Docker 镜像](registry.cn-beijing.aliyuncs.com/chatchat/chatchat:0.2.7) 已经更新到 ```0.2.7``` 版本。 🌲 一行命令运行 Docker : @@ -197,4 +198,4 @@ $ python startup.py -a 二维码 -🎉 Langchain-Chatchat 项目官方公众号,欢迎扫码关注。 +🎉 Langchain-Chatchat 项目官方公众号,欢迎扫码关注。 \ No newline at end of file diff --git a/README_en.md b/README_en.md index fca63ea9..2a158fb4 100644 --- a/README_en.md +++ b/README_en.md @@ -61,7 +61,7 @@ The main process analysis from the aspect of document process: 🚩 The training or fine-tuning are not involved in the project, but still, one always can improve performance by do these. -🌐 [AutoDL image](https://www.codewithgpu.com/i/chatchat-space/Langchain-Chatchat/Langchain-Chatchat) is supported, and in v13 the codes are update to v0.2.9. +🌐 [AutoDL image](https://www.codewithgpu.com/i/chatchat-space/Langchain-Chatchat/Langchain-Chatchat) is supported, and in v14 the code is updated to v0.2.10.
🐳 [Docker image](registry.cn-beijing.aliyuncs.com/chatchat/chatchat:0.2.7) is supported to 0.2.7 @@ -191,7 +191,7 @@ please refer to the [Wiki](https://github.com/chatchat-space/Langchain-Chatchat/ ### WeChat Group -二维码 +二维码 ### WeChat Official Account diff --git a/README_ja.md b/README_ja.md index 1087b268..b2023c46 100644 --- a/README_ja.md +++ b/README_ja.md @@ -151,7 +151,7 @@ $ python startup.py -a ### WeChat グループ -二维码 +二维码 ### WeChat 公式アカウント diff --git a/configs/kb_config.py.example b/configs/kb_config.py.example index 23e06bdc..5cd84e97 100644 --- a/configs/kb_config.py.example +++ b/configs/kb_config.py.example @@ -3,7 +3,7 @@ import os # 默认使用的知识库 DEFAULT_KNOWLEDGE_BASE = "samples" -# 默认向量库/全文检索引擎类型。可选:faiss, milvus(离线) & zilliz(在线), pgvector,全文检索引擎es +# 默认向量库/全文检索引擎类型。可选:faiss, milvus(离线) & zilliz(在线), pgvector, 全文检索引擎es, chromadb DEFAULT_VS_TYPE = "faiss" # 缓存向量库数量(针对FAISS) @@ -110,7 +110,8 @@ kbs_config = { "milvus_kwargs":{ "search_params":{"metric_type": "L2"}, #在此处增加search_params "index_params":{"metric_type": "L2","index_type": "HNSW"} # 在此处增加index_params - } + }, + "chromadb": {} } # TextSplitter配置项,如果你不明白其中的含义,就不要修改。 diff --git a/requirements.txt b/requirements.txt index adf32a76..7912ea1d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -47,6 +47,7 @@ llama-index==0.9.35 # pymilvus==2.3.4 # psycopg2==2.9.9 # pgvector==0.2.4 +# chromadb==0.4.13 #flash-attn==2.4.2 # For Orion-14B-Chat and Qwen-14B-Chat #autoawq==0.1.8 # For Int4 #rapidocr_paddle[gpu]==1.3.11 # gpu accelleration for ocr of pdf and image files @@ -63,4 +64,4 @@ streamlit-modal==0.1.0 streamlit-aggrid==0.3.4.post3 httpx==0.26.0 watchdog==3.0.0 -jwt==1.3.1 \ No newline at end of file +pyjwt==2.8.0 diff --git a/requirements_api.txt b/requirements_api.txt index 4d2ef018..e23cd109 100644 --- a/requirements_api.txt +++ b/requirements_api.txt @@ -38,6 +38,7 @@ transformers_stream_generator==0.0.4 vllm==0.2.7; sys_platform == "linux" httpx==0.26.0 llama-index==0.9.35 +pyjwt==2.8.0 # 
jq==1.6.0 # beautifulsoup4~=4.12.2 diff --git a/requirements_lite.txt b/requirements_lite.txt index 6019cefb..aae3a284 100644 --- a/requirements_lite.txt +++ b/requirements_lite.txt @@ -20,6 +20,7 @@ requests~=2.31.0 pathlib~=1.0.1 pytest~=7.4.3 llama-index==0.9.35 +pyjwt==2.8.0 dashscope==1.13.6 arxiv~=2.1.0 @@ -30,4 +31,5 @@ watchdog~=3.0.0 # volcengine>=1.0.119 # pymilvus>=2.3.4 # psycopg2==2.9.9 -# pgvector>=0.2.4 \ No newline at end of file +# pgvector>=0.2.4 +# chromadb==0.4.13 diff --git a/server/knowledge_base/kb_service/base.py b/server/knowledge_base/kb_service/base.py index bd5a54eb..89e30620 100644 --- a/server/knowledge_base/kb_service/base.py +++ b/server/knowledge_base/kb_service/base.py @@ -47,6 +47,7 @@ class SupportedVSType: ZILLIZ = 'zilliz' PG = 'pg' ES = 'es' + CHROMADB = 'chromadb' class KBService(ABC): @@ -319,6 +320,9 @@ class KBServiceFactory: elif SupportedVSType.ES == vector_store_type: from server.knowledge_base.kb_service.es_kb_service import ESKBService return ESKBService(kb_name, embed_model=embed_model) + elif SupportedVSType.CHROMADB == vector_store_type: + from server.knowledge_base.kb_service.chromadb_kb_service import ChromaKBService + return ChromaKBService(kb_name, embed_model=embed_model) elif SupportedVSType.DEFAULT == vector_store_type: # kb_exists of default kbservice is False, to make validation easier. 
from server.knowledge_base.kb_service.default_kb_service import DefaultKBService return DefaultKBService(kb_name)
diff --git a/server/knowledge_base/kb_service/chromadb_kb_service.py b/server/knowledge_base/kb_service/chromadb_kb_service.py
new file mode 100644
index 00000000..160c4c70
--- /dev/null
+++ b/server/knowledge_base/kb_service/chromadb_kb_service.py
@@ -0,0 +1,130 @@
+import uuid
+from typing import Any, Dict, List, Tuple
+
+import chromadb
+from chromadb.api.types import (GetResult, QueryResult)
+from langchain.docstore.document import Document
+
+from configs import SCORE_THRESHOLD
+from server.knowledge_base.kb_service.base import (EmbeddingsFunAdapter,
+                                                   KBService, SupportedVSType)
+from server.knowledge_base.utils import KnowledgeFile, get_kb_path, get_vs_path
+
+
+def _get_result_to_documents(get_result: GetResult) -> List[Document]:
+    """Convert the result of a Chroma ``collection.get`` call into Documents.
+
+    Returns an empty list when the result holds no documents.
+    """
+    documents = get_result['documents']
+    if not documents:
+        return []
+
+    # Entries may be missing; ``metadata or {}`` gives every Document its own
+    # dict instead of one shared (and accidentally mutable) default.
+    metadatas = get_result['metadatas'] or [None] * len(documents)
+    return [
+        Document(page_content=page_content, metadata=metadata or {})
+        for page_content, metadata in zip(documents, metadatas)
+    ]
+
+
+def _results_to_docs_and_scores(results: Any) -> List[Tuple[Document, float]]:
+    """Convert a Chroma query result into ``(Document, distance)`` pairs.
+
+    See ``langchain_community.vectorstores.chroma.Chroma``.
+    """
+    # TODO: Chroma can do batch querying,
+    # we shouldn't hard code to the 1st result
+    return [
+        (Document(page_content=text, metadata=metadata or {}), distance)
+        for text, metadata, distance in zip(
+            results["documents"][0],
+            results["metadatas"][0],
+            results["distances"][0],
+        )
+    ]
+
+
+class ChromaKBService(KBService):
+    """Knowledge base service backed by a persistent ChromaDB collection."""
+
+    vs_path: str
+    kb_path: str
+
+    client = None
+    collection = None
+
+    def vs_type(self) -> str:
+        """Return the vector store type identifier for ChromaDB."""
+        return SupportedVSType.CHROMADB
+
+    def get_vs_path(self) -> str:
+        """Return the vector store path for this knowledge base and embed model."""
+        return get_vs_path(self.kb_name, self.embed_model)
+
+    def get_kb_path(self) -> str:
+        """Return the path of this knowledge base."""
+        return get_kb_path(self.kb_name)
+
+    def do_init(self) -> None:
+        """Resolve the storage paths and open (or create) the collection."""
+        self.kb_path = self.get_kb_path()
+        self.vs_path = self.get_vs_path()
+        self.client = chromadb.PersistentClient(path=self.vs_path)
+        self.collection = self.client.get_or_create_collection(self.kb_name)
+
+    def do_create_kb(self) -> None:
+        """Create the knowledge base, i.e. its ChromaDB collection."""
+        # In ChromaDB, creating a KB is equivalent to creating a collection
+        self.collection = self.client.get_or_create_collection(self.kb_name)
+
+    def do_drop_kb(self):
+        """Delete the collection; a collection that is already gone is ignored."""
+        # Dropping a KB is equivalent to deleting a collection in ChromaDB
+        try:
+            self.client.delete_collection(self.kb_name)
+        except ValueError as e:
+            # Only the "does not exist" error is safe to swallow.
+            if str(e) != f"Collection {self.kb_name} does not exist.":
+                raise
+
+    def do_search(self, query: str, top_k: int, score_threshold: float = SCORE_THRESHOLD) -> List[Tuple[Document, float]]:
+        """Return up to ``top_k`` documents nearest to ``query`` with their distances."""
+        # NOTE(review): score_threshold is accepted but never applied to the
+        # returned distances - confirm whether results should be filtered.
+        embed_func = EmbeddingsFunAdapter(self.embed_model)
+        embeddings = embed_func.embed_query(query)
+        query_result: QueryResult = self.collection.query(query_embeddings=embeddings, n_results=top_k)
+        return _results_to_docs_and_scores(query_result)
+
+    def do_add_doc(self, docs: List[Document], **kwargs) -> List[Dict]:
+        """Embed ``docs``, store them one by one, and return their ids and metadata."""
+        doc_infos = []
+        data = self._docs_to_embeddings(docs)
+        for text, embedding, metadata in zip(data["texts"], data["embeddings"], data["metadatas"]):
+            _id = str(uuid.uuid1())
+            self.collection.add(ids=_id, embeddings=embedding, metadatas=metadata, documents=text)
+            doc_infos.append({"id": _id, "metadata": metadata})
+        return doc_infos
+
+    def get_doc_by_ids(self, ids: List[str]) -> List[Document]:
+        """Fetch the stored documents with the given ids."""
+        get_result: GetResult = self.collection.get(ids=ids)
+        return _get_result_to_documents(get_result)
+
+    def del_doc_by_ids(self, ids: List[str]) -> bool:
+        """Delete the documents with the given ids; always returns True."""
+        self.collection.delete(ids=ids)
+        return True
+
+    def do_clear_vs(self):
+        """Empty the vector store by dropping and recreating the collection."""
+        self.do_drop_kb()
+        # Recreate the collection so that self.collection does not keep
+        # referring to the one that was just deleted.
+        self.do_create_kb()
+
+    def do_delete_doc(self, kb_file: KnowledgeFile, **kwargs):
+        """Delete every entry whose ``source`` metadata equals the file's path."""
+        return self.collection.delete(where={"source": kb_file.filepath})