From 2f2221ca478f120ad8cb60f56ab809ce44fca378 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E4=BD=A0=E7=9A=84=E4=BB=A3=E7=A0=81TT?= <115431886+showmecodett@users.noreply.github.com>
Date: Mon, 15 Apr 2024 21:37:32 +0800
Subject: [PATCH] Add document normalization in Chroma. (#3640)

---
 server/knowledge_base/kb_service/chromadb_kb_service.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/server/knowledge_base/kb_service/chromadb_kb_service.py b/server/knowledge_base/kb_service/chromadb_kb_service.py
index 5e1d746c..02e9c60e 100644
--- a/server/knowledge_base/kb_service/chromadb_kb_service.py
+++ b/server/knowledge_base/kb_service/chromadb_kb_service.py
@@ -82,9 +82,12 @@ class ChromaKBService(KBService):
 
     def do_add_doc(self, docs: List[Document], **kwargs) -> List[Dict]:
         doc_infos = []
-        data = self._docs_to_embeddings(docs)
-        ids = [str(uuid.uuid1()) for _ in range(len(data["texts"]))]
-        for _id, text, embedding, metadata in zip(ids, data["texts"], data["embeddings"], data["metadatas"]):
+        embed_func = EmbeddingsFunAdapter(self.embed_model)
+        texts = [doc.page_content for doc in docs]
+        metadatas = [doc.metadata for doc in docs]
+        embeddings = embed_func.embed_documents(texts=texts)
+        ids = [str(uuid.uuid1()) for _ in range(len(texts))]
+        for _id, text, embedding, metadata in zip(ids, texts, embeddings, metadatas):
             self.collection.add(ids=_id, embeddings=embedding, metadatas=metadata, documents=text)
             doc_infos.append({"id": _id, "metadata": metadata})
         return doc_infos