From 1987063a76a14ef5cfa745baaa51cb8f64c504ec Mon Sep 17 00:00:00 2001 From: imClumsyPanda Date: Mon, 10 Jun 2024 16:33:13 +0800 Subject: [PATCH] Fix(File RAG): use jieba instead of cutword --- .../chatchat/server/file_rag/retrievers/ensemble.py | 7 ++++--- libs/chatchat-server/pyproject.toml | 3 ++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/libs/chatchat-server/chatchat/server/file_rag/retrievers/ensemble.py b/libs/chatchat-server/chatchat/server/file_rag/retrievers/ensemble.py index cb09b633..5d6b17a6 100644 --- a/libs/chatchat-server/chatchat/server/file_rag/retrievers/ensemble.py +++ b/libs/chatchat-server/chatchat/server/file_rag/retrievers/ensemble.py @@ -30,12 +30,13 @@ class EnsembleRetrieverService(BaseRetrieverService): } ) # TODO: 换个不用torch的实现方式 - from cutword.cutword import Cutter - cutter = Cutter() + # from cutword.cutword import Cutter + import jieba + # cutter = Cutter() docs = list(vectorstore.docstore._dict.values()) bm25_retriever = BM25Retriever.from_documents( docs, - preprocess_func=cutter.cutword + preprocess_func=jieba.lcut_for_search, ) bm25_retriever.k = top_k ensemble_retriever = EnsembleRetriever( diff --git a/libs/chatchat-server/pyproject.toml b/libs/chatchat-server/pyproject.toml index 136b8b1b..85646274 100644 --- a/libs/chatchat-server/pyproject.toml +++ b/libs/chatchat-server/pyproject.toml @@ -29,7 +29,8 @@ unstructured = "~0.11.0" python-magic-bin = {version = "*", platform = "win32"} SQLAlchemy = "~2.0.25" faiss-cpu = "~1.7.4" -cutword = "0.1.0" +#cutword = "0.1.0" +jieba = "0.42.1" rank_bm25 = "0.2.2" # accelerate = "~0.24.1" # spacy = "~3.7.2"