Fix(File RAG): use jieba instead of cutword

This commit is contained in:
imClumsyPanda 2024-06-10 16:33:13 +08:00
parent d71c9b0a27
commit 1987063a76
2 changed files with 6 additions and 4 deletions

View File

@@ -30,12 +30,13 @@ class EnsembleRetrieverService(BaseRetrieverService):
}
)
# TODO: 换个不用torch的实现方式
from cutword.cutword import Cutter
cutter = Cutter()
# from cutword.cutword import Cutter
import jieba
# cutter = Cutter()
docs = list(vectorstore.docstore._dict.values())
bm25_retriever = BM25Retriever.from_documents(
docs,
preprocess_func=cutter.cutword
preprocess_func=jieba.lcut_for_search,
)
bm25_retriever.k = top_k
ensemble_retriever = EnsembleRetriever(

View File

@@ -29,7 +29,8 @@ unstructured = "~0.11.0"
python-magic-bin = {version = "*", platform = "win32"}
SQLAlchemy = "~2.0.25"
faiss-cpu = "~1.7.4"
cutword = "0.1.0"
#cutword = "0.1.0"
jieba = "0.42.1"
rank_bm25 = "0.2.2"
# accelerate = "~0.24.1"
# spacy = "~3.7.2"