Fix(File RAG): use jieba instead of cutword

This commit is contained in:
imClumsyPanda 2024-06-10 16:33:13 +08:00
parent d71c9b0a27
commit 1987063a76
2 changed files with 6 additions and 4 deletions

View File

@@ -30,12 +30,13 @@ class EnsembleRetrieverService(BaseRetrieverService):
} }
) )
# TODO: 换个不用torch的实现方式 # TODO: 换个不用torch的实现方式
from cutword.cutword import Cutter # from cutword.cutword import Cutter
cutter = Cutter() import jieba
# cutter = Cutter()
docs = list(vectorstore.docstore._dict.values()) docs = list(vectorstore.docstore._dict.values())
bm25_retriever = BM25Retriever.from_documents( bm25_retriever = BM25Retriever.from_documents(
docs, docs,
preprocess_func=cutter.cutword preprocess_func=jieba.lcut_for_search,
) )
bm25_retriever.k = top_k bm25_retriever.k = top_k
ensemble_retriever = EnsembleRetriever( ensemble_retriever = EnsembleRetriever(

View File

@@ -29,7 +29,8 @@ unstructured = "~0.11.0"
python-magic-bin = {version = "*", platform = "win32"} python-magic-bin = {version = "*", platform = "win32"}
SQLAlchemy = "~2.0.25" SQLAlchemy = "~2.0.25"
faiss-cpu = "~1.7.4" faiss-cpu = "~1.7.4"
cutword = "0.1.0" #cutword = "0.1.0"
jieba = "0.42.1"
rank_bm25 = "0.2.2" rank_bm25 = "0.2.2"
# accelerate = "~0.24.1" # accelerate = "~0.24.1"
# spacy = "~3.7.2" # spacy = "~3.7.2"