From f95d41ef471707ddc0f0a2430c1cbc9faf2dfdcc Mon Sep 17 00:00:00 2001 From: imClumsyPanda Date: Sat, 26 Aug 2023 11:45:01 +0800 Subject: [PATCH] =?UTF-8?q?[BUG]=20=E4=BF=AE=E5=A4=8Dcsv=E6=96=87=E4=BB=B6?= =?UTF-8?q?=E8=AF=BB=E5=8F=96=E5=90=8E=EF=BC=8C=E5=8D=95=E8=A1=8C=E6=95=B0?= =?UTF-8?q?=E6=8D=AE=E8=A2=AB=E5=88=86=E6=88=90=E5=A4=9A=E6=AE=B5=E3=80=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- server/knowledge_base/utils.py | 43 ++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/server/knowledge_base/utils.py b/server/knowledge_base/utils.py index da530495..34f20832 100644 --- a/server/knowledge_base/utils.py +++ b/server/knowledge_base/utils.py @@ -104,32 +104,35 @@ class KnowledgeFile: else: loader = DocumentLoader(self.filepath) - try: - if self.text_splitter_name is None: + if self.ext in ".csv": + docs = loader.load() + else: + try: + if self.text_splitter_name is None: + text_splitter_module = importlib.import_module('langchain.text_splitter') + TextSplitter = getattr(text_splitter_module, "SpacyTextSplitter") + text_splitter = TextSplitter( + pipeline="zh_core_web_sm", + chunk_size=CHUNK_SIZE, + chunk_overlap=OVERLAP_SIZE, + ) + self.text_splitter_name = "SpacyTextSplitter" + else: + text_splitter_module = importlib.import_module('langchain.text_splitter') + TextSplitter = getattr(text_splitter_module, self.text_splitter_name) + text_splitter = TextSplitter( + chunk_size=CHUNK_SIZE, + chunk_overlap=OVERLAP_SIZE) + except Exception as e: + print(e) text_splitter_module = importlib.import_module('langchain.text_splitter') - TextSplitter = getattr(text_splitter_module, "SpacyTextSplitter") + TextSplitter = getattr(text_splitter_module, "RecursiveCharacterTextSplitter") text_splitter = TextSplitter( - pipeline="zh_core_web_sm", chunk_size=CHUNK_SIZE, chunk_overlap=OVERLAP_SIZE, ) - self.text_splitter_name = "SpacyTextSplitter" - else: - text_splitter_module = importlib.import_module('langchain.text_splitter') - TextSplitter = getattr(text_splitter_module, self.text_splitter_name) - text_splitter = TextSplitter( - chunk_size=CHUNK_SIZE, - chunk_overlap=OVERLAP_SIZE) - except Exception as e: - print(e) - text_splitter_module = importlib.import_module('langchain.text_splitter') - TextSplitter = getattr(text_splitter_module, "RecursiveCharacterTextSplitter") - text_splitter = TextSplitter( - chunk_size=CHUNK_SIZE, - chunk_overlap=OVERLAP_SIZE, - ) - docs = loader.load_and_split(text_splitter) + docs = loader.load_and_split(text_splitter) print(docs[0]) if using_zh_title_enhance: docs = zh_title_enhance(docs)