From 3ff130ecc2f7fb7a1b4e4cd8e963fbe56c1827c7 Mon Sep 17 00:00:00 2001 From: peterz3g Date: Thu, 21 Sep 2023 14:41:49 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8Dcsv=E6=96=87=E4=BB=B6?= =?UTF-8?q?=E8=A7=A3=E6=9E=90=E7=BC=96=E7=A0=81=E6=8A=A5=E9=94=99=E7=9A=84?= =?UTF-8?q?=E9=97=AE=E9=A2=98=E3=80=82=20(#1508)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- server/knowledge_base/utils.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/server/knowledge_base/utils.py b/server/knowledge_base/utils.py index e34c3262..8eeb5049 100644 --- a/server/knowledge_base/utils.py +++ b/server/knowledge_base/utils.py @@ -26,6 +26,7 @@ from concurrent.futures import ThreadPoolExecutor from server.utils import run_in_thread_pool, embedding_device import io from typing import List, Union, Callable, Dict, Optional, Tuple, Generator +import chardet def validate_kb_name(knowledge_base_id: str) -> bool: @@ -167,7 +168,14 @@ def get_loader(loader_name: str, file_path_or_content: Union[str, bytes, io.Stri if loader_name == "UnstructuredFileLoader": loader = DocumentLoader(file_path_or_content, autodetect_encoding=True) elif loader_name == "CSVLoader": - loader = DocumentLoader(file_path_or_content, encoding="utf-8") + # 自动识别文件编码类型,避免langchain loader 加载文件报编码错误 + with open(file_path_or_content, 'rb') as struct_file: + encode_detect = chardet.detect(struct_file.read()) + if encode_detect: + loader = DocumentLoader(file_path_or_content, encoding=encode_detect["encoding"]) + else: + loader = DocumentLoader(file_path_or_content, encoding="utf-8") + elif loader_name == "JSONLoader": loader = DocumentLoader(file_path_or_content, jq_schema=".", text_content=False) elif loader_name == "CustomJSONLoader":