diff --git a/README.md b/README.md index e69818c3..2e97dc65 100644 --- a/README.md +++ b/README.md @@ -193,7 +193,7 @@ $ python startup.py -a [![Telegram](https://img.shields.io/badge/Telegram-2CA5E0?style=for-the-badge&logo=telegram&logoColor=white "langchain-chatglm")](https://t.me/+RjliQ3jnJ1YyN2E9) ### 项目交流群 -二维码 +二维码 🎉 Langchain-Chatchat 项目微信交流群,如果你也对本项目感兴趣,欢迎加入群聊参与讨论交流。 diff --git a/img/qr_code_90.jpg b/img/qr_code_90.jpg new file mode 100644 index 00000000..174d150f Binary files /dev/null and b/img/qr_code_90.jpg differ diff --git a/img/qr_code_91.jpg b/img/qr_code_91.jpg new file mode 100644 index 00000000..54107922 Binary files /dev/null and b/img/qr_code_91.jpg differ diff --git a/img/qr_code_92.jpg b/img/qr_code_92.jpg new file mode 100644 index 00000000..4010c44a Binary files /dev/null and b/img/qr_code_92.jpg differ diff --git a/img/qr_code_93.jpg b/img/qr_code_93.jpg new file mode 100644 index 00000000..d83ac08d Binary files /dev/null and b/img/qr_code_93.jpg differ diff --git a/img/qr_code_94.jpg b/img/qr_code_94.jpg new file mode 100644 index 00000000..00e14b81 Binary files /dev/null and b/img/qr_code_94.jpg differ diff --git a/img/qr_code_95.jpg b/img/qr_code_95.jpg new file mode 100644 index 00000000..11742e96 Binary files /dev/null and b/img/qr_code_95.jpg differ diff --git a/img/qr_code_96.jpg b/img/qr_code_96.jpg new file mode 100644 index 00000000..cc0777df Binary files /dev/null and b/img/qr_code_96.jpg differ diff --git a/img/qrcode_90_2.jpg b/img/qrcode_90_2.jpg new file mode 100644 index 00000000..1666a074 Binary files /dev/null and b/img/qrcode_90_2.jpg differ diff --git a/server/knowledge_base/kb_doc_api.py b/server/knowledge_base/kb_doc_api.py index b5246da0..8b200254 100644 --- a/server/knowledge_base/kb_doc_api.py +++ b/server/knowledge_base/kb_doc_api.py @@ -38,6 +38,9 @@ def search_docs( data = [DocumentWithVSId(**x[0].dict(), score=x[1], id=x[0].metadata.get("id")) for x in docs] elif file_name or metadata: data = kb.list_docs(file_name=file_name, metadata=metadata) + for d in data: + if "vector" in d.metadata: + del d.metadata["vector"] return data diff --git a/server/knowledge_base/utils.py b/server/knowledge_base/utils.py index fc7edac3..0d1e450f 100644 --- a/server/knowledge_base/utils.py +++ b/server/knowledge_base/utils.py @@ -16,7 +16,7 @@ import langchain_community.document_loaders from langchain.docstore.document import Document from langchain.text_splitter import TextSplitter from pathlib import Path -from server.utils import run_in_thread_pool +from server.utils import run_in_thread_pool, run_in_process_pool import json from typing import List, Union, Dict, Tuple, Generator import chardet @@ -353,6 +353,16 @@ class KnowledgeFile: return os.path.getsize(self.filepath) +def files2docs_in_thread_file2docs(*, file: KnowledgeFile, **kwargs) -> Tuple[bool, Tuple[str, str, List[Document]]]: + try: + return True, (file.kb_name, file.filename, file.file2text(**kwargs)) + except Exception as e: + msg = f"从文件 {file.kb_name}/{file.filename} 加载文档时出错:{e}" + logger.error(f'{e.__class__.__name__}: {msg}', + exc_info=e if log_verbose else None) + return False, (file.kb_name, file.filename, msg) + + def files2docs_in_thread( files: List[Union[KnowledgeFile, Tuple[str, str], Dict]], chunk_size: int = CHUNK_SIZE, @@ -365,14 +375,6 @@ def files2docs_in_thread( 生成器返回值为 status, (kb_name, file_name, docs | error) ''' - def file2docs(*, file: KnowledgeFile, **kwargs) -> Tuple[bool, Tuple[str, str, List[Document]]]: - try: - return True, (file.kb_name, file.filename, file.file2text(**kwargs)) - except Exception as e: - msg = f"从文件 {file.kb_name}/{file.filename} 加载文档时出错:{e}" - logger.error(f'{e.__class__.__name__}: {msg}', - exc_info=e if log_verbose else None) - return False, (file.kb_name, file.filename, msg) kwargs_list = [] for i, file in enumerate(files): @@ -395,7 +397,7 @@ def files2docs_in_thread( except Exception as e: yield False, (kb_name, filename, str(e)) - for result in run_in_thread_pool(func=file2docs, params=kwargs_list): + for result in run_in_process_pool(func=files2docs_in_thread_file2docs, params=kwargs_list): yield result diff --git a/server/utils.py b/server/utils.py index c3541504..969a7c6b 100644 --- a/server/utils.py +++ b/server/utils.py @@ -2,8 +2,9 @@ from fastapi import FastAPI from pathlib import Path import asyncio import os -from concurrent.futures import ThreadPoolExecutor, as_completed -from langchain.embeddings.base import Embeddings +import sys +import multiprocessing as mp +from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed from langchain_openai.chat_models import ChatOpenAI from langchain_openai.llms import OpenAI import httpx @@ -572,11 +573,36 @@ def run_in_thread_pool( tasks = [] with ThreadPoolExecutor() as pool: for kwargs in params: - thread = pool.submit(func, **kwargs) - tasks.append(thread) + tasks.append(pool.submit(func, **kwargs)) for obj in as_completed(tasks): - yield obj.result() + try: + yield obj.result() + except Exception as e: + logger.error(f"error in sub thread: {e}", exc_info=True) + + +def run_in_process_pool( + func: Callable, + params: List[Dict] = [], +) -> Generator: + ''' + 在线程池中批量运行任务,并将运行结果以生成器的形式返回。 + 请确保任务中的所有操作是线程安全的,任务函数请全部使用关键字参数。 + ''' + tasks = [] + max_workers = None + if sys.platform.startswith("win"): + max_workers = min(mp.cpu_count(), 60) # max_workers should not exceed 60 on windows + with ProcessPoolExecutor(max_workers=max_workers) as pool: + for kwargs in params: + tasks.append(pool.submit(func, **kwargs)) + + for obj in as_completed(tasks): + try: + yield obj.result() + except Exception as e: + logger.error(f"error in sub process: {e}", exc_info=True) def get_httpx_client(