mirror of
https://github.com/RYDE-WORK/Langchain-Chatchat.git
synced 2026-01-19 13:23:16 +08:00
新功能:
- 支持在线 Embeddings:zhipu-api, qwen-api, minimax-api, qianfan-api
- API 增加 /other/embed_texts 接口
- init_database.py 增加 --embed-model 参数,可以指定使用的嵌入模型(本地或在线均可)
- 对于 FAISS 知识库,支持多向量库,默认位置:{KB_PATH}/vector_store/{embed_model}
- Lite 模式支持所有知识库相关功能。此模式下最主要的限制是:
- 不能使用本地 LLM 和 Embeddings 模型
- 知识库不支持 PDF 文件
- init_database.py 重建知识库时不再默认清空数据库表,增加 --clear-tables 参数手动控制。
- API 和 WEBUI 中 score_threshold 参数范围改为 [0, 2],以更好的适应在线嵌入模型
问题修复:
- API 中 list_config_models 会删除 ONLINE_LLM_MODEL 中的敏感信息,导致第二轮API请求错误
开发者:
- 统一向量库的识别:以(kb_name,embed_model)为判断向量库唯一性的依据,避免 FAISS 知识库缓存加载逻辑错误
- KBServiceFactory.get_service_by_name 中添加 default_embed_model 参数,用于在构建新知识库时设置 embed_model
- 优化 kb_service 中 Embeddings 操作:
- 统一加载接口: server.utils.load_embeddings,利用全局缓存避免各处 Embeddings 传参
- 统一文本嵌入接口:server.knowledge_base.kb_service.base.[embed_texts, embed_documents]
- 重写 normalize 函数,去除对 scikit-learn/scipy 的依赖
111 lines
3.5 KiB
Python
111 lines
3.5 KiB
Python
import sys
|
|
sys.path.append(".")
|
|
from server.knowledge_base.migrate import create_tables, reset_tables, folder2db, prune_db_docs, prune_folder_files
|
|
from configs.model_config import NLTK_DATA_PATH, EMBEDDING_MODEL
|
|
import nltk
|
|
nltk.data.path = [NLTK_DATA_PATH] + nltk.data.path
|
|
from datetime import datetime
|
|
import sys
|
|
|
|
|
|
def build_arg_parser(default_embed_model=None):
    """Build the command-line parser for the knowledge base maintenance script.

    Args:
        default_embed_model: Value used as the default of ``--embed-model``.
            When ``None`` the module-level ``EMBEDDING_MODEL`` is used.

    Returns:
        A configured ``argparse.ArgumentParser``.
    """
    import argparse

    if default_embed_model is None:
        # Resolved lazily so the parser can be built with an explicit model
        # without touching the project configuration.
        default_embed_model = EMBEDDING_MODEL

    parser = argparse.ArgumentParser(
        description="please specify only one operate method once time."
    )

    # The default help formatter collapses whitespace, so the multi-line help
    # texts are written as plain concatenated strings.
    parser.add_argument(
        "-r",
        "--recreate-vs",
        action="store_true",
        help=(
            "recreate vector store. "
            "use this option if you have copied document files to the content folder, "
            "but vector store has not been populated or "
            "DEFAULT_VS_TYPE/EMBEDDING_MODEL changed."
        ),
    )
    parser.add_argument(
        "--clear-tables",
        action="store_true",
        help="drop the database tables before recreate vector stores",
    )
    parser.add_argument(
        "-u",
        "--update-in-db",
        action="store_true",
        help=(
            "update vector store for files exist in database. "
            "use this option if you want to recreate vectors for files exist in db "
            "and skip files exist in local folder only."
        ),
    )
    # NOTE: the option spelling "--increament" is part of the CLI interface
    # and is kept as is; only the explanatory text is corrected.
    parser.add_argument(
        "-i",
        "--increament",
        action="store_true",
        help=(
            "update vector store for files exist in local folder and not exist in database. "
            "use this option if you want to create vectors incrementally."
        ),
    )
    parser.add_argument(
        "--prune-db",
        action="store_true",
        help=(
            "delete docs in database that not existed in local folder. "
            "it is used to delete database docs after user deleted some doc files "
            "in file browser"
        ),
    )
    parser.add_argument(
        "--prune-folder",
        action="store_true",
        help=(
            "delete doc files in local folder that not existed in database. "
            "it is used to free local disk space by delete unused doc files."
        ),
    )
    parser.add_argument(
        "-n",
        "--kb-name",
        type=str,
        nargs="+",
        default=[],
        help="specify knowledge base names to operate on. default is all folders exist in KB_ROOT_PATH.",
    )
    parser.add_argument(
        "-e",
        "--embed-model",
        type=str,
        default=default_embed_model,
        help="specify embeddings model.",
    )
    return parser


def main(argv=None):
    """Run the maintenance operation selected on the command line.

    Args:
        argv: Argument list without the program name. Defaults to
            ``sys.argv[1:]``.

    With no arguments the help text is printed and nothing else happens.
    Otherwise the tables are created, optionally reset when
    ``--clear-tables`` is given, and then at most one operation is run,
    chosen in this order of precedence: ``--recreate-vs``,
    ``--update-in-db``, ``--increament``, ``--prune-db``, ``--prune-folder``.
    The elapsed time is printed at the end.
    """
    if argv is None:
        argv = sys.argv[1:]

    parser = build_arg_parser()
    if not argv:
        parser.print_help()
        return

    args = parser.parse_args(argv)
    start_time = datetime.now()

    create_tables()  # confirm tables exist

    if args.clear_tables:
        reset_tables()
        print("database tables reset")

    # Only the first matching operation is executed.
    if args.recreate_vs:
        print("recreating all vector stores")
        folder2db(kb_names=args.kb_name, mode="recreate_vs", embed_model=args.embed_model)
    elif args.update_in_db:
        folder2db(kb_names=args.kb_name, mode="update_in_db", embed_model=args.embed_model)
    elif args.increament:
        folder2db(kb_names=args.kb_name, mode="increament", embed_model=args.embed_model)
    elif args.prune_db:
        prune_db_docs(args.kb_name)
    elif args.prune_folder:
        prune_folder_files(args.kb_name)

    end_time = datetime.now()
    print(f"总计用时: {end_time-start_time}")


if __name__ == "__main__":
    main()