mirror of
https://github.com/RYDE-WORK/Langchain-Chatchat.git
synced 2026-02-09 00:25:46 +08:00
support for bge-large-zh-noinstruct and openai text-embedding-ada-002 (#1119)
* support for bge-large-zh-noinstruct and openai text-embedding-ada-002 * 完善了readme,修改了BGE模型的载入方式 * 补充了readme中bge-large-zh-noinstruct的信息 * Update faiss_kb_service.py * Update utils.py --------- Co-authored-by: zR <zRzRzRzRzRzRzR> Co-authored-by: imClumsyPanda <littlepanda0716@gmail.com>
This commit is contained in:
parent
4bcc74d214
commit
7291e77978
@ -126,6 +126,7 @@ docker run -d --gpus all -p 80:8501 registry.cn-beijing.aliyuncs.com/chatchat/ch
|
|||||||
- [BAAI/bge-small-zh](https://huggingface.co/BAAI/bge-small-zh)
|
- [BAAI/bge-small-zh](https://huggingface.co/BAAI/bge-small-zh)
|
||||||
- [BAAI/bge-base-zh](https://huggingface.co/BAAI/bge-base-zh)
|
- [BAAI/bge-base-zh](https://huggingface.co/BAAI/bge-base-zh)
|
||||||
- [BAAI/bge-large-zh](https://huggingface.co/BAAI/bge-large-zh)
|
- [BAAI/bge-large-zh](https://huggingface.co/BAAI/bge-large-zh)
|
||||||
|
- [BAAI/bge-large-zh-noinstruct](https://huggingface.co/BAAI/bge-large-zh-noinstruct)
|
||||||
- [text2vec-base-chinese-sentence](https://huggingface.co/shibing624/text2vec-base-chinese-sentence)
|
- [text2vec-base-chinese-sentence](https://huggingface.co/shibing624/text2vec-base-chinese-sentence)
|
||||||
- [text2vec-base-chinese-paraphrase](https://huggingface.co/shibing624/text2vec-base-chinese-paraphrase)
|
- [text2vec-base-chinese-paraphrase](https://huggingface.co/shibing624/text2vec-base-chinese-paraphrase)
|
||||||
- [text2vec-base-multilingual](https://huggingface.co/shibing624/text2vec-base-multilingual)
|
- [text2vec-base-multilingual](https://huggingface.co/shibing624/text2vec-base-multilingual)
|
||||||
@ -133,6 +134,7 @@ docker run -d --gpus all -p 80:8501 registry.cn-beijing.aliyuncs.com/chatchat/ch
|
|||||||
- [GanymedeNil/text2vec-large-chinese](https://huggingface.co/GanymedeNil/text2vec-large-chinese)
|
- [GanymedeNil/text2vec-large-chinese](https://huggingface.co/GanymedeNil/text2vec-large-chinese)
|
||||||
- [nghuyong/ernie-3.0-nano-zh](https://huggingface.co/nghuyong/ernie-3.0-nano-zh)
|
- [nghuyong/ernie-3.0-nano-zh](https://huggingface.co/nghuyong/ernie-3.0-nano-zh)
|
||||||
- [nghuyong/ernie-3.0-base-zh](https://huggingface.co/nghuyong/ernie-3.0-base-zh)
|
- [nghuyong/ernie-3.0-base-zh](https://huggingface.co/nghuyong/ernie-3.0-base-zh)
|
||||||
|
- [OpenAI/text-embedding-ada-002](https://platform.openai.com/docs/guides/embeddings)
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@ -206,6 +208,7 @@ embedding_model_dict = {
|
|||||||
"m3e-base": "/Users/xxx/Downloads/m3e-base",
|
"m3e-base": "/Users/xxx/Downloads/m3e-base",
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
如果你选择使用OpenAI的Embedding模型,请将模型的```key```写入`embedding_model_dict`中。使用该模型,你需要能够访问OpenAI官方的API,或设置代理。
|
||||||
|
|
||||||
### 4. 知识库初始化与迁移
|
### 4. 知识库初始化与迁移
|
||||||
|
|
||||||
|
|||||||
@ -24,7 +24,9 @@ embedding_model_dict = {
|
|||||||
"m3e-large": "moka-ai/m3e-large",
|
"m3e-large": "moka-ai/m3e-large",
|
||||||
"bge-small-zh": "BAAI/bge-small-zh",
|
"bge-small-zh": "BAAI/bge-small-zh",
|
||||||
"bge-base-zh": "BAAI/bge-base-zh",
|
"bge-base-zh": "BAAI/bge-base-zh",
|
||||||
"bge-large-zh": "BAAI/bge-large-zh"
|
"bge-large-zh": "BAAI/bge-large-zh",
|
||||||
|
"bge-large-zh-noinstruct": "BAAI/bge-large-zh-noinstruct",
|
||||||
|
"text-embedding-ada-002": os.environ.get("OPENAI_API_KEY")
|
||||||
}
|
}
|
||||||
|
|
||||||
# 选用的 Embedding 名称
|
# 选用的 Embedding 名称
|
||||||
|
|||||||
@ -13,7 +13,8 @@ from functools import lru_cache
|
|||||||
from server.knowledge_base.utils import get_vs_path, load_embeddings, KnowledgeFile
|
from server.knowledge_base.utils import get_vs_path, load_embeddings, KnowledgeFile
|
||||||
from langchain.vectorstores import FAISS
|
from langchain.vectorstores import FAISS
|
||||||
from langchain.embeddings.base import Embeddings
|
from langchain.embeddings.base import Embeddings
|
||||||
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
|
from langchain.embeddings.huggingface import HuggingFaceEmbeddings,HuggingFaceBgeEmbeddings
|
||||||
|
from langchain.embeddings.openai import OpenAIEmbeddings
|
||||||
from typing import List
|
from typing import List
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
from server.utils import torch_gc
|
from server.utils import torch_gc
|
||||||
@ -21,10 +22,19 @@ from server.utils import torch_gc
|
|||||||
|
|
||||||
# make HuggingFaceEmbeddings hashable
|
# make HuggingFaceEmbeddings hashable
|
||||||
def _embeddings_hash(self):
|
def _embeddings_hash(self):
|
||||||
|
if isinstance(self, HuggingFaceEmbeddings):
|
||||||
return hash(self.model_name)
|
return hash(self.model_name)
|
||||||
|
elif isinstance(self, HuggingFaceBgeEmbeddings):
|
||||||
|
return hash(self.model_name)
|
||||||
|
elif isinstance(self, OpenAIEmbeddings):
|
||||||
|
return hash(self.model)
|
||||||
|
|
||||||
HuggingFaceEmbeddings.__hash__ = _embeddings_hash
|
HuggingFaceEmbeddings.__hash__ = _embeddings_hash
|
||||||
|
OpenAIEmbeddings.__hash__ = _embeddings_hash
|
||||||
|
HuggingFaceBgeEmbeddings.__hash__ = _embeddings_hash
|
||||||
|
|
||||||
|
_VECTOR_STORE_TICKS = {}
|
||||||
|
|
||||||
|
|
||||||
_VECTOR_STORE_TICKS = {}
|
_VECTOR_STORE_TICKS = {}
|
||||||
|
|
||||||
|
|||||||
@ -1,5 +1,7 @@
|
|||||||
import os
|
import os
|
||||||
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
|
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
|
||||||
|
from langchain.embeddings.openai import OpenAIEmbeddings
|
||||||
|
from langchain.embeddings import HuggingFaceBgeEmbeddings
|
||||||
from configs.model_config import (
|
from configs.model_config import (
|
||||||
embedding_model_dict,
|
embedding_model_dict,
|
||||||
KB_ROOT_PATH,
|
KB_ROOT_PATH,
|
||||||
@ -41,11 +43,20 @@ def list_docs_from_folder(kb_name: str):
|
|||||||
|
|
||||||
@lru_cache(1)
|
@lru_cache(1)
|
||||||
def load_embeddings(model: str, device: str):
|
def load_embeddings(model: str, device: str):
|
||||||
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_dict[model],
|
if model == "text-embedding-ada-002": # openai text-embedding-ada-002
|
||||||
model_kwargs={'device': device})
|
embeddings = OpenAIEmbeddings(openai_api_key=embedding_model_dict[model], chunk_size=CHUNK_SIZE)
|
||||||
|
elif 'bge-' in model:
|
||||||
|
embeddings = HuggingFaceBgeEmbeddings(model_name=embedding_model_dict[model],
|
||||||
|
model_kwargs={'device': device},
|
||||||
|
query_instruction="为这个句子生成表示以用于检索相关文章:")
|
||||||
|
if model == "bge-large-zh-noinstruct": # bge large -noinstruct embedding
|
||||||
|
embeddings.query_instruction = ""
|
||||||
|
else:
|
||||||
|
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_dict[model], model_kwargs={'device': device})
|
||||||
return embeddings
|
return embeddings
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
LOADER_DICT = {"UnstructuredFileLoader": ['.eml', '.html', '.json', '.md', '.msg', '.rst',
|
LOADER_DICT = {"UnstructuredFileLoader": ['.eml', '.html', '.json', '.md', '.msg', '.rst',
|
||||||
'.rtf', '.txt', '.xml',
|
'.rtf', '.txt', '.xml',
|
||||||
'.doc', '.docx', '.epub', '.odt', '.pdf',
|
'.doc', '.docx', '.epub', '.odt', '.pdf',
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user