2024-03-06 13:44:43 +08:00

100 lines
3.7 KiB
Python

from pydantic.v1 import BaseModel, Field
from langchain.utilities.bing_search import BingSearchAPIWrapper
from langchain.utilities.duckduckgo_search import DuckDuckGoSearchAPIWrapper
from configs import TOOL_CONFIG
from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import List, Dict
from langchain.docstore.document import Document
from strsimpy.normalized_levenshtein import NormalizedLevenshtein
from markdownify import markdownify
def bing_search(text, config):
    """Run a Bing web search and return the wrapper's result entries.

    Args:
        text: The search query.
        config: Engine settings; the keys ``bing_key``, ``bing_search_url``
            and ``result_len`` are read.

    Returns:
        The value of ``BingSearchAPIWrapper.results`` for ``text``, asked
        for ``config["result_len"]`` entries.
    """
    wrapper = BingSearchAPIWrapper(
        bing_subscription_key=config["bing_key"],
        bing_search_url=config["bing_search_url"],
    )
    max_results = config["result_len"]
    return wrapper.results(text, max_results)
def duckduckgo_search(text, config):
    """Run a DuckDuckGo search and return the wrapper's result entries.

    Args:
        text: The search query.
        config: Engine settings; only ``result_len`` is read.

    Returns:
        The value of ``DuckDuckGoSearchAPIWrapper.results`` for ``text``,
        asked for ``config["result_len"]`` entries.
    """
    # The wrapper is built with its defaults; no credentials are passed here.
    return DuckDuckGoSearchAPIWrapper().results(text, config["result_len"])
def metaphor_search(
        text: str,
        config: dict,
) -> List[Dict]:
    """Search with Metaphor and return ``snippet``/``link``/``title`` dicts.

    Page extracts are converted to Markdown. When ``config["split_result"]``
    is truthy the extracts are additionally split into chunks, and if there
    are more chunks than ``config["result_len"]`` only the chunks with the
    highest normalized Levenshtein similarity to ``text`` are kept.

    Args:
        text: The search query.
        config: Engine settings; reads ``metaphor_api_key``, ``result_len``,
            ``split_result`` and, when splitting, ``chunk_size`` and
            ``chunk_overlap``.

    Returns:
        A list of dicts with the keys ``snippet``, ``link`` and ``title``.
    """
    # Imported lazily so the module loads without the Metaphor client installed.
    from metaphor_python import Metaphor

    client = Metaphor(config["metaphor_api_key"])
    response = client.search(text, num_results=config["result_len"], use_autoprompt=True)
    pages = response.get_contents().contents
    for page in pages:
        # Rewrite each extract in place as Markdown.
        page.extract = markdownify(page.extract)

    if not config["split_result"]:
        # No splitting requested: one entry per returned page.
        return [
            {"snippet": page.extract, "link": page.url, "title": page.title}
            for page in pages
        ]

    whole_docs = [
        Document(
            page_content=page.extract,
            metadata={"link": page.url, "title": page.title},
        )
        for page in pages
    ]
    splitter = RecursiveCharacterTextSplitter(
        ["\n\n", "\n", ".", " "],
        chunk_size=config["chunk_size"],
        chunk_overlap=config["chunk_overlap"],
    )
    chunks = splitter.split_documents(whole_docs)

    limit = config["result_len"]
    if len(chunks) > limit:
        # Too many chunks: score each against the query and keep the best ones.
        scorer = NormalizedLevenshtein()
        for chunk in chunks:
            chunk.metadata["score"] = scorer.similarity(text, chunk.page_content)
        ranked = sorted(chunks, key=lambda chunk: chunk.metadata["score"], reverse=True)
        chunks = ranked[:limit]

    return [
        {
            "snippet": chunk.page_content,
            "link": chunk.metadata["link"],
            "title": chunk.metadata["title"],
        }
        for chunk in chunks
    ]
# Dispatch table: engine name -> search function taking (text, config).
SEARCH_ENGINES = {
    "bing": bing_search,
    "duckduckgo": duckduckgo_search,
    "metaphor": metaphor_search,
}
def search_result2docs(search_results):
    """Turn search result mappings into ``Document`` objects.

    Args:
        search_results: Iterable of mappings that may carry the keys
            ``snippet``, ``link`` and ``title``.

    Returns:
        A list with one ``Document`` per result: ``snippet`` becomes the
        page content, ``link`` the ``source`` metadata and ``title`` the
        ``filename`` metadata. A missing key yields an empty string.
    """
    return [
        Document(
            page_content=hit.get("snippet", ""),
            metadata={
                "source": hit.get("link", ""),
                "filename": hit.get("title", ""),
            },
        )
        for hit in search_results
    ]
def search_engine(query: str,
                  config: dict):
    """Run the configured search engine and render the hits as one string.

    Args:
        query: The search query.
        config: Tool settings; ``search_engine_name`` selects the entry in
            ``SEARCH_ENGINES`` and ``search_engine_config`` holds the
            per-engine settings keyed by that same name.

    Returns:
        A single string with one numbered section per hit. Each section
        starts with a "出处 [n]" marker and a Markdown link to the hit's
        source, followed by its content. Empty when there are no hits.
    """
    engine_name = config["search_engine_name"]
    run_search = SEARCH_ENGINES[engine_name]
    engine_settings = config["search_engine_config"][engine_name]

    hits = run_search(text=query, config=engine_settings)
    docs = search_result2docs(hits)

    sections = [
        f"""出处 [{inum + 1}] [{doc.metadata["source"]}]({doc.metadata["source"]}) \n\n{doc.page_content}\n\n"""
        for inum, doc in enumerate(docs)
    ]
    # Every section is terminated by one extra newline.
    return "".join(section + "\n" for section in sections)
def search_internet(query: str):
    """Search the internet for ``query`` using the ``search_internet`` tool settings.

    Args:
        query: The search query.

    Returns:
        The formatted context string produced by ``search_engine``.
    """
    return search_engine(query=query, config=TOOL_CONFIG["search_internet"])
# Input schema for the internet search: a single free-text ``query`` field.
# Documented with comments rather than a class docstring because pydantic
# copies a model's docstring into its generated schema description.
class SearchInternetInput(BaseModel):
    query: str = Field(
        description="query for Internet search",
    )