mirror of
https://github.com/aimingmed/aimingmed-ai.git
synced 2026-02-09 08:53:46 +08:00
revision for 3 questions testing
This commit is contained in:
parent
afbb34079a
commit
6471626497
1
.gitignore
vendored
1
.gitignore
vendored
@ -209,3 +209,4 @@ data/*
|
|||||||
**/llm-examples/*
|
**/llm-examples/*
|
||||||
**/*.ipynb_checkpoints
|
**/*.ipynb_checkpoints
|
||||||
**/*.ipynb
|
**/*.ipynb
|
||||||
|
**/transformer_model/*
|
||||||
@ -15,10 +15,10 @@ rag:
|
|||||||
testing:
|
testing:
|
||||||
query: "如何治疗乳腺癌?"
|
query: "如何治疗乳腺癌?"
|
||||||
evaluation:
|
evaluation:
|
||||||
evaluation_dataset_csv_path: "../../../../data/qa_dataset_01.csv"
|
evaluation_dataset_csv_path: "../../../../data/qa_dataset_20240321a.csv"
|
||||||
evaluation_dataset_column_question: question
|
evaluation_dataset_column_question: question
|
||||||
evaluation_dataset_column_answer: answer
|
evaluation_dataset_column_answer: answer
|
||||||
ls_chat_model_provider:
|
ls_chat_model_provider:
|
||||||
- gemini
|
- gemini
|
||||||
- deepseek
|
- deepseek
|
||||||
- moonshot
|
- moonshot
|
||||||
|
|||||||
@ -105,7 +105,7 @@ def go(args):
|
|||||||
documents_folder = os.path.splitext(os.path.basename(artifact_local_path))[0]
|
documents_folder = os.path.splitext(os.path.basename(artifact_local_path))[0]
|
||||||
|
|
||||||
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
|
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
|
||||||
chunk_size=1000, chunk_overlap=500
|
chunk_size=15000, chunk_overlap=7500
|
||||||
)
|
)
|
||||||
|
|
||||||
ls_docs = []
|
ls_docs = []
|
||||||
@ -113,7 +113,7 @@ def go(args):
|
|||||||
for file in files:
|
for file in files:
|
||||||
if file.endswith(".pdf"):
|
if file.endswith(".pdf"):
|
||||||
read_text = extract_chinese_text_from_pdf(os.path.join(root, file))
|
read_text = extract_chinese_text_from_pdf(os.path.join(root, file))
|
||||||
document = Document(metadata={"file": file}, page_content=read_text)
|
document = Document(metadata={"file": f"{documents_folder}/{file}"}, page_content=read_text)
|
||||||
ls_docs.append(document)
|
ls_docs.append(document)
|
||||||
|
|
||||||
doc_splits = text_splitter.split_documents(ls_docs)
|
doc_splits = text_splitter.split_documents(ls_docs)
|
||||||
@ -138,7 +138,7 @@ def go(args):
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(description="A very basic data cleaning")
|
parser = argparse.ArgumentParser(description="ETL for ChromaDB with readable PDF")
|
||||||
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--input_artifact",
|
"--input_artifact",
|
||||||
|
|||||||
@ -5,3 +5,5 @@ EMBEDDING_MODEL = config("EMBEDDING_MODEL", cast=str, default="paraphrase-multil
|
|||||||
|
|
||||||
# Initialize embedding model
|
# Initialize embedding model
|
||||||
model = SentenceTransformer(EMBEDDING_MODEL)
|
model = SentenceTransformer(EMBEDDING_MODEL)
|
||||||
|
|
||||||
|
model.save("./transformer_model/paraphrase-multilingual-mpnet-base-v2")
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user