Revision for testing with 3 questions

This commit is contained in:
leehk 2025-03-24 13:35:40 +08:00
parent afbb34079a
commit 6471626497
4 changed files with 12 additions and 9 deletions

3
.gitignore vendored
View File

@@ -208,4 +208,5 @@ data/*
**/*.zip **/*.zip
**/llm-examples/* **/llm-examples/*
**/*.ipynb_checkpoints **/*.ipynb_checkpoints
**/*.ipynb **/*.ipynb
**/transformer_model/*

View File

@@ -15,10 +15,10 @@ rag:
testing: testing:
query: "如何治疗乳腺癌?" query: "如何治疗乳腺癌?"
evaluation: evaluation:
evaluation_dataset_csv_path: "../../../../data/qa_dataset_01.csv" evaluation_dataset_csv_path: "../../../../data/qa_dataset_20240321a.csv"
evaluation_dataset_column_question: question evaluation_dataset_column_question: question
evaluation_dataset_column_answer: answer evaluation_dataset_column_answer: answer
ls_chat_model_provider: ls_chat_model_provider:
- gemini - gemini
- deepseek - deepseek
- moonshot - moonshot

View File

@@ -105,7 +105,7 @@ def go(args):
documents_folder = os.path.splitext(os.path.basename(artifact_local_path))[0] documents_folder = os.path.splitext(os.path.basename(artifact_local_path))[0]
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder( text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
chunk_size=1000, chunk_overlap=500 chunk_size=15000, chunk_overlap=7500
) )
ls_docs = [] ls_docs = []
@@ -113,7 +113,7 @@ def go(args):
for file in files: for file in files:
if file.endswith(".pdf"): if file.endswith(".pdf"):
read_text = extract_chinese_text_from_pdf(os.path.join(root, file)) read_text = extract_chinese_text_from_pdf(os.path.join(root, file))
document = Document(metadata={"file": file}, page_content=read_text) document = Document(metadata={"file": f"{documents_folder}/{file}"}, page_content=read_text)
ls_docs.append(document) ls_docs.append(document)
doc_splits = text_splitter.split_documents(ls_docs) doc_splits = text_splitter.split_documents(ls_docs)
@@ -138,7 +138,7 @@ def go(args):
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser(description="A very basic data cleaning") parser = argparse.ArgumentParser(description="ETL for ChromaDB with readable PDF")
parser.add_argument( parser.add_argument(
"--input_artifact", "--input_artifact",

View File

@@ -4,4 +4,6 @@ from sentence_transformers import SentenceTransformer
EMBEDDING_MODEL = config("EMBEDDING_MODEL", cast=str, default="paraphrase-multilingual-mpnet-base-v2") EMBEDDING_MODEL = config("EMBEDDING_MODEL", cast=str, default="paraphrase-multilingual-mpnet-base-v2")
# Initialize embedding model # Initialize embedding model
model = SentenceTransformer(EMBEDDING_MODEL) model = SentenceTransformer(EMBEDDING_MODEL)
model.save("./transformer_model/paraphrase-multilingual-mpnet-base-v2")