Done with text replacement

This commit is contained in:
leehk 2025-04-01 15:07:51 +08:00
parent 465c24546d
commit 1df99f3767
3 changed files with 30 additions and 7 deletions

2
.gitignore vendored
View File

@ -202,6 +202,8 @@ data/*
**/.config.py **/.config.py
**/chroma_db/* **/chroma_db/*
**/*.pdf **/*.pdf
**/documents/**/*.json
**/documents/**/*.xlsx
**/.env **/.env
**/llm-template2/* **/llm-template2/*
**/llmops/outputs/* **/llmops/outputs/*

View File

@ -15,7 +15,7 @@ rag:
testing: testing:
query: "如何治疗乳腺癌?" query: "如何治疗乳腺癌?"
evaluation: evaluation:
evaluation_dataset_csv_path: "../../../../data/qa_dataset_20240321a.csv" evaluation_dataset_csv_path: "../../../../data/qa_dataset_20250401b.csv"
evaluation_dataset_column_question: question evaluation_dataset_column_question: question
evaluation_dataset_column_answer: answer evaluation_dataset_column_answer: answer
ls_chat_model_provider: ls_chat_model_provider:

View File

@ -3,6 +3,8 @@
Download from W&B the raw dataset and apply some basic data cleaning, exporting the result to a new artifact Download from W&B the raw dataset and apply some basic data cleaning, exporting the result to a new artifact
""" """
import argparse import argparse
import glob
import json
import logging import logging
import os import os
import mlflow import mlflow
@ -108,13 +110,32 @@ def go(args):
chunk_size=15000, chunk_overlap=7500 chunk_size=15000, chunk_overlap=7500
) )
# load the JSON dictionary used to replace abbreviations in the extracted text
with open(f'./{documents_folder}/2023CACA/CACA英文缩写.json', 'r', encoding='utf-8') as f:
df_dict_json = json.load(f)
ls_docs = [] ls_docs = []
for root, _dir, files in os.walk(f"./{documents_folder}"): pdf_files = glob.glob(f"./{documents_folder}/**/*.pdf", recursive=True)
for file in files:
if file.endswith(".pdf"): for pdf_file in pdf_files:
read_text = extract_chinese_text_from_pdf(os.path.join(root, file)) read_text = extract_chinese_text_from_pdf(pdf_file)
document = Document(metadata={"file": f"{documents_folder}/{file}"}, page_content=read_text) relative_path = os.path.relpath(pdf_file, start=f"./{documents_folder}")
ls_docs.append(document)
# if the pdf file's relative path contains '2023CACA', replace each abbreviation in the text with its value from the dictionary
if '2023CACA' in relative_path:
# get the pdf filename without the extension
pdf_filename = os.path.splitext(os.path.basename(pdf_file))[0]
# replace the text with the dictionary
dict_file = df_dict_json.get(pdf_filename)
if dict_file:
for key, value in dict_file.items():
read_text = read_text.replace(key, value)
document = Document(metadata={"file": relative_path}, page_content=read_text)
ls_docs.append(document)
doc_splits = text_splitter.split_documents(ls_docs) doc_splits = text_splitter.split_documents(ls_docs)