From 1df99f37676c0b324bddd5122bfa335f7c7819d8 Mon Sep 17 00:00:00 2001
From: leehk
Date: Tue, 1 Apr 2025 15:07:51 +0800
Subject: [PATCH] Done with text replacement

---
 .gitignore                             |  2 ++
 app/llmops/config.yaml                 |  2 +-
 app/llmops/src/etl_chromadb_pdf/run.py | 33 +++++++++++++++++++++-----
 3 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/.gitignore b/.gitignore
index 8d92113..821af61 100644
--- a/.gitignore
+++ b/.gitignore
@@ -202,6 +202,8 @@ data/*
 **/.config.py
 **/chroma_db/*
 **/*.pdf
+**/documents/**/*.json
+**/documents/**/*.xlsx
 **/.env
 **/llm-template2/*
 **/llmops/outputs/*
diff --git a/app/llmops/config.yaml b/app/llmops/config.yaml
index 834fce8..eaaf910 100644
--- a/app/llmops/config.yaml
+++ b/app/llmops/config.yaml
@@ -15,7 +15,7 @@ rag:
   testing:
     query: "如何治疗乳腺癌?"
   evaluation:
-    evaluation_dataset_csv_path: "../../../../data/qa_dataset_20240321a.csv"
+    evaluation_dataset_csv_path: "../../../../data/qa_dataset_20250401b.csv"
     evaluation_dataset_column_question: question
     evaluation_dataset_column_answer: answer
   ls_chat_model_provider:
diff --git a/app/llmops/src/etl_chromadb_pdf/run.py b/app/llmops/src/etl_chromadb_pdf/run.py
index 9b2a82b..23bfbc2 100644
--- a/app/llmops/src/etl_chromadb_pdf/run.py
+++ b/app/llmops/src/etl_chromadb_pdf/run.py
@@ -3,6 +3,8 @@ Download from W&B the raw dataset and apply some basic data cleaning, exporting
 the result to a new artifact
 """
 import argparse
+import glob
+import json
 import logging
 import os
 import mlflow
@@ -108,13 +110,32 @@ def go(args):
         chunk_size=15000,
         chunk_overlap=7500
     )
 
+    # read the dictionary json for word replacement in the read text
+    with open(f'./{documents_folder}/2023CACA/CACA英文缩写.json', 'r', encoding='utf-8') as f:
+        df_dict_json = json.load(f)
+
     ls_docs = []
-    for root, _dir, files in os.walk(f"./{documents_folder}"):
-        for file in files:
-            if file.endswith(".pdf"):
-                read_text = extract_chinese_text_from_pdf(os.path.join(root, file))
-                document = Document(metadata={"file": f"{documents_folder}/{file}"}, page_content=read_text)
-                ls_docs.append(document)
+    pdf_files = glob.glob(f"./{documents_folder}/**/*.pdf", recursive=True)
+
+    for pdf_file in pdf_files:
+        read_text = extract_chinese_text_from_pdf(pdf_file)
+        relative_path = os.path.relpath(pdf_file, start=f"./{documents_folder}")
+
+        # if the parent directory of the pdf file is 2023CACA, then replace the shortform text with the dictionary value
+        if '2023CACA' in relative_path:
+            # get the pdf filename without the extension
+            pdf_filename = os.path.splitext(os.path.basename(pdf_file))[0]
+            # replace the text with the dictionary
+            dict_file = df_dict_json.get(pdf_filename)
+            if dict_file:
+                for key, value in dict_file.items():
+                    read_text = read_text.replace(key, value)
+
+
+        document = Document(metadata={"file": relative_path}, page_content=read_text)
+        ls_docs.append(document)
+
+
 
     doc_splits = text_splitter.split_documents(ls_docs)
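
A minimal sketch of how the abbreviation replacement added above is expected to behave, assuming CACA英文缩写.json maps each PDF filename (without extension) to a dict of {abbreviation: full form} pairs, as implied by the df_dict_json.get(pdf_filename) lookup in the patch. The filename and dictionary entries below are hypothetical examples, not taken from the real JSON:

    import json

    # Hypothetical sample of the per-file dictionary structure:
    # one entry per PDF filename, mapping each abbreviation to its expanded form.
    sample_dict = json.loads("""
    {
        "乳腺癌诊疗指南": {
            "OS": "总生存期（OS）",
            "PFS": "无进展生存期（PFS）"
        }
    }
    """)

    def expand_abbreviations(text, pdf_filename, dictionary):
        """Replace every known abbreviation in text using the per-file mapping."""
        mapping = dictionary.get(pdf_filename)
        if not mapping:
            # no dictionary entry for this PDF, return the text unchanged
            return text
        for short, full in mapping.items():
            text = text.replace(short, full)
        return text

    print(expand_abbreviations("中位PFS为12个月", "乳腺癌诊疗指南", sample_dict))
    # -> 中位无进展生存期（PFS）为12个月

One caveat worth keeping in mind: str.replace is plain substring substitution, so when one abbreviation is contained in another (for example "ER" inside "HER2"), the iteration order of the keys in the JSON determines the result.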