From 1df99f37676c0b324bddd5122bfa335f7c7819d8 Mon Sep 17 00:00:00 2001
From: leehk
Date: Tue, 1 Apr 2025 15:07:51 +0800
Subject: [PATCH] Done with text replacement

---
 .gitignore                             |  2 ++
 app/llmops/config.yaml                 |  2 +-
 app/llmops/src/etl_chromadb_pdf/run.py | 33 +++++++++++++++++++++-----
 3 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/.gitignore b/.gitignore
index 8d92113..821af61 100644
--- a/.gitignore
+++ b/.gitignore
@@ -202,6 +202,8 @@ data/*
 **/.config.py
 **/chroma_db/*
 **/*.pdf
+**/documents/**/*.json
+**/documents/**/*.xlsx
 **/.env
 **/llm-template2/*
 **/llmops/outputs/*
diff --git a/app/llmops/config.yaml b/app/llmops/config.yaml
index 834fce8..eaaf910 100644
--- a/app/llmops/config.yaml
+++ b/app/llmops/config.yaml
@@ -15,7 +15,7 @@ rag:
   testing:
     query: "如何治疗乳腺癌?"
   evaluation:
-    evaluation_dataset_csv_path: "../../../../data/qa_dataset_20240321a.csv"
+    evaluation_dataset_csv_path: "../../../../data/qa_dataset_20250401b.csv"
     evaluation_dataset_column_question: question
     evaluation_dataset_column_answer: answer
   ls_chat_model_provider:
diff --git a/app/llmops/src/etl_chromadb_pdf/run.py b/app/llmops/src/etl_chromadb_pdf/run.py
index 9b2a82b..23bfbc2 100644
--- a/app/llmops/src/etl_chromadb_pdf/run.py
+++ b/app/llmops/src/etl_chromadb_pdf/run.py
@@ -3,6 +3,8 @@ Download from W&B the raw dataset and apply some basic data cleaning, exporting
 the result to a new artifact
 """
 import argparse
+import glob
+import json
 import logging
 import os
 import mlflow
@@ -108,13 +110,32 @@ def go(args):
         chunk_size=15000,
         chunk_overlap=7500
     )
 
+    # read the dictionary json for word replacement in the read text
+    with open(f'./{documents_folder}/2023CACA/CACA英文缩写.json', 'r', encoding='utf-8') as f:
+        df_dict_json = json.load(f)
+
     ls_docs = []
-    for root, _dir, files in os.walk(f"./{documents_folder}"):
-        for file in files:
-            if file.endswith(".pdf"):
-                read_text = extract_chinese_text_from_pdf(os.path.join(root, file))
-                document = Document(metadata={"file": f"{documents_folder}/{file}"}, page_content=read_text)
-                ls_docs.append(document)
+    pdf_files = glob.glob(f"./{documents_folder}/**/*.pdf", recursive=True)
+
+    for pdf_file in pdf_files:
+        read_text = extract_chinese_text_from_pdf(pdf_file)
+        relative_path = os.path.relpath(pdf_file, start=f"./{documents_folder}")
+
+        # if the parent directory of the pdf file is 2023CACA, then replace the shortform text with the dictionary value
+        if '2023CACA' in relative_path:
+            # get the pdf filename without the extension
+            pdf_filename = os.path.splitext(os.path.basename(pdf_file))[0]
+            # replace the text with the dictionary
+            dict_file = df_dict_json.get(pdf_filename)
+            if dict_file:
+                for key, value in dict_file.items():
+                    read_text = read_text.replace(key, value)
+
+
+        document = Document(metadata={"file": relative_path}, page_content=read_text)
+        ls_docs.append(document)
+
+
 
     doc_splits = text_splitter.split_documents(ls_docs)
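
A minimal sketch of how the abbreviation replacement added above is expected to behave, assuming CACA英文缩写.json maps each PDF filename (without extension) to a dict of {abbreviation: full form} pairs, as implied by the df_dict_json.get(pdf_filename) lookup in the patch. The filename and dictionary entries below are hypothetical examples, not taken from the real JSON:

    import json

    # Hypothetical sample of the per-file dictionary structure:
    # one entry per PDF filename, mapping each abbreviation to its expanded form.
    sample_dict = json.loads("""
    {
        "乳腺癌诊疗指南": {
            "OS": "总生存期（OS）",
            "PFS": "无进展生存期（PFS）"
        }
    }
    """)

    def expand_abbreviations(text, pdf_filename, dictionary):
        """Replace every known abbreviation in text using the per-file mapping."""
        mapping = dictionary.get(pdf_filename)
        if not mapping:
            # no dictionary entry for this PDF, return the text unchanged
            return text
        for short, full in mapping.items():
            text = text.replace(short, full)
        return text

    print(expand_abbreviations("中位PFS为12个月", "乳腺癌诊疗指南", sample_dict))
    # -> 中位无进展生存期（PFS）为12个月

One caveat worth keeping in mind: str.replace is plain substring substitution, so when one abbreviation is contained in another (for example "ER" inside "HER2"), the iteration order of the keys in the JSON determines the result.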