Done with text replacement

2026-01-19 13:23:23 +08:00 · 2025-04-01 15:07:51 +08:00 · 2025-04-01 15:07:51 +08:00 · 1df99f3767
commit 1df99f3767
parent 465c24546d
3 changed files with 30 additions and 7 deletions
--- a/.gitignore
+++ b/.gitignore
@ -202,6 +202,8 @@ data/*
 **/.config.py
 **/chroma_db/*
 **/*.pdf
+**/documents/**/*.json
+**/documents/**/*.xlsx
 **/.env
 **/llm-template2/*
 **/llmops/outputs/*
--- a/app/llmops/config.yaml
+++ b/app/llmops/config.yaml
@ -15,7 +15,7 @@ rag:
 testing:
  query: "如何治疗乳腺癌?"
 evaluation:
-  evaluation_dataset_csv_path: "../../../../data/qa_dataset_20240321a.csv"
+  evaluation_dataset_csv_path: "../../../../data/qa_dataset_20250401b.csv"
  evaluation_dataset_column_question: question
  evaluation_dataset_column_answer: answer
  ls_chat_model_provider:
--- a/app/llmops/src/etl_chromadb_pdf/run.py
+++ b/app/llmops/src/etl_chromadb_pdf/run.py
@ -3,6 +3,8 @@
 Download from W&B the raw dataset and apply some basic data cleaning, exporting the result to a new artifact
 """
 import argparse
+import glob
+import json
 import logging
 import os
 import mlflow
@ -108,14 +110,33 @@ def go(args):
            chunk_size=15000, chunk_overlap=7500
        )

+        # load the JSON dictionary used to replace abbreviations in the text extracted from each pdf
+        with open(f'./{documents_folder}/2023CACA/CACA英文缩写.json', 'r', encoding='utf-8') as f:
+            df_dict_json = json.load(f)
+
        ls_docs = []
-        for root, _dir, files in os.walk(f"./{documents_folder}"):
-            for file in files:
-                if file.endswith(".pdf"):
-                    read_text = extract_chinese_text_from_pdf(os.path.join(root, file))
-                    document = Document(metadata={"file": f"{documents_folder}/{file}"}, page_content=read_text)
+        pdf_files = glob.glob(f"./{documents_folder}/**/*.pdf", recursive=True)
+        
+        for pdf_file in pdf_files:
+            read_text = extract_chinese_text_from_pdf(pdf_file)
+            relative_path = os.path.relpath(pdf_file, start=f"./{documents_folder}")
+
+            # if '2023CACA' appears anywhere in the pdf's relative path (substring match), replace the short-form text using this pdf's dictionary entry
+            if '2023CACA' in relative_path:
+                # get the pdf filename without the extension
+                pdf_filename = os.path.splitext(os.path.basename(pdf_file))[0]
+                # replace the text with the dictionary
+                dict_file = df_dict_json.get(pdf_filename)
+                if dict_file:
+                    for key, value in dict_file.items():
+                        read_text = read_text.replace(key, value)
+            
+
+            document = Document(metadata={"file": relative_path}, page_content=read_text)
            ls_docs.append(document)

+
+                                        
        doc_splits = text_splitter.split_documents(ls_docs)

        # Add to vectorDB