mirror of
https://github.com/aimingmed/aimingmed-ai.git
synced 2026-01-19 13:23:23 +08:00
Done with text replacement
This commit is contained in:
parent
465c24546d
commit
1df99f3767
2
.gitignore
vendored
2
.gitignore
vendored
@@ -202,6 +202,8 @@ data/*
|
|||||||
**/.config.py
|
**/.config.py
|
||||||
**/chroma_db/*
|
**/chroma_db/*
|
||||||
**/*.pdf
|
**/*.pdf
|
||||||
|
**/documents/**/*.json
|
||||||
|
**/documents/**/*.xlsx
|
||||||
**/.env
|
**/.env
|
||||||
**/llm-template2/*
|
**/llm-template2/*
|
||||||
**/llmops/outputs/*
|
**/llmops/outputs/*
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ rag:
|
|||||||
testing:
|
testing:
|
||||||
query: "如何治疗乳腺癌?"
|
query: "如何治疗乳腺癌?"
|
||||||
evaluation:
|
evaluation:
|
||||||
evaluation_dataset_csv_path: "../../../../data/qa_dataset_20240321a.csv"
|
evaluation_dataset_csv_path: "../../../../data/qa_dataset_20250401b.csv"
|
||||||
evaluation_dataset_column_question: question
|
evaluation_dataset_column_question: question
|
||||||
evaluation_dataset_column_answer: answer
|
evaluation_dataset_column_answer: answer
|
||||||
ls_chat_model_provider:
|
ls_chat_model_provider:
|
||||||
|
|||||||
@@ -3,6 +3,8 @@
|
|||||||
Download from W&B the raw dataset and apply some basic data cleaning, exporting the result to a new artifact
|
Download from W&B the raw dataset and apply some basic data cleaning, exporting the result to a new artifact
|
||||||
"""
|
"""
|
||||||
import argparse
|
import argparse
|
||||||
|
import glob
|
||||||
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import mlflow
|
import mlflow
|
||||||
@@ -108,13 +110,32 @@ def go(args):
|
|||||||
chunk_size=15000, chunk_overlap=7500
|
chunk_size=15000, chunk_overlap=7500
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# read the dictionary json for word replacement in the read text
|
||||||
|
with open(f'./{documents_folder}/2023CACA/CACA英文缩写.json', 'r', encoding='utf-8') as f:
|
||||||
|
df_dict_json = json.load(f)
|
||||||
|
|
||||||
ls_docs = []
|
ls_docs = []
|
||||||
for root, _dir, files in os.walk(f"./{documents_folder}"):
|
pdf_files = glob.glob(f"./{documents_folder}/**/*.pdf", recursive=True)
|
||||||
for file in files:
|
|
||||||
if file.endswith(".pdf"):
|
for pdf_file in pdf_files:
|
||||||
read_text = extract_chinese_text_from_pdf(os.path.join(root, file))
|
read_text = extract_chinese_text_from_pdf(pdf_file)
|
||||||
document = Document(metadata={"file": f"{documents_folder}/{file}"}, page_content=read_text)
|
relative_path = os.path.relpath(pdf_file, start=f"./{documents_folder}")
|
||||||
ls_docs.append(document)
|
|
||||||
|
# if the parent directory of the pdf file is 2023CACA, then replace the shortform text with the dictionary value
|
||||||
|
if '2023CACA' in relative_path:
|
||||||
|
# get the pdf filename without the extension
|
||||||
|
pdf_filename = os.path.splitext(os.path.basename(pdf_file))[0]
|
||||||
|
# replace the text with the dictionary
|
||||||
|
dict_file = df_dict_json.get(pdf_filename)
|
||||||
|
if dict_file:
|
||||||
|
for key, value in dict_file.items():
|
||||||
|
read_text = read_text.replace(key, value)
|
||||||
|
|
||||||
|
|
||||||
|
document = Document(metadata={"file": relative_path}, page_content=read_text)
|
||||||
|
ls_docs.append(document)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
doc_splits = text_splitter.split_documents(ls_docs)
|
doc_splits = text_splitter.split_documents(ls_docs)
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user