diff --git a/.gitignore b/.gitignore index ffc3c5e..8d92113 100644 --- a/.gitignore +++ b/.gitignore @@ -208,4 +208,5 @@ data/* **/*.zip **/llm-examples/* **/*.ipynb_checkpoints -**/*.ipynb \ No newline at end of file +**/*.ipynb +**/transformer_model/* \ No newline at end of file diff --git a/app/llmops/config.yaml b/app/llmops/config.yaml index 4ea1d94..834fce8 100644 --- a/app/llmops/config.yaml +++ b/app/llmops/config.yaml @@ -15,10 +15,10 @@ rag: testing: query: "如何治疗乳腺癌?" evaluation: - evaluation_dataset_csv_path: "../../../../data/qa_dataset_01.csv" + evaluation_dataset_csv_path: "../../../../data/qa_dataset_20240321a.csv" evaluation_dataset_column_question: question evaluation_dataset_column_answer: answer ls_chat_model_provider: - - gemini - - deepseek - - moonshot \ No newline at end of file + - gemini + - deepseek + - moonshot diff --git a/app/llmops/src/etl_chromadb_pdf/run.py b/app/llmops/src/etl_chromadb_pdf/run.py index edaaa01..9b2a82b 100644 --- a/app/llmops/src/etl_chromadb_pdf/run.py +++ b/app/llmops/src/etl_chromadb_pdf/run.py @@ -105,7 +105,7 @@ def go(args): documents_folder = os.path.splitext(os.path.basename(artifact_local_path))[0] text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder( - chunk_size=1000, chunk_overlap=500 + chunk_size=15000, chunk_overlap=7500 ) ls_docs = [] @@ -113,7 +113,7 @@ def go(args): for file in files: if file.endswith(".pdf"): read_text = extract_chinese_text_from_pdf(os.path.join(root, file)) - document = Document(metadata={"file": file}, page_content=read_text) + document = Document(metadata={"file": f"{documents_folder}/{file}"}, page_content=read_text) ls_docs.append(document) doc_splits = text_splitter.split_documents(ls_docs) @@ -138,7 +138,7 @@ def go(args): if __name__ == "__main__": - parser = argparse.ArgumentParser(description="A very basic data cleaning") + parser = argparse.ArgumentParser(description="ETL for ChromaDB with readable PDF") parser.add_argument( "--input_artifact", diff --git a/app/streamlit/initialize_sentence_transformer.py b/app/streamlit/initialize_sentence_transformer.py index 3026701..937c70d 100644 --- a/app/streamlit/initialize_sentence_transformer.py +++ b/app/streamlit/initialize_sentence_transformer.py @@ -4,4 +4,6 @@ from sentence_transformers import SentenceTransformer EMBEDDING_MODEL = config("EMBEDDING_MODEL", cast=str, default="paraphrase-multilingual-mpnet-base-v2") # Initialize embedding model -model = SentenceTransformer(EMBEDDING_MODEL) \ No newline at end of file +model = SentenceTransformer(EMBEDDING_MODEL) + +model.save("./transformer_model/paraphrase-multilingual-mpnet-base-v2")