mirror of
https://github.com/aimingmed/aimingmed-ai.git
synced 2026-02-08 00:03:15 +08:00
update
This commit is contained in:
parent
a0ac1fd961
commit
09673ae3fe
@ -91,8 +91,7 @@ the configuration file. It can be accessed from the `go` function as
|
|||||||
NOTE: do NOT hardcode any parameter when writing the pipeline. All the parameters should be
|
NOTE: do NOT hardcode any parameter when writing the pipeline. All the parameters should be
|
||||||
accessed from the configuration file.
|
accessed from the configuration file.
|
||||||
|
|
||||||
NOTE: Make sure the dataset file DataScientist_CaseStudy_Dataset.xlsx is located at app/mlops/components/get_data/data before
|
NOTE: Make sure the .env file is located at llmops/src/chain_of_thought (it contains the API keys for the LLM chat models) before you start running the pipeline.
|
||||||
start running the pipeline.
|
|
||||||
|
|
||||||
### Running the entire pipeline or just a selection of steps
|
### Running the entire pipeline or just a selection of steps
|
||||||
|
|
||||||
@ -101,44 +100,36 @@ then you can execute as usual:
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
# not recommended for now -- still in development stage
|
# not recommended for now -- still in development stage
|
||||||
> cd app/molops
|
> cd app/llmops
|
||||||
> pipenv shell
|
> pipenv shell
|
||||||
> mlflow run .
|
> mlflow run .
|
||||||
```
|
```
|
||||||
|
|
||||||
This will run the entire pipeline. Please use the following commands to run the full working pipeline for the project.
|
This will run the entire pipeline. Please use the following commands to run the full working pipeline for the project.
|
||||||
You may configure all settings for both training, testing, and production testing at the app/mlops/config.yaml.
|
You may configure all settings for training, testing, and production testing in app/llmops/config.yaml.
|
||||||
Check all the `_steps` list you can run at app/mlops/main.py
|
See the `_steps` list in app/llmops/main.py for all the steps you can run.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
> cd app/mlops
|
> cd app/llmops
|
||||||
> pipenv shell
|
> pipenv shell
|
||||||
> mlflow run . -P steps=download,basic_cleaning
|
> mlflow run . -P steps=get_documents,etl_chromdb_pdf,chain_of_thought
|
||||||
# before starting the ETL data_check step go to the basic_cleaning run in the wandb and assign
|
> mlflow run . -P steps=chain_of_thought
|
||||||
# the output artifact, clean_sample.csv with new alias, i.e. "reference"
|
|
||||||
> mlflow run . -P steps=data_check
|
|
||||||
# You may want to consider stratifying the data by "Sex" for
|
|
||||||
# for the train and test split, and stratify by "Sale_MF" for the propensity model if you are training "Sale_MF" model
|
|
||||||
> mlflow run . -P steps=data_split
|
|
||||||
# You may run the model training steps with train_random_forest_propensity,train_random_forest_revenue,
|
|
||||||
# and train_lasso_revenue.
|
|
||||||
# You first need to promote the best model export to "prod" before you can run test_model
|
|
||||||
# and test_production steps
|
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
||||||
When developing or troubleshooting, it is useful to be able to run one step at a time. Say you want to run only
|
When developing or troubleshooting, it is useful to be able to run one step at a time. Say you want to run only
|
||||||
the `basic_cleaning` step. The `main.py` is written so that the steps are defined at the top of the file, in the
|
the `chain_of_thought` step. The `main.py` is written so that the steps are defined at the top of the file, in the
|
||||||
`_steps` list, and can be selected by using the `steps` parameter on the command line:
|
`_steps` list, and can be selected by using the `steps` parameter on the command line:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
> mlflow run . -P steps=basic_cleaning
|
> mlflow run . -P steps=chain_of_thought
|
||||||
```
|
```
|
||||||
|
|
||||||
If you want to run the `basic_cleaning` and the `data_check` steps, you can similarly do:
|
If you want to run the `etl_chromdb_pdf` and the `chain_of_thought` steps, you can similarly do:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
> mlflow run . -P steps=basic_cleaning,data_check
|
> mlflow run . -P steps=etl_chromdb_pdf,chain_of_thought
|
||||||
```
|
```
|
||||||
|
|
||||||
You can override any other parameter in the configuration file using the Hydra syntax, by
|
You can override any other parameter in the configuration file using the Hydra syntax, by
|
||||||
@ -147,8 +138,8 @@ modeling -> product_to_train to Sale_MF and modeling-> stratify_by to Sale_MF:
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
> mlflow run . \
|
> mlflow run . \
|
||||||
-P steps=train_random_forest_propensity \
|
-P steps=chain_of_thought \
|
||||||
-P hydra_options="modeling.product_to_train='Sale_MF' modeling.stratify_by='Sale_MF'"
|
-P hydra_options="prompt_engineering.chat_model_provider='moonshot' prompt_engineering.query='怎么治疗有kras的肺癌?'"
|
||||||
```
|
```
|
||||||
|
|
||||||
### Test the model performance on the test samples
|
### Test the model performance on the test samples
|
||||||
@ -167,16 +158,16 @@ First define the necessary parameters at the config.yaml at production.test_csv
|
|||||||
> cd app/mlops
|
> cd app/mlops
|
||||||
> pipenv shell
|
> pipenv shell
|
||||||
> mlflow run . \
|
> mlflow run . \
|
||||||
-P steps=test_production \
|
-P steps=chain_of_thought \
|
||||||
-P hydra_options="production.test_csv='clean_sample_test.csv'"
|
-P hydra_options="prompt_engineering.query='怎么治疗有kras的肺癌?'"
|
||||||
# OR you can run the following to test the production samples
|
# OR you can run the following to test the production samples
|
||||||
> mlflow run https://github.com/hkailee/financial-product-marketing-optimization.git \
|
> mlflow run https://github.com/aimingmed/aimingmed-ai \
|
||||||
-v v1.0.0 \
|
-v v1.0.0 \
|
||||||
-P steps=test_production \
|
-P steps=chain_of_thought \
|
||||||
-P hydra_options="production.test_csv='clean_sample_test.csv'"
|
-P hydra_options="prompt_engineering.query='怎么治疗有kras的肺癌?'"
|
||||||
```
|
```
|
||||||
|
|
||||||
## Wandb public workspace URL for this project
|
## Wandb public workspace URL for this project
|
||||||
|
|
||||||
Click the link below to see the wandb public workspace for this project. You can see the model training and testing results, as well as the production testing results.
|
Click the link below to see the wandb public workspace for this project. You can see the model training and testing results, as well as the production testing results.
|
||||||
https://wandb.ai/leehongkai/financial-product-marketing-optimization/table
|
https://wandb.ai/aimingmed/aimingmed-ai
|
||||||
|
|||||||
@ -9,6 +9,6 @@ etl:
|
|||||||
path_document_folder: "../../../../data"
|
path_document_folder: "../../../../data"
|
||||||
embedding_model: paraphrase-multilingual-mpnet-base-v2
|
embedding_model: paraphrase-multilingual-mpnet-base-v2
|
||||||
prompt_engineering:
|
prompt_engineering:
|
||||||
chat_model_provider: deepseek
|
chat_model_provider: kimi
|
||||||
query: "怎么治疗肺癌?"
|
query: "怎么治疗有kras的肺癌?"
|
||||||
|
|
||||||
@ -10,7 +10,7 @@ from decouple import config
|
|||||||
_steps = [
|
_steps = [
|
||||||
"get_documents",
|
"get_documents",
|
||||||
"etl_chromdb_pdf",
|
"etl_chromdb_pdf",
|
||||||
"etl_chromdb_scanned_pdf",
|
"etl_chromdb_scanned_pdf", # the performance for scanned pdf may not be good
|
||||||
"chain_of_thought"
|
"chain_of_thought"
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
@ -11,6 +11,7 @@ build_dependencies:
|
|||||||
- python-decouple
|
- python-decouple
|
||||||
- langchain_google_genai
|
- langchain_google_genai
|
||||||
- langchain-deepseek
|
- langchain-deepseek
|
||||||
|
- langchain-community
|
||||||
# Dependencies required to run the project.
|
# Dependencies required to run the project.
|
||||||
dependencies:
|
dependencies:
|
||||||
- mlflow==2.8.1
|
- mlflow==2.8.1
|
||||||
|
|||||||
@ -9,6 +9,7 @@ from langchain.prompts import PromptTemplate
|
|||||||
from sentence_transformers import SentenceTransformer
|
from sentence_transformers import SentenceTransformer
|
||||||
from langchain_google_genai import ChatGoogleGenerativeAI
|
from langchain_google_genai import ChatGoogleGenerativeAI
|
||||||
from langchain_deepseek import ChatDeepSeek
|
from langchain_deepseek import ChatDeepSeek
|
||||||
|
from langchain_community.llms.moonshot import Moonshot
|
||||||
|
|
||||||
logging.basicConfig(level=logging.INFO, format="%(asctime)-15s %(message)s")
|
logging.basicConfig(level=logging.INFO, format="%(asctime)-15s %(message)s")
|
||||||
logger = logging.getLogger()
|
logger = logging.getLogger()
|
||||||
@ -16,7 +17,7 @@ logger = logging.getLogger()
|
|||||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||||
GEMINI_API_KEY = config("GOOGLE_API_KEY", cast=str)
|
GEMINI_API_KEY = config("GOOGLE_API_KEY", cast=str)
|
||||||
DEEKSEEK_API_KEY = config("DEEKSEEK_API_KEY", cast=str)
|
DEEKSEEK_API_KEY = config("DEEKSEEK_API_KEY", cast=str)
|
||||||
|
MOONSHOT_API_KEY = config("MOONSHOT_API_KEY", cast=str)
|
||||||
|
|
||||||
def go(args):
|
def go(args):
|
||||||
run = wandb.init(job_type="chain_of_thought", entity='aimingmed')
|
run = wandb.init(job_type="chain_of_thought", entity='aimingmed')
|
||||||
@ -59,6 +60,17 @@ def go(args):
|
|||||||
max_retries=3
|
max_retries=3
|
||||||
)
|
)
|
||||||
|
|
||||||
|
elif args.chat_model_provider == "moonshot":
|
||||||
|
# Initialize Moonshot model
|
||||||
|
llm = Moonshot(
|
||||||
|
model="moonshot-v1-128k",
|
||||||
|
temperature=0,
|
||||||
|
max_tokens=None,
|
||||||
|
timeout=None,
|
||||||
|
max_retries=2,
|
||||||
|
api_key=MOONSHOT_API_KEY
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# Chain of Thought Prompt
|
# Chain of Thought Prompt
|
||||||
cot_template = """Let's think step by step.
|
cot_template = """Let's think step by step.
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user