This commit is contained in:
leehk 2025-02-22 13:58:52 +08:00
parent 0921f44ebd
commit 702b1eb874
41 changed files with 6096 additions and 3226 deletions

7
.gitignore vendored
View File

@ -196,9 +196,12 @@ data/*
# ignore macos files
.DS_Store
# ignore llmops
app/llmops/*
# ignore wandb files
**/wandb/
**/.config.py
**/chroma_db/*
**/*.pdf
**/.env
**/llm-template2/*
**/llmops/outputs/*

11
app/docker-compose.yml Normal file
View File

@ -0,0 +1,11 @@
version: "3.9"
services:
chroma:
image: ghcr.io/chroma-core/chroma:latest
ports:
- "8000:8000"
volumes:
- chroma_data:/chroma
volumes:
chroma_data:

32
app/llmops/DockerFile Normal file
View File

@ -0,0 +1,32 @@
FROM ubuntu:latest
# Install dependencies
RUN apt-get update && \
apt-get install -y --no-install-recommends \
software-properties-common && \
add-apt-repository ppa:deadsnakes/ppa && \
apt-get update && \
apt-get install -y --no-install-recommends \
poppler-utils \
tesseract-ocr \
tesseract-ocr-chi-sim \
tesseract-ocr-chi-tra \
python3.9 \
python3.9-distutils \
python3-pip && \
rm -rf /var/lib/apt/lists/*
# Install Python libraries
COPY MLproject python_env.yml run.py requirements.txt /app/
WORKDIR /app
# Install dependencies
RUN pip install --no-cache-dir -r requirements.txt
# Set environment variables
ENV TESSDATA_PREFIX="/usr/share/tessdata"
# Run the script (optional, you can run it when you start the container)
# CMD ["python3.9", "chinese_pdf_embed.py"]

18
app/llmops/MLproject Normal file
View File

@ -0,0 +1,18 @@
name: llmops
python_env: python_env.yml
entry_points:
main:
parameters:
steps:
description: Comma-separated list of steps to execute (useful for debugging)
type: str
default: all
hydra_options:
description: Other configuration parameters to override
type: str
default: ''
command: "python main.py main.steps=\\'{steps}\\' $(echo {hydra_options})"

35
app/llmops/Pipfile Normal file
View File

@ -0,0 +1,35 @@
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"
[packages]
hydra-core = "==1.3.2"
pydantic = "==1.10.0"
pyyaml = "==6.0.1"
pip = "==24.0.0"
mlflow = "==2.13.0"
wandb = "*"
cookiecutter = "==2.6.0"
virtualenv = "*"
docker = "*"
ipywidgets = "*"
ipykernel = "*"
jupyter = "*"
[dev-packages]
pytest = "==8.0.0"
pytest-cov = "==4.1.0"
pytest-mock = "==3.10.0"
requests-mock = "==1.10.0"
black = "==23.3.0"
ruff = "==0.0.263"
mypy = "==1.2.0"
nbconvert = "==7.3.1"
jupyter = "*"
ipykernel = "*"
ultralytics = "==8.2.31"
comet-ml = "==3.43.0"
[requires]
python_version = "3.11"

5127
app/llmops/Pipfile.lock generated Normal file

File diff suppressed because it is too large.

182
app/llmops/README.md Normal file
View File

@ -0,0 +1,182 @@
# Build an ML Pipeline for OSCAR object identification
## Table of contents
- [Preliminary steps](#preliminary-steps)
- [Create environment](#create-environment)
- [Get API key for Weights and Biases](#get-api-key-for-weights-and-biases)
- [Cookie cutter](#cookie-cutter)
- [The configuration](#the-configuration)
- [Running the entire pipeline or just a selection of steps](#running-the-entire-pipeline-or-just-a-selection-of-steps)
- [Test the model performance on the test samples](#test-the-model-performance-on-the-test-samples)
- [Test the production test samples based on production models](#test-the-production-test-samples-based-on-production-models)
- [Wandb public workspace URL for this project](#wandb-public-workspace-url-for-this-project)
## Preliminary steps
### Create environment
Make sure pipenv is installed and that you are in the app/mlops directory, then create a new environment:
```bash
# you may remove Pipfile.lock
> pipenv install
> pipenv shell
```
### Get API key for Weights and Biases
Let's make sure we are logged in to Weights & Biases. Get your API key from W&B by going to
[https://wandb.ai/authorize](https://wandb.ai/authorize) and clicking on the + icon (copy to clipboard),
then paste your key into this command:
```bash
> wandb login [your API key]
```
You should see a message similar to:
```
wandb: Appending key for api.wandb.ai to your netrc file: /home/[your username]/.netrc
```
### Cookie cutter
In order to make your job a little easier, you are provided a cookiecutter template that you can use to create
stubs for new pipeline components. It is not required that you use this, but it might save you a bit of
boilerplate code. Just run the cookiecutter and enter the required information, and a new component
will be created, including the `python_env.yml` file, the `MLproject` file as well as the script. You can then modify these
as needed, instead of starting from scratch.
For example:
```bash
> cookiecutter cookie-mlflow-step -o src
step_name [step_name]: basic_cleaning
script_name [run.py]: run.py
job_type [my_step]: basic_cleaning
short_description [My step]: This step cleans the data
long_description [An example of a step using MLflow and Weights & Biases]: Performs basic cleaning on the data and saves the results in Weights & Biases
parameters [parameter1,parameter2]: parameter1,parameter2,parameter3
```
This will create a step called `basic_cleaning` under the directory `src` with the following structure:
```bash
> ls src/basic_cleaning/
python_env.yml MLproject run.py
```
You can now modify the script (`run.py`), the virtualenv environment (`python_env.yml`) and the project definition
(`MLproject`) as you please.
The script `run.py` will receive the input parameters `parameter1`, `parameter2`,
`parameter3` and it will be called like:
```bash
> mlflow run src/step_name -P parameter1=1 -P parameter2=2 -P parameter3="test"
```
### The configuration
As usual, the parameters controlling the pipeline are defined in the `config.yaml` file located in
the root of the starter kit. We use Hydra to manage this configuration file.
Open this file and get familiar with its content. Remember: this file is only read by the `main.py` script
(i.e., the pipeline) and its content is
available inside the `go` function in `main.py` as the `config` dictionary. For example,
the name of the project is contained in the `project_name` key under the `main` section of
the configuration file. It can be accessed from the `go` function as
`config["main"]["project_name"]`.
NOTE: do NOT hardcode any parameter when writing the pipeline. All the parameters should be
accessed from the configuration file.
NOTE: Make sure the dataset file DataScientist_CaseStudy_Dataset.xlsx is located at app/mlops/components/get_data/data before
you start running the pipeline.
### Running the entire pipeline or just a selection of steps
In order to run the pipeline when you are developing, you need to be in the root of the starter kit,
then you can execute as usual:
```bash
# not recommended for now -- still in development stage
> cd app/mlops
> pipenv shell
> mlflow run .
```
This will run the entire pipeline. Please use the following to run the full working pipeline for the project.
You can configure all settings for training, testing, and production testing in app/mlops/config.yaml.
Check the `_steps` list in app/mlops/main.py for all the steps you can run:
```bash
> cd app/mlops
> pipenv shell
> mlflow run . -P steps=download,basic_cleaning
# before starting the data_check step, go to the basic_cleaning run in wandb and assign
# the output artifact clean_sample.csv a new alias, i.e. "reference" (see the sketch after this code block)
> mlflow run . -P steps=data_check
# You may want to consider stratifying the data by "Sex"
# for the train and test split, and stratifying by "Sale_MF" for the propensity model if you are training the "Sale_MF" model
> mlflow run . -P steps=data_split
# You may run the model training steps with train_random_forest_propensity, train_random_forest_revenue,
# and train_lasso_revenue.
# You first need to promote the best model export to "prod" before you can run the test_model
# and test_production steps
```
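If you prefer to set the alias from code instead of the wandb web UI, here is one possible sketch using the wandb public API (the entity and project names are placeholders; adjust them to your workspace). The same approach works for promoting a model export to "prod":
```python
# Hypothetical sketch: add the "reference" alias to the latest clean_sample.csv artifact.
# Replace the entity/project placeholders with your own wandb workspace.
import wandb

api = wandb.Api()
artifact = api.artifact("your-entity/your-project/clean_sample.csv:latest")
artifact.aliases.append("reference")
artifact.save()
```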
When developing or troubleshooting, it is useful to be able to run one step at a time. Say you want to run only
the `basic_cleaning` step. The `main.py` is written so that the steps are defined at the top of the file, in the
`_steps` list, and can be selected by using the `steps` parameter on the command line:
```bash
> mlflow run . -P steps=basic_cleaning
```
If you want to run the `basic_cleaning` and the `data_check` steps, you can similarly do:
```bash
> mlflow run . -P steps=basic_cleaning,data_check
```
You can override any other parameter in the configuration file using the Hydra syntax, by
providing it as a `hydra_options` parameter. For example, say that we want to set
`modeling.product_to_train` to `Sale_MF` and `modeling.stratify_by` to `Sale_MF`:
```bash
> mlflow run . \
-P steps=train_random_forest_propensity \
-P hydra_options="modeling.product_to_train='Sale_MF' modeling.stratify_by='Sale_MF'"
```
### Test the model performance on the test samples
First, define the necessary parameters in config.yaml under the best_model_propensity and best_model_revenue sections. Remember to set the "prod" alias on the best models you have trained (in the wandb model output artifacts) before running the testing, so you can see the model performance on the test hold-out samples. The test performance plots and test_result.csv are available in the wandb run log and output artifacts.
```bash
> mlflow run . -P steps=test_model
```
### Test the production test samples based on production models
First, define the necessary parameters in config.yaml under the production.test_csv section. Remember to set the "prod" alias on the best_model_propensity and best_model_revenue models you have trained (in the wandb model output artifacts) before running the testing on production samples.
```bash
> cd app/mlops
> pipenv shell
> mlflow run . \
-P steps=test_production \
-P hydra_options="production.test_csv='clean_sample_test.csv'"
# OR you can run the following to test the production samples
> mlflow run https://github.com/hkailee/financial-product-marketing-optimization.git \
-v v1.0.0 \
-P steps=test_production \
-P hydra_options="production.test_csv='clean_sample_test.csv'"
```
## Wandb public workspace URL for this project
Click the link below to see the wandb public workspace for this project. You can see the model training and testing results, as well as the production testing results.
https://wandb.ai/leehongkai/financial-product-marketing-optimization/table

View File

@ -0,0 +1,29 @@
name: get_documents
python_env: python_env.yml
entry_points:
main:
parameters:
document_folder:
description: Documents to download
type: string
path_document_folder:
description: Path to the folder containing the documents
type: string
artifact_name:
description: Name for the output artifact
type: string
artifact_type:
description: Type of the output artifact. This will be used to categorize the artifact in the W&B
interface
type: string
artifact_description:
description: A brief description of the output artifact
type: string
command: "python run.py {document_folder} {path_document_folder} {artifact_name} {artifact_type} {artifact_description}"

View File

@ -0,0 +1,14 @@
# Python version required to run the project.
python: "3.11.11"
# Dependencies required to build packages. This field is optional.
build_dependencies:
- pip==23.3.1
- setuptools
- wheel==0.37.1
- requests==2.24.0
# Dependencies required to run the project.
dependencies:
- mlflow==2.8.1
- wandb==0.16.0
- git+https://github.com/udacity/nd0821-c2-build-model-workflow-starter.git#egg=wandb-utils&subdirectory=components

View File

@ -0,0 +1,54 @@
#!/usr/bin/env python
"""
This script archives a document folder into a zip file and uploads it to W&B as an artifact
"""
import argparse
import logging
import os
import wandb
from wandb_utils.log_artifact import log_artifact
import shutil
logging.basicConfig(level=logging.INFO, format="%(asctime)-15s %(message)s")
logger = logging.getLogger()
def go(args):
zip_path = os.path.join(args.path_document_folder, f"{args.document_folder}.zip")
shutil.make_archive(zip_path.replace('.zip', ''), 'zip', args.path_document_folder, args.document_folder)
run = wandb.init(job_type="get_documents", entity='aimingmed')
run.config.update(args)
logger.info(f"Uploading {args.artifact_name} to Weights & Biases")
log_artifact(
args.artifact_name,
args.artifact_type,
args.artifact_description,
zip_path,
run,
)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Download URL to a local destination")
parser.add_argument("document_folder", type=str, help="Name of the sample to download")
parser.add_argument("path_document_folder", type=str, help="Path to the document folder")
parser.add_argument("artifact_name", type=str, help="Name for the output artifact")
parser.add_argument("artifact_type", type=str, help="Output artifact type.")
parser.add_argument(
"artifact_description", type=str, help="A brief description of this artifact"
)
args = parser.parse_args()
go(args)

View File

@ -0,0 +1,28 @@
import wandb
import mlflow
def log_artifact(artifact_name, artifact_type, artifact_description, filename, wandb_run):
"""
Log the provided filename as an artifact in W&B, and add the artifact path to the MLFlow run
so it can be retrieved by subsequent steps in a pipeline
:param artifact_name: name for the artifact
:param artifact_type: type for the artifact (just a string like "raw_data", "clean_data" and so on)
:param artifact_description: a brief description of the artifact
:param filename: local filename for the artifact
:param wandb_run: current Weights & Biases run
:return: None
"""
# Log to W&B
artifact = wandb.Artifact(
artifact_name,
type=artifact_type,
description=artifact_description,
)
artifact.add_file(filename)
wandb_run.log_artifact(artifact)
# Calling .wait() blocks until the artifact has finished uploading to W&B
# and a version has been assigned, so downstream steps can reliably
# reference the new artifact version
artifact.wait()

View File

@ -0,0 +1,15 @@
import os
def sanitize_path(s):
"""
Sanitizes the input path by:
1. Expanding environment variables
2. Expanding the home directory ('~')
3. Calculating the absolute path
:param s: input path
:return: a sanitized version of the input path
"""
return os.path.abspath(os.path.expanduser(os.path.expandvars(s)))

9
app/llmops/config.yaml Normal file
View File

@ -0,0 +1,9 @@
main:
# Set this to null if you are running in prod
project_name: aimingmed-ai
experiment_name: development
steps: all
etl:
document_folder: "documents"
path_document_folder: "../../../../data"

View File

@ -0,0 +1,17 @@
# Cookiecutter template for MLFlow steps
Using this template you can quickly generate new steps to be used with MLFlow.
# Usage
Run the command:
```
> cookiecutter [path to this repo] -o [destination directory]
```
and follow the prompt. The tool will ask for the step name, the script name, the description and so on. It will
also ask for the parameters of the script. This should be a comma-separated list of parameter names *without spaces*.
After you are done, you will find a new directory with the provided step name containing a stub of a new MLflow step.
You will need to edit both the script and the MLproject files to fill in the type and the help for the parameters.
Of course, if your script needs packages, these should be added to the python_env.yml file.

View File

@ -0,0 +1,8 @@
{
"step_name": "step_name",
"script_name": "run.py",
"job_type": "my_step",
"short_description": "My step",
"long_description": "An example of a step using MLflow and Weights & Biases",
"parameters": "parameter1,parameter2"
}

View File

@ -0,0 +1,14 @@
name: {{cookiecutter.step_name}}
python_env: python_env.yml
entry_points:
main:
parameters:
{% for arg_name in cookiecutter.parameters.split(",") %}
{{arg_name}}:
description: ## ADD DESCRIPTION
type: string
{% endfor %}
command: >-
python {{cookiecutter.script_name}} {% for n in cookiecutter.parameters.split(",") %} --{{n}} {{"{"}}{{n}}{{"}"}} {% endfor %}

View File

@ -0,0 +1,11 @@
# Python version required to run the project.
python: "3.9.17"
# Dependencies required to build packages. This field is optional.
build_dependencies:
- pip==23.3.1
- setuptools
- wheel==0.37.1
# Dependencies required to run the project.
dependencies:
- mlflow==2.8.1
- wandb==0.16.0

View File

@ -0,0 +1,43 @@
#!/usr/bin/env python
"""
{{cookiecutter.long_description}}
"""
import argparse
import logging
import wandb
logging.basicConfig(level=logging.INFO, format="%(asctime)-15s %(message)s")
logger = logging.getLogger()
def go(args):
run = wandb.init(job_type="{{cookiecutter.job_type}}")
run.config.update(args)
# Download input artifact. This will also log that this script is using this
# particular version of the artifact
# artifact_local_path = run.use_artifact(args.input_artifact).file()
######################
# YOUR CODE HERE #
######################
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="{{cookiecutter.short_description}}")
{% for arg_name in cookiecutter.parameters.split(",") %}
parser.add_argument(
"--{{arg_name}}",
type=## INSERT TYPE HERE: str, float or int,
help=## INSERT DESCRIPTION HERE,
required=True
)
{% endfor %}
args = parser.parse_args()
go(args)

204
app/llmops/main.py Normal file
View File

@ -0,0 +1,204 @@
import json
import mlflow
import tempfile
import os
import wandb
import hydra
from omegaconf import DictConfig
from decouple import config
_steps = [
"get_documents",
"etl_chromdb_scanned_pdf",
"data_check",
"data_split",
"train_random_forest_propensity",
"train_random_forest_revenue",
"train_lasso_revenue",
# NOTE: We do not include this in the steps so it is not run by mistake.
# You first need to promote a model export to "prod" before you can run this,
# then you need to run this step explicitly
"test_model",
"test_production"
]
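# Read GOOGLE_API_KEY from the environment (or a .env file) via python-decouple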
GEMINI_API_KEY = config("GOOGLE_API_KEY", cast=str)
# This automatically reads in the configuration
@hydra.main(config_name='config')
def go(config: DictConfig):
# Setup the wandb experiment. All runs will be grouped under this name
os.environ["WANDB_PROJECT"] = config["main"]["project_name"]
os.environ["WANDB_RUN_GROUP"] = config["main"]["experiment_name"]
# Steps to execute
steps_par = config['main']['steps']
active_steps = steps_par.split(",") if steps_par != "all" else _steps
# Move to a temporary directory
with tempfile.TemporaryDirectory() as tmp_dir:
if "get_documents" in active_steps:
# Download file and load in W&B
_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "components", "get_documents"),
"main",
parameters={
"document_folder": config["etl"]["document_folder"],
"path_document_folder": config["etl"]["path_document_folder"],
"artifact_name": "documents",
"artifact_type": "raw_data",
"artifact_description": "Raw file as downloaded"
},
)
if "etl_chromdb_scanned_pdf" in active_steps:
_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "src", "etl_chromdb_scanned_pdf"),
"main",
parameters={
"input_artifact": "documents:latest",
"output_artifact": "chromdb.zip",
"output_type": "chromdb",
"output_description": "Scanned Documents in pdf to be read amd stored in chromdb",
"gemini_api_key": GEMINI_API_KEY,
},
)
if "data_check" in active_steps:
_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "src", "data_check"),
"main",
parameters={
"csv": f"{config['data_check']['csv_to_check']}:latest",
"ref": "clean_sample.csv:reference",
"kl_threshold": config['data_check']['kl_threshold'],
"min_age": config['etl']['min_age'],
"max_age": config['etl']['max_age'],
"min_tenure": config['etl']['min_tenure'],
"max_tenure": config['etl']['max_tenure']
},
)
if "data_split" in active_steps:
_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "components", "train_val_test_split"),
"main",
parameters={
"input": "clean_sample.csv:latest",
"test_size": config['modeling']['test_size'],
"random_seed": config['modeling']['random_seed'],
"stratify_by": config['modeling']['stratify_by'],
},
)
if "train_random_forest_propensity" in active_steps:
# NOTE: we need to serialize the random forest configuration into JSON
rf_config = os.path.abspath("rf_config.json")
with open(rf_config, "w+") as fp:
json.dump(dict(config["modeling"]["random_forest_classifier_propensity"].items()), fp) # DO NOT TOUCH
# NOTE: use the rf_config we created as the rf_config parameter for the train_random_forest
# step
_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "src", "train_random_forest_propensity"),
"main",
parameters={
"trainval_artifact": "trainval_data.csv:latest",
"val_size": config['modeling']['val_size'],
"random_seed": config['modeling']['random_seed'],
"ls_output_columns": ','.join(config['modeling']['ls_output_columns']),
"product": config['modeling']['product_to_train'],
"stratify_by": config['modeling']['stratify_by'],
"n_folds": config['modeling']['n_folds'],
"rf_config": rf_config,
"output_artifact": "random_forest_export",
},
)
if "train_random_forest_revenue" in active_steps:
# NOTE: we need to serialize the random forest configuration into JSON
rf_config = os.path.abspath("rf_config_revenue.json")
with open(rf_config, "w+") as fp:
json.dump(dict(config["modeling"]["random_forest_regression_revenue"].items()), fp)
# NOTE: use the rf_config we created as the rf_config parameter for the train_random_forest
# step
_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "src", "train_random_forest_revenue"),
"main",
parameters={
"trainval_artifact": "trainval_data.csv:latest",
"val_size": config['modeling']['val_size'],
"random_seed": config['modeling']['random_seed'],
"ls_output_columns": ','.join(config['modeling']['ls_output_columns']),
"product": config['modeling']['product_to_train'],
"stratify_by": config['modeling']['stratify_by'],
"n_folds": config['modeling']['n_folds'],
"rf_config": rf_config,
"output_artifact": "random_forest_export",
},
)
if "train_lasso_revenue" in active_steps:
# NOTE: use the lasso_config we created as the lasso_config parameter for the train_lasso
lasso_config = os.path.abspath("lasso_config.json")
with open(lasso_config, "w+") as fp:
json.dump(dict(config["modeling"]["lasso_regression_revenue"].items()), fp)
_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "src", "train_lasso_revenue"),
"main",
parameters={
"trainval_artifact": "trainval_data.csv:latest",
"val_size": config['modeling']['val_size'],
"random_seed": config['modeling']['random_seed'],
"ls_output_columns": ','.join(config['modeling']['ls_output_columns']),
"product": config['modeling']['product_to_train'],
"stratify_by": config['modeling']['stratify_by'],
"n_folds": config['modeling']['n_folds'],
"lasso_config": lasso_config,
"output_artifact": "lasso_export",
},
)
if "test_model" in active_steps:
_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "components", "test_model"),
"main",
parameters={
"model_propensity_cc": config['best_model_propensity']['propensity_cc'],
"model_propensity_cl": config['best_model_propensity']['propensity_cl'],
"model_propensity_mf": config['best_model_propensity']['propensity_mf'],
"model_revenue_cc": config['best_model_revenue']['revenue_cc'],
"model_revenue_cl": config['best_model_revenue']['revenue_cl'],
"model_revenue_mf": config['best_model_revenue']['revenue_mf'],
"test_dataset": "test_data.csv:latest",
},
)
if "test_production" in active_steps:
_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "components", "test_production"),
"main",
parameters={
"model_propensity_cc": config['best_model_propensity']['propensity_cc'],
"model_propensity_cl": config['best_model_propensity']['propensity_cl'],
"model_propensity_mf": config['best_model_propensity']['propensity_mf'],
"model_revenue_cc": config['best_model_revenue']['revenue_cc'],
"model_revenue_cl": config['best_model_revenue']['revenue_cl'],
"model_revenue_mf": config['best_model_revenue']['revenue_mf'],
"test_dataset": f"{config['production']['test_csv']}:latest",
},
)
if __name__ == "__main__":
go()

16
app/llmops/python_env.yml Normal file
View File

@ -0,0 +1,16 @@
# Python version required to run the project.
python: "3.11.11"
# Dependencies required to build packages. This field is optional.
build_dependencies:
- pip==23.3.1
- setuptools
- wheel==0.37.1
- pyyaml
- hydra-core==1.3.2
- virtualenv
- python-decouple
# Dependencies required to run the project.
dependencies:
- mlflow==2.8.1
- wandb==0.19.4

View File

@ -0,0 +1,34 @@
name: etl_chromdb_scanned_pdf
python_env: python_env.yml
entry_points:
main:
parameters:
input_artifact:
description: Fully-qualified name for the input artifact
type: string
output_artifact:
description: Name for the output artifact
type: string
output_type:
description: Type for the artifact output
type: string
output_description:
description: Description for the artifact
type: string
gemini_api_key:
description: API key for Gemini
type: string
command: >-
python run.py --input_artifact {input_artifact} \
--output_artifact {output_artifact} \
--output_type {output_type} \
--output_description {output_description} \
--gemini_api_key {gemini_api_key}

View File

@ -0,0 +1,16 @@
# Python version required to run the project.
python: "3.11.11"
# Dependencies required to build packages. This field is optional.
build_dependencies:
- pip==23.3.1
- setuptools
- wheel==0.37.1
- chromadb
- openai
- pytesseract
- pdf2image
- langchain
# Dependencies required to run the project.
dependencies:
- mlflow==2.8.1
- wandb==0.16.0

View File

@ -0,0 +1,173 @@
#!/usr/bin/env python
"""
Download from W&B the raw dataset and apply some basic data cleaning, exporting the result to a new artifact
"""
import argparse
import logging
import os
import wandb
import shutil
import chromadb
from openai import OpenAI
from typing import List
import numpy as np
import pytesseract as pt
from pdf2image import convert_from_path
from langchain.schema import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
logging.basicConfig(level=logging.INFO, format="%(asctime)-15s %(message)s")
logger = logging.getLogger()
def extract_text_from_pdf_ocr(pdf_path):
try:
images = convert_from_path(pdf_path) # Convert PDF pages to images
extracted_text = ""
for image in images:
text = pt.image_to_string(image, lang="chi_sim+eng") # chi_sim for Simplified Chinese, chi_tra for Traditional
extracted_text += text + "\n"
return extracted_text
except ImportError:
print("Error: pdf2image or pytesseract not installed. Please install them: pip install pdf2image pytesseract")
return ""
except Exception as e:
print(f"OCR failed: {e}")
return ""
def go(args):
"""
Run the ETL for ChromaDB with scanned PDFs
"""
run = wandb.init(job_type="etl_chromdb_scanned_pdf", entity='aimingmed')
run.config.update(args)
# Setup the Gemini client
client = OpenAI(
api_key=args.gemini_api_key,
base_url="https://generativelanguage.googleapis.com/v1beta/openai/"
)
def get_google_embedding(text: str) -> List[float]:
response = client.embeddings.create(
model="text-embedding-004",
input=text
)
return response.data[0].embedding
class GeminiEmbeddingFunction(object):
def __init__(self, api_key: str, base_url: str, model_name: str):
self.client = OpenAI(
api_key=api_key,
base_url=base_url
)
self.model_name = model_name
def __call__(self, input: List[str]) -> List[List[float]]:
all_embeddings = []
for text in input:
response = self.client.embeddings.create(input=text, model=self.model_name)
embeddings = [record.embedding for record in response.data]
all_embeddings.append(np.array(embeddings[0]))
return all_embeddings
# Define embedding function
gemini_ef = GeminiEmbeddingFunction(
api_key=args.gemini_api_key,
base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
model_name="text-embedding-004"
)
# Create database, delete the database directory if it exists
db_folder = "chroma_db"
db_path = os.path.join(os.getcwd(), db_folder)
if os.path.exists(db_path):
shutil.rmtree(db_path)
os.makedirs(db_path)
chroma_client = chromadb.PersistentClient(path=db_path)
collection_name = "rag_experiment"
db = chroma_client.create_collection(name=collection_name, embedding_function=gemini_ef)
logger.info("Downloading artifact")
artifact_local_path = run.use_artifact(args.input_artifact).file()
logger.info("Reading data")
# unzip the downloaded artifact
import zipfile
with zipfile.ZipFile(artifact_local_path, 'r') as zip_ref:
zip_ref.extractall(".")
os.remove(artifact_local_path)
# derive the name of the unzipped documents folder from the artifact file name
documents_folder = os.path.splitext(os.path.basename(artifact_local_path))[0]
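# split the OCR-extracted text into overlapping chunks before adding them to the ChromaDB collection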
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
for root, _dir, files in os.walk(f"./{documents_folder}"):
for file in files:
if file.endswith(".pdf"):
read_text = extract_text_from_pdf_ocr(os.path.join(root, file))
document = Document(page_content=read_text)
all_splits = text_splitter.split_documents([document])
for i, split in enumerate(all_splits):
db.add(documents=[split.page_content],
metadatas=[{"filename": file}],
ids=[file[:-4] + str(i)])
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="A very basic data cleaning")
parser.add_argument(
"--input_artifact",
type=str,
help="Fully-qualified name for the input artifact",
required=True
)
parser.add_argument(
"--output_artifact",
type=str,
help="Name for the output artifact",
required=True
)
parser.add_argument(
"--output_type",
type=str,
help="Type for the artifact output",
required=True
)
parser.add_argument(
"--output_description",
type=str,
help="Description for the artifact",
required=True
)
parser.add_argument(
"--gemini_api_key",
type=str,
help="API key for the Gemini service",
required=True
)
args = parser.parse_args()
go(args)

View File

@ -1,25 +0,0 @@
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"
[packages]
python-decouple = "*"
langchain = "*"
google-genai = "*"
chromadb = "*"
SQLAlchemy = "*"
wandb = "*"
langchain-google-genai = "*"
langchain-google-vertexai = "*"
langchain-mongodb = "*"
langgraph = "*"
langchain-community = "*"
beautifulsoup4 = "*"
openai = "*"
pydantic = "*"
[dev-packages]
[requires]
python_version = "3.10"

File diff suppressed because it is too large.

View File

@ -1,23 +0,0 @@
# LLM Project Template
This template provides a starting point for building LLM-powered applications with Langchain, AI Agents, RAG, Summarization, Question-Answering with SQL, and LLMOps.
## Requirements
- Python 3.9+
- Pipenv
## Installation
```bash
pipenv install
```
## Usage
1. Set the `OPENAI_API_KEY`, `WANDB_API_KEY`, `GOOGLE_API_KEY`, and `LANGSMITH_API_KEY` environment variables in `config.py`.
2. Run the `main.py` file:
```bash
pipenv run python main.py
```

View File

@ -1,3 +0,0 @@
# agent1.py
# Add your agent implementation here

View File

@ -1,3 +0,0 @@
# agent2.py
# Add your agent implementation here

View File

@ -1,3 +0,0 @@
# qa_chain.py
# Add your question-answering chain implementation here

View File

@ -1,3 +0,0 @@
# rag_chain.py
# Add your RAG chain implementation here

View File

@ -1,3 +0,0 @@
# sql_chain.py
# Add your question-answering with SQL chain implementation here

View File

@ -1,3 +0,0 @@
# summarization_chain.py
# Add your summarization chain implementation here

View File

@ -1,3 +0,0 @@
# deployment.py
# Add your deployment scripts and configurations here

View File

@ -1,3 +0,0 @@
# evaluation.py
# Add your evaluation metrics and scripts here

View File

@ -1,3 +0,0 @@
# monitoring.py
# Add your monitoring and logging implementation here

View File

@ -1,31 +0,0 @@
import os
from decouple import config
from langchain.chat_models import init_chat_model
from langchain_google_vertexai import VertexAIEmbeddings
from langchain_mongodb import MongoDBAtlasVectorSearch
# Get the BASE_URL from the environment variables
GOOGLE_API_KEY = config("GOOGLE_API_KEY", cast=str)
WANDB_API_KEY = config("WANDB_API_KEY", cast=str)
LANGSMITH_API_KEY = config("LANGSMITH_API_KEY", cast=str)
LANGCHAIN_PROJECT = config("LANGCHAIN_PROJECT", cast=str)
# Set environment variables
os.environ["LANGCHAIN_TRACING"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_PROJECT"] = LANGCHAIN_PROJECT
os.environ["LANGCHAIN_API_KEY"] = LANGSMITH_API_KEY
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY
llm = init_chat_model("gemini-2.0-flash-001", model_provider="google_vertexai")
embeddings = VertexAIEmbeddings(model="text-embedding-004")
vector_store = MongoDBAtlasVectorSearch(
embedding=embeddings,
collection=MONGODB_COLLECTION,
index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
relevance_score_fn="cosine",
)

View File

@ -1,125 +0,0 @@
# main.py
import os
import wandb
from config import GOOGLE_API_KEY, WANDB_API_KEY, LANGSMITH_API_KEY, LANGCHAIN_PROJECT
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.callbacks import LangChainTracer
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
# Set LangSmith environment variables
os.environ["LANGCHAIN_TRACING"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_PROJECT"] = LANGCHAIN_PROJECT
os.environ["LANGCHAIN_API_KEY"] = LANGSMITH_API_KEY
# Initialize Weights & Biases
wandb.login(key=WANDB_API_KEY)
run = wandb.init(project=LANGCHAIN_PROJECT, entity="aimingmed")
# Initialize Gemini API
tracer = LangChainTracer()
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash-001", google_api_key=GOOGLE_API_KEY, callbacks=[tracer])
# Example usage of Gemini API
prompt_template = PromptTemplate(template="Write a short poem about the sun.", input_variables=[])
chain = LLMChain(llm=llm, prompt=prompt_template)
response = chain.run({})
# print(response)
import time
from langsmith import Client, Run
from langsmith.evaluation import EvaluationResult
# Initialize LangSmith client
client = Client(api_key=LANGSMITH_API_KEY) # Replace with your API key
project_name = "my-gemini-evaluation" # Your LangSmith project name
def evaluate_gemini_response(prompt, expected_response, gemini_response):
"""Evaluates Gemini's response against an expected response and logs to LangSmith."""
# 1. Create a Run object (This is the correct way for manual logging)
run = Run(
client=client,
project_name=project_name,
inputs={"prompt": prompt, "expected_response": expected_response}, # Log inputs here
)
try: # Use a try-except block for proper error handling
start_time = time.time()
# ... (Your Gemini API call goes here) ...
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash-001", google_api_key=GOOGLE_API_KEY, callbacks=[tracer])
# Example usage of Gemini API
prompt_template = PromptTemplate(template="Start from here: ", input_variables=[])
chain = LLMChain(llm=llm, prompt=prompt_template)
gemini_response = chain.run({}) # Replace with your actual Gemini call
end_time = time.time()
latency = end_time - start_time
run.outputs = {"gemini_response": gemini_response} # Log outputs
run.latency = latency # Log latency
# 2. End the Run (Important!)
run.end() # Mark the run as complete.
# 3. Evaluate and log the result
evaluation_result = evaluate_response(expected_response, gemini_response)
run.create_evaluation(evaluation_result) # Log the evaluation
return evaluation_result
except Exception as e: # Handle exceptions
run.end(error=str(e)) # Log the error in LangSmith
print(f"Error during Gemini call or evaluation: {e}")
return None # Or handle the error as needed
def evaluate_response(expected_response, gemini_response):
"""Performs the actual evaluation logic. Customize this!"""
# Example 1: Exact match (simple, but often not realistic)
if expected_response.strip().lower() == gemini_response.strip().lower():
score = 1.0 # Perfect match
feedback = "Exact match!"
# Example 2: Keyword overlap (more flexible)
elif any(keyword in gemini_response.lower() for keyword in expected_response.lower().split()):
score = 0.7 # Partial match (adjust score as needed)
feedback = "Keyword overlap."
# Example 3: Semantic similarity (requires external library/API) - Advanced!
# ... (Use a library like SentenceTransformers or an API for semantic similarity) ...
else:
score = 0.0
feedback = "No match."
# Create a LangSmith EvaluationResult object
evaluation_result = EvaluationResult(
score=score,
value=gemini_response, # The actual response being evaluated
comment=feedback,
# Other metadata you might want to add, like prompt or expected response
metadata = {"expected_response": expected_response}
)
return evaluation_result
# Example usage:
prompt = "Translate 'Hello, world!' to French."
expected_response = "Bonjour le monde !"
gemini_response = "Bonjour monde !" # Or get the actual response from Gemini
evaluation = evaluate_gemini_response(prompt, expected_response, gemini_response)
print(f"Evaluation Score: {evaluation.score}, Feedback: {evaluation.comment}")
# Another example
prompt = "What is the capital of France?"
expected_response = "Paris"
gemini_response = "The capital of France is Paris."
evaluation = evaluate_gemini_response(prompt, expected_response, gemini_response)
print(f"Evaluation Score: {evaluation.score}, Feedback: {evaluation.comment}")
wandb.finish()

View File

@ -1,6 +0,0 @@
langchain
google-generativeai
chromadb
SQLAlchemy
weights&biases
langsmith

View File

@ -1,3 +0,0 @@
# data_loader.py
# Add your data loading functions here

View File

@ -1,3 +0,0 @@
# embedding_utils.py
# Add your embedding utilities here