import json
import mlflow
import tempfile
import os
import wandb
import hydra
from omegaconf import DictConfig
from decouple import config

_steps = [
    "get_documents",
    "etl_chromdb_scanned_pdf",
    "data_check",
    "data_split",
    "train_random_forest_propensity",
    "train_random_forest_revenue",
    "train_lasso_revenue",
    # NOTE: the two test steps below assume a model export has already been
    # promoted to "prod"; promote the chosen exports before running them.
    "test_model",
    "test_production"
]
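
# Read the Gemini API key via python-decouple (environment variable or a local
# .env file). This happens at import time, before the decouple `config` helper
# is shadowed by the Hydra `config` argument inside go().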
GEMINI_API_KEY = config("GOOGLE_API_KEY", cast=str)


# This automatically reads in the configuration
@hydra.main(config_name='config')
def go(config: DictConfig):
    # Setup the wandb experiment. All runs will be grouped under this name
    os.environ["WANDB_PROJECT"] = config["main"]["project_name"]
    os.environ["WANDB_RUN_GROUP"] = config["main"]["experiment_name"]

    # Steps to execute
    steps_par = config['main']['steps']
    active_steps = steps_par.split(",") if steps_par != "all" else _steps
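
    # Usage sketch (hypothetical invocation, assuming this file is main.py):
    #   python main.py main.steps=data_check
    # main.steps may also be a comma-separated subset of _steps, or "all" to
    # run every entry in _steps.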

    # Move to a temporary directory
    with tempfile.TemporaryDirectory() as tmp_dir:

        if "get_documents" in active_steps:
            # Download file and load in W&B
            _ = mlflow.run(
                os.path.join(hydra.utils.get_original_cwd(), "components", "get_documents"),
                "main",
                parameters={
                    "document_folder": config["etl"]["document_folder"],
                    "path_document_folder": config["etl"]["path_document_folder"],
                    "artifact_name": "documents",
                    "artifact_type": "raw_data",
                    "artifact_description": "Raw file as downloaded"
                },
            )

        if "etl_chromdb_scanned_pdf" in active_steps:
            _ = mlflow.run(
                os.path.join(hydra.utils.get_original_cwd(), "src", "etl_chromdb_scanned_pdf"),
                "main",
                parameters={
                    "input_artifact": "documents:latest",
                    "output_artifact": "chromdb.zip",
                    "output_type": "chromdb",
                    "output_description": "Scanned documents in PDF to be read and stored in chromdb",
                    "gemini_api_key": GEMINI_API_KEY,
                },
            )

        if "data_check" in active_steps:
            _ = mlflow.run(
                os.path.join(hydra.utils.get_original_cwd(), "src", "data_check"),
                "main",
                parameters={
                    "csv": f"{config['data_check']['csv_to_check']}:latest",
                    "ref": "clean_sample.csv:reference",
                    "kl_threshold": config['data_check']['kl_threshold'],
                    "min_age": config['etl']['min_age'],
                    "max_age": config['etl']['max_age'],
                    "min_tenure": config['etl']['min_tenure'],
                    "max_tenure": config['etl']['max_tenure']
                },
            )

        if "data_split" in active_steps:
            _ = mlflow.run(
                os.path.join(hydra.utils.get_original_cwd(), "components", "train_val_test_split"),
                "main",
                parameters={
                    "input": "clean_sample.csv:latest",
                    "test_size": config['modeling']['test_size'],
                    "random_seed": config['modeling']['random_seed'],
                    "stratify_by": config['modeling']['stratify_by'],
                },
            )
if "train_random_forest_propensity" in active_steps:
# NOTE: we need to serialize the random forest configuration into JSON
rf_config = os.path.abspath("rf_config.json")
with open(rf_config, "w+") as fp:
json.dump(dict(config["modeling"]["random_forest_classifier_propensity"].items()), fp) # DO NOT TOUCH
# NOTE: use the rf_config we created as the rf_config parameter for the train_random_forest
# step
_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "src", "train_random_forest_propensity"),
"main",
parameters={
"trainval_artifact": "trainval_data.csv:latest",
"val_size": config['modeling']['val_size'],
"random_seed": config['modeling']['random_seed'],
"ls_output_columns": ','.join(config['modeling']['ls_output_columns']),
"product": config['modeling']['product_to_train'],
"stratify_by": config['modeling']['stratify_by'],
"n_folds": config['modeling']['n_folds'],
"rf_config": rf_config,
"output_artifact": "random_forest_export",
},
)
if "train_random_forest_revenue" in active_steps:
# NOTE: we need to serialize the random forest configuration into JSON
rf_config = os.path.abspath("rf_config_revenue.json")
with open(rf_config, "w+") as fp:
json.dump(dict(config["modeling"]["random_forest_regression_revenue"].items()), fp)
# NOTE: use the rf_config we created as the rf_config parameter for the train_random_forest
# step
_ = mlflow.run(
os.path.join(hydra.utils.get_original_cwd(), "src", "train_random_forest_revenue"),
"main",
parameters={
"trainval_artifact": "trainval_data.csv:latest",
"val_size": config['modeling']['val_size'],
"random_seed": config['modeling']['random_seed'],
"ls_output_columns": ','.join(config['modeling']['ls_output_columns']),
"product": config['modeling']['product_to_train'],
"stratify_by": config['modeling']['stratify_by'],
"n_folds": config['modeling']['n_folds'],
"rf_config": rf_config,
"output_artifact": "random_forest_export",
},
)
if "train_lasso_revenue" in active_steps:
            # NOTE: use the lasso_config we created as the lasso_config parameter
            # for the train_lasso_revenue step
            lasso_config = os.path.abspath("lasso_config.json")
            with open(lasso_config, "w+") as fp:
                json.dump(dict(config["modeling"]["lasso_regression_revenue"].items()), fp)

            _ = mlflow.run(
                os.path.join(hydra.utils.get_original_cwd(), "src", "train_lasso_revenue"),
                "main",
                parameters={
                    "trainval_artifact": "trainval_data.csv:latest",
                    "val_size": config['modeling']['val_size'],
                    "random_seed": config['modeling']['random_seed'],
                    "ls_output_columns": ','.join(config['modeling']['ls_output_columns']),
                    "product": config['modeling']['product_to_train'],
                    "stratify_by": config['modeling']['stratify_by'],
                    "n_folds": config['modeling']['n_folds'],
                    "lasso_config": lasso_config,
                    "output_artifact": "lasso_export",
                },
            )
if "test_model" in active_steps:
            _ = mlflow.run(
                os.path.join(hydra.utils.get_original_cwd(), "components", "test_model"),
                "main",
                parameters={
                    "model_propensity_cc": config['best_model_propensity']['propensity_cc'],
                    "model_propensity_cl": config['best_model_propensity']['propensity_cl'],
                    "model_propensity_mf": config['best_model_propensity']['propensity_mf'],
                    "model_revenue_cc": config['best_model_revenue']['revenue_cc'],
                    "model_revenue_cl": config['best_model_revenue']['revenue_cl'],
                    "model_revenue_mf": config['best_model_revenue']['revenue_mf'],
                    "test_dataset": "test_data.csv:latest",
                },
            )

        if "test_production" in active_steps:
            _ = mlflow.run(
                os.path.join(hydra.utils.get_original_cwd(), "components", "test_production"),
                "main",
                parameters={
                    "model_propensity_cc": config['best_model_propensity']['propensity_cc'],
                    "model_propensity_cl": config['best_model_propensity']['propensity_cl'],
                    "model_propensity_mf": config['best_model_propensity']['propensity_mf'],
                    "model_revenue_cc": config['best_model_revenue']['revenue_cc'],
                    "model_revenue_cl": config['best_model_revenue']['revenue_cl'],
                    "model_revenue_mf": config['best_model_revenue']['revenue_mf'],
                    "test_dataset": f"{config['production']['test_csv']}:latest",
                },
            )


if __name__ == "__main__":
    go()