from typing import List
import os
import tempfile
from pathlib import Path
import pickle
from uuid import uuid4
import cloudpickle
import pandas as pd
import mlflow
from mlflow.tracking import MlflowClient
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ParameterGrid, train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.compose import ColumnTransformer
from sklearn.exceptions import NotFittedError
from sklearn.utils.validation import check_is_fitted
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import MissingIndicator, SimpleImputer
from sklearn.pipeline import Pipeline as SKPipeline
from sklearn.preprocessing import (
FunctionTransformer,
OneHotEncoder,
OrdinalEncoder,
StandardScaler,
)
from hycastle.lens.base import BaseLens
from hycastle.lens.transformers import DateTimeExploder, timedelta_as_hours
%matplotlib inline
HyMind Machine Learning
Imports
# Load the EMAP database credentials from a local secrets file.
# The file content is stripped and split on newlines; the two-target
# unpacking below therefore requires exactly two lines, which are stored in
# EMAP_DB_USER and EMAP_DB_PASSWORD (in that order).
secret_path = 'secret'
os.environ['EMAP_DB_USER'], os.environ['EMAP_DB_PASSWORD'] = (
    Path(secret_path).read_text().strip().split('\n')
)
from hylib.dt import LONDON_TZ
from hycastle.lens.icu import BournvilleICUSitRepLens
from hycastle.icu_store.live import live_dataset
from hycastle.icu_store.retro import retro_dataset
from hymind.lib.models.icu_aggregate import AggregateDemandModel
MLflow Init
# Point MLflow at the tracking server named by HYMIND_REPO_TRACKING_URI.
# os.getenv returns None when the variable is unset; that value is passed
# through to set_tracking_uri unchanged.
mlflow_var = os.getenv('HYMIND_REPO_TRACKING_URI')
mlflow.set_tracking_uri(mlflow_var)

# Client used further down to look up model versions, runs and artifacts.
client = MlflowClient()
Data
# Retrospective dataset for 'T03'; used below both as the lens input and as
# the source of the 'discharged_in_48hr' label.
df = retro_dataset('T03')

# Notebook display expressions: dimensions and first rows of the dataset.
df.shape
df.head()
Lens
# lens = BournvilleICUSitRepLens()
class DemoLens(BaseLens):
    """Demo lens: declares the input columns and the per-column preprocessing.

    NOTE(review): the meaning of ``numeric_output`` and ``index_col`` is
    defined by ``BaseLens``, which is not visible here -- confirm there.
    """

    numeric_output = True
    index_col = "episode_slice_id"

    @property
    def input_cols(self) -> List[str]:
        """Return the names of the columns this lens takes as input."""
        return [
            "episode_slice_id",
            "admission_age_years",
            "avg_heart_rate_1_24h",
            "max_temp_1_12h",
            "avg_resp_rate_1_24h",
            "elapsed_los_td",
            "admission_dt",
            "horizon_dt",
            "n_inotropes_1_4h",
            "wim_1",
            "bay_type",
            "sex",
            "vent_type_1_4h",
        ]

    def specify(self) -> ColumnTransformer:
        """Return the ColumnTransformer mapping input columns to features.

        Each entry is a ``(name, transformer, columns)`` triple.
        """
        return ColumnTransformer(
            [
                # Columns forwarded unchanged.
                (
                    "select",
                    "passthrough",
                    [
                        "episode_slice_id",
                        "admission_age_years",
                        "n_inotropes_1_4h",
                        "wim_1",
                    ],
                ),
                ("bay_type_enc", OneHotEncoder(), ["bay_type"]),
                # Categories not seen during fit are encoded as -1.
                (
                    "sex_enc",
                    OrdinalEncoder(
                        handle_unknown="use_encoded_value", unknown_value=-1
                    ),
                    ["sex"],
                ),
                (
                    "admission_dt_exp",
                    DateTimeExploder(),
                    ["admission_dt", "horizon_dt"],
                ),
                # Categories not seen during fit are encoded as -1.
                (
                    "vent_type_1_4h_enc",
                    OrdinalEncoder(
                        handle_unknown="use_encoded_value", unknown_value=-1
                    ),
                    ["vent_type_1_4h"],
                ),
                # Missing vitals are filled with the column mean; no
                # missing-indicator columns are added.
                (
                    "vitals_impute",
                    SimpleImputer(strategy="mean", add_indicator=False),
                    [
                        "avg_heart_rate_1_24h",
                        "max_temp_1_12h",
                        "avg_resp_rate_1_24h",
                    ],
                ),
                (
                    "elapsed_los_td_hrs",
                    FunctionTransformer(timedelta_as_hours),
                    ["elapsed_los_td"],
                ),
            ]
        )
# Fit the lens on the retrospective data and build the feature matrix.
lens = DemoLens()
X = lens.fit_transform(df)

# Target: the 'discharged_in_48hr' column cast to int.
y = df['discharged_in_48hr'].astype(int)

# Hold out 20% of the rows for validation. No random_state is set, so the
# split differs between executions.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)
Dummy run
= RandomForestClassifier(n_jobs=-1, n_estimators=50, max_depth=2)
m %time m.fit(X_train.values, y_train.values.ravel())
Experiment
Utils
# Scratch directory (relative to the working directory) where artifacts are
# written before being uploaded to MLflow. Created if missing; an existing
# directory is not an error.
tmp_path = Path('tmp')
tmp_path.mkdir(parents=True, exist_ok=True)
def mlflow_log_string(text, filename):
    """Write ``str(text)`` to ``tmp_path / filename`` and log it as an artifact.

    Args:
        text: Any object; its ``str()`` representation is written.
        filename: File name created inside ``tmp_path``. An existing file of
            the same name is overwritten.
    """
    full_path = tmp_path / filename
    with open(full_path, 'w') as f:
        f.write(str(text))

    # Logged to the currently active MLflow run.
    mlflow.log_artifact(full_path)
def mlflow_log_tag_dict(tag_dict, filename):
    """Logs tag dict to MLflow (while preserving order unlike mlflow.log_dict)

    Args:
        tag_dict: Mapping to serialise as YAML.
        filename: File name created inside ``tmp_path``. An existing file of
            the same name is overwritten.
    """
    # `yaml` is used by this function but never imported at the top of the
    # file, so import it here to avoid a NameError on first call.
    import yaml

    full_path = tmp_path / filename
    with open(full_path, 'w') as f:
        # sort_keys=False keeps the dict's insertion order in the output.
        yaml.dump(tag_dict, f, sort_keys=False)

    # Logged to the currently active MLflow run.
    mlflow.log_artifact(full_path)
def mlflow_log_lens(l):
    """Pickle a lens into ``tmp_path`` and log it under the 'lens' artifact path.

    Args:
        l: Object exposing ``pickle(directory)``; the value it returns is
            passed to ``mlflow.log_artifact`` as the local file to upload.
    """
    full_path = l.pickle(tmp_path)
    mlflow.log_artifact(full_path, 'lens')
# Owner|Type|Name|Date
exp_name = 'NS|models|jendemo|2021-10-05'

# Exported so that subsequent mlflow.start_run() calls pick up this experiment.
os.environ["MLFLOW_EXPERIMENT_NAME"] = exp_name

# mlflow.create_experiment raises when the name is already taken, which made
# this cell fail on every re-run. Look the experiment up first and only
# create it when it does not exist yet.
existing_experiment = mlflow.get_experiment_by_name(exp_name)
if existing_experiment is None:
    experiment_id = mlflow.create_experiment(exp_name)
else:
    experiment_id = existing_experiment.experiment_id

experiment_id
def artifact_path():
    """Return the active run's artifact location as a Path, creating it if missing.

    NOTE(review): ``mlflow.get_artifact_uri()`` returns a URI; wrapping it in
    ``Path`` presumably only makes sense for a plain local filesystem
    location -- confirm against the configured artifact store.

    Returns:
        Path built from ``mlflow.get_artifact_uri()``.
    """
    pth = Path(mlflow.get_artifact_uri())
    pth.mkdir(parents=True, exist_ok=True)
    return pth
Parameter Grid
# Hyper-parameter values expanded by ParameterGrid in the run loop
# (2 x 2 = 4 combinations).
grid = {
    'n_estimators': [5, 10],
    'max_depth': [2, 10],
}
Run
# Number of repeated runs per parameter combination; the repeats are averaged
# when the best parameter set is selected.
runs_per_param_set = 2

for i in range(runs_per_param_set):
    for g in ParameterGrid(grid):
        m = RandomForestClassifier(n_jobs=-1)

        # One MLflow run per (repeat, parameter combination).
        with mlflow.start_run():
            #mlflow_logs()
            m.set_params(**g)

            mlflow.log_params(g)
            m.fit(X_train.values, y_train.values.ravel())
            eval_df = pd.DataFrame({
                'predict_proba': m.predict_proba(X_valid.values)[:, 1],
                'label': y_valid.to_numpy().ravel()
            }, columns=['predict_proba', 'label'])

            # Score on plain arrays, matching how the model was fitted
            # (fit received X_train.values, not the DataFrame).
            train_accuracy = m.score(X_train.values, y_train.to_numpy())
            mlflow.log_metric('train_accuracy', train_accuracy)
            valid_accuracy = m.score(X_valid.values, y_valid.to_numpy())
            mlflow.log_metric('valid_accuracy', valid_accuracy)

            # confusion_matrix expects (y_true, y_pred). The arguments were
            # previously swapped, which logged the transposed matrix.
            train_confusion = confusion_matrix(y_train.to_numpy(), m.predict(X_train.values))
            mlflow_log_string(train_confusion, 'train_confusion.txt')
            valid_confusion = confusion_matrix(y_valid.to_numpy(), m.predict(X_valid.values))
            mlflow_log_string(valid_confusion, 'valid_confusion.txt')

            mlflow.sklearn.log_model(m, 'model')
Select Best Run
# All runs returned by mlflow.search_runs() as a DataFrame.
runs = mlflow.search_runs()
runs.head()

# Columns holding the logged hyper-parameters.
params = [col for col in runs if col.startswith('params')]

# Parameter combination with the highest mean validation accuracy across
# its repeated runs.
best_params = runs.groupby(params)['metrics.valid_accuracy'].mean().idxmax()

# Wrap the label in a list so .loc always yields a DataFrame. With a bare
# label, a combination matched by a single run collapses to a Series, and
# taking the first element of its 'run_id' would then return only the first
# character of the id.
best_row = runs.set_index(keys=params).loc[[best_params]]
best_row

# First run listed for the winning combination.
best_run_id = best_row['run_id'].iloc[0]
best_run_id
Tag Best Run
# Re-open the selected run so the tag is attached to it.
with mlflow.start_run(run_id=best_run_id):
    # tag the run as the best run
    mlflow.set_tag('best_run', 1)
Log Lens
# Re-open the selected run and attach the fitted lens to it as an artifact.
with mlflow.start_run(run_id=best_run_id):
    mlflow_log_lens(lens)
Register Model from Best Run for Deployment
model_name = 'demo-model-jen'

# Register the model logged under the best run. register_model returns the
# ModelVersion it created; take the version number from it rather than
# hard-coding 1, which is only correct the first time this model name is
# registered.
registered_model = mlflow.register_model(f'runs:/{best_run_id}/model', model_name)
version = int(registered_model.version)

registered_model
Simplified Inference Pathway
Find Registered Model
# Look up the registered model version by name and version number.
model_info = client.get_model_version(model_name, version)
model_info

# The run that produced this model version.
run_info = client.get_run(model_info.run_id)
run_info
Load Model using Name & Version
# Load the registered model through its models:/<name>/<version> URI.
model = mlflow.sklearn.load_model(f'models:/{model_name}/{version}')
model
Get logged Lens
# Download the 'lens' artifact of the model's run into a throw-away directory
# and unpickle it. The object stays in memory after the directory is removed.
with tempfile.TemporaryDirectory() as tmp:
    tmp_dir = Path(tmp)

    client.download_artifacts(model_info.run_id, 'lens', tmp_dir)

    # First *.pkl file found anywhere below the downloaded 'lens' directory;
    # next() raises StopIteration if there is none.
    lens_path = next((tmp_dir / 'lens').rglob('*.pkl'))
    with open(lens_path, 'rb') as f:
        # SECURITY: pickle.load executes arbitrary code from the file. Only
        # use this with a tracking server / artifact store that is trusted.
        loaded_lens = pickle.load(f)

loaded_lens
Predict Individual
# Live dataset for 'T03'.
live_df = live_dataset('T03')

# Notebook display: a few identifying columns for the five most recent admissions.
live_df.loc[:, ['episode_slice_id', 'admission_dt', 'bed_code', 'avg_heart_rate_1_24h']].sort_values('admission_dt', ascending=False).head()

# Apply the lens recovered from MLflow (transform only, no re-fit).
X_df = loaded_lens.transform(live_df)

# Column 1 of predict_proba is the probability of the second class in
# model.classes_ (presumably label 1, discharged within 48h -- confirm).
predictions = model.predict_proba(X_df)
live_df['prediction'] = predictions[:, 1]

# The typographic quotes in the original were a syntax error; plain ASCII
# quotes restore the column selection.
live_df.loc[:, ['episode_slice_id', 'prediction']]