14. MLflow

MLflow is YALF (yet another logging framework) for data science experiments. You can log just about anything important from your experiments including stats, performances, parameters, figures, intermediary results, models, etc. Many different data science frameworks (or flavors) are supported, including Scikit-Learn, PyTorch, XGBoost, etc. Since you can log the models (or persist the models), you can even use MLflow to depersist these models and make predictions! Let’s see how MLflow works.

Here, we create a classification problem with the typical Xy data shapes.

[1]:
from sklearn.datasets import make_classification
import pandas as pd

def get_Xy():
    """Generate a synthetic binary classification data set.

    Returns:
        A tuple ``(X, y)``: ``X`` is a DataFrame with columns ``x0`` .. ``x9``
        built from the feature matrix returned by ``make_classification``,
        and ``y`` is the label array returned alongside it.
    """
    n_features = 10
    feature_names = [f'x{i}' for i in range(n_features)]

    # The fixed random_state keeps the generated data identical across runs.
    data, labels = make_classification(
        n_samples=2000,
        n_features=n_features,
        n_informative=2,
        n_redundant=2,
        n_repeated=0,
        n_classes=2,
        n_clusters_per_class=1,
        random_state=37,
    )

    return pd.DataFrame(data, columns=feature_names), labels

# Build the data set, then display the shapes of X and y and the feature names.
X, y = get_Xy()
X.shape, y.shape, X.columns
[1]:
((2000, 10),
 (2000,),
 Index(['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9'], dtype='object'))

Then, we split Xy into training X_tr, y_tr and testing X_te, y_te.

[2]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
split_params = {
    'test_size': 0.3,
    'random_state': 37
}
X_tr, X_te, y_tr, y_te = train_test_split(X, y, **split_params)

# Display the shapes of the resulting folds.
X_tr.shape, y_tr.shape, X_te.shape, y_te.shape
[2]:
((1400, 10), (1400,), (600, 10), (600,))

We are going to fit a logistic regression model and look at the performances for both training and testing sets.

[3]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import average_precision_score, roc_auc_score, precision_score, recall_score, f1_score
import numpy as np

def get_model():
    """Build an (unfitted) grid search over logistic regression models.

    The search tries both ``l1`` and ``l2`` penalties and several values of
    ``C`` using 5-fold stratified cross-validation. Each candidate is scored
    on ROC AUC (``auc``) and average precision (``apr``); the final model is
    refit using the best ``apr`` score.

    Returns:
        A ``GridSearchCV`` instance ready to be fitted.
    """
    # 'saga' is used because the grid below searches both l1 and l2 penalties;
    # the 'penalty' given here is only a starting value that the grid overrides.
    estimator = LogisticRegression(**{
        'solver': 'saga',
        'penalty': 'l2',
        'random_state': 37,
        'max_iter': 1_000
    })

    cv = StratifiedKFold(**{
        'n_splits': 5,
        'shuffle': True,
        'random_state': 37
    })

    model = GridSearchCV(**{
        'estimator': estimator,
        'cv': cv,
        'param_grid': {
            'penalty': ['l1', 'l2'],
            'C': [0.01, 0.5, 0.7, 0.9]
        },
        'scoring': {
            'auc': 'roc_auc',
            'apr': 'average_precision'
        },
        'verbose': 5,
        'refit': 'apr',
        # np.nan (lowercase) is the canonical spelling; the np.NaN alias was
        # removed in NumPy 2.0 and raises AttributeError there.
        'error_score': np.nan,
        'n_jobs': -1
    })

    return model

def get_performances(X_tr, y_tr, X_te, y_te, model):
    """Score a fitted classifier on the training and testing folds.

    Precision, recall and F1 are computed from ``model.predict``; ROC AUC and
    average precision are computed from the second column of
    ``model.predict_proba``.

    Args:
        X_tr: Training features.
        y_tr: Training labels.
        X_te: Testing features.
        y_te: Testing labels.
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.

    Returns:
        A dict keyed ``<metric>_<fold>`` where metric is one of ``pre``,
        ``rec``, ``f1``, ``auc``, ``aps`` and fold is ``tr`` or ``te``.
    """
    folds = (('tr', X_tr, y_tr), ('te', X_te, y_te))
    label_scorers = (
        ('pre', precision_score),
        ('rec', recall_score),
        ('f1', f1_score),
    )
    prob_scorers = (
        ('auc', roc_auc_score),
        ('aps', average_precision_score),
    )

    perf_metrics = {}

    # Metrics that need hard class predictions.
    predicted = {fold: model.predict(features) for fold, features, _ in folds}
    for prefix, scorer in label_scorers:
        for fold, _, truth in folds:
            perf_metrics[f'{prefix}_{fold}'] = scorer(truth, predicted[fold])

    # Metrics that need a score per row: use the second predict_proba column.
    scored = {fold: model.predict_proba(features)[:, 1] for fold, features, _ in folds}
    for prefix, scorer in prob_scorers:
        for fold, _, truth in folds:
            perf_metrics[f'{prefix}_{fold}'] = scorer(truth, scored[fold])

    return perf_metrics

Here is where MLflow comes in. As we progress through our experimental procedure, we can log nearly anything we want about the experiment. The tracking_uri points to the MLflow server and the experiment_name is the name of your experiment (many models can map to the same experiment, such as a set of random forest, logistic regression and gradient boosting models). Below, we log

  • the model,

  • the performances of the model,

  • the cross-validation results of the hyperparameter tuning grid search, and

  • a few graphs to visualize the data.

[4]:
import mlflow
from mlflow.exceptions import MlflowException
from mlflow.models.signature import infer_signature
import matplotlib.pyplot as plt
import seaborn as sns

import os

plt.style.use('ggplot')

experiment_name = 'test1'
tracking_uri = 'http://localhost:5001'

# Point the client at the tracking server BEFORE looking up or creating the
# experiment; otherwise those calls are made against whatever tracking
# location is configured by default instead of the intended server.
mlflow.set_tracking_uri(tracking_uri)

if not mlflow.get_experiment_by_name(experiment_name):
    try:
        mlflow.create_experiment(experiment_name)
    except MlflowException as ex:
        print(f'{ex}')

mlflow.set_experiment(experiment_name)

# DataFrame.to_csv does not create missing parent directories.
os.makedirs('./_temp', exist_ok=True)

with mlflow.start_run() as run:
    # Fit the grid search and log the model together with its input/output signature.
    model = get_model()
    model.fit(X_tr, y_tr)

    signature = infer_signature(X_tr, model.predict_proba(X_tr))
    mlflow.sklearn.log_model(model, 'model', signature=signature)
    mlflow.log_params(model.best_params_)

    # Log the train/test performance metrics.
    perf_metrics = get_performances(X_tr, y_tr, X_te, y_te, model)
    mlflow.log_metrics(perf_metrics)

    # Log the cross-validation results as a CSV artifact; the 'params' column
    # is left out of the exported table.
    pd.DataFrame({k: model.cv_results_[k] for k in model.cv_results_ if k not in {'params'}}) \
        .to_csv('./_temp/cv-results.csv', index=False)
    mlflow.log_artifact('./_temp/cv-results.csv', 'artifact')

    # Figure 00: number of samples per class in each fold.
    temp = pd.concat([
        pd.DataFrame({'y': y_tr}).assign(fold='tr'),
        pd.DataFrame({'y': y_te}).assign(fold='te')
    ]).assign(n=1).groupby(['fold', 'y'])['n'].sum().to_frame().reset_index()
    fig, ax = plt.subplots()
    sns.barplot(x='fold', hue='y', y='n', data=temp, ax=ax)
    ax.set_title('Class Distributions')
    mlflow.log_figure(fig, 'fig/00-class-distributions.png')

    # Figure 01: kernel density estimate of every feature, per fold.
    fig, ax = plt.subplots(1, 2, figsize=(10, 3.5))
    X_tr.plot(kind='kde', ax=ax[0], title='Feature Distributions, TR')
    X_te.plot(kind='kde', ax=ax[1], title='Feature Distributions, TE')
    plt.tight_layout()
    mlflow.log_figure(fig, 'fig/01-feature-distributions.png')

    # Figure 02: feature correlation heatmaps, per fold.
    fig, ax = plt.subplots(1, 2, figsize=(10, 3.5))
    sns.heatmap(X_tr.corr(), ax=ax[0])
    sns.heatmap(X_te.corr(), ax=ax[1])
    ax[0].set_title('Correlogram, TR')
    ax[1].set_title('Correlogram, TE')
    plt.tight_layout()
    mlflow.log_figure(fig, 'fig/02-correlograms.png')

    # Figures 03: pairwise scatter plots colored by class, per fold.
    fig = sns.pairplot(X_tr.assign(y=y_tr), hue='y').fig
    mlflow.log_figure(fig, 'fig/03-00-tr-pairplot.png')

    fig = sns.pairplot(X_te.assign(y=y_te), hue='y').fig
    mlflow.log_figure(fig, 'fig/03-01-te-pairplot.png')

    # Figure 04: parallel coordinates colored by class, per fold.
    fig, ax = plt.subplots(2, 1, figsize=(15, 5.5))
    pd.plotting.parallel_coordinates(X_tr.assign(y=y_tr), 'y', X_tr.columns, color=['#2e8ad8', '#cd3785'], sort_labels=True, axvlines=True, alpha=0.2, ax=ax[0])
    pd.plotting.parallel_coordinates(X_te.assign(y=y_te), 'y', X_tr.columns, color=['#2e8ad8', '#cd3785'], sort_labels=True, axvlines=True, alpha=0.2, ax=ax[1])
    ax[0].set_title('Parallel Coordinate, TR')
    ax[1].set_title('Parallel Coordinate, TE')
    plt.tight_layout()
    mlflow.log_figure(fig, 'fig/04-parallel-coordinate.png')

    # Keep the run id so the logged model can be loaded again later.
    run_id = run.info.run_id
    print(f'run_id={run_id}')
Fitting 5 folds for each of 8 candidates, totalling 40 fits
2022/05/28 02:20:14 WARNING mlflow.utils.environment: Encountered an unexpected error while inferring pip requirements (model URI: /tmp/tmpsy1b58yj/model/model.pkl, flavor: sklearn), fall back to return ['scikit-learn==0.24.1', 'cloudpickle==1.6.0']. Set logging level to DEBUG to see the full traceback.
run_id=ae12b6f88a424d4783abb42b504f2bb7
_images/mlflow_7_3.png
_images/mlflow_7_4.png
_images/mlflow_7_5.png
_images/mlflow_7_6.png
_images/mlflow_7_7.png
_images/mlflow_7_8.png

Finally, we can load the model (using the run_id) and make predictions. Note how we persisted/logged the model using the sklearn flavor and also depersisted/loaded the model using the sklearn flavor again.

[6]:
# The logged model is addressed through its run id and the artifact path
# ('model') it was logged under.
model_uri = f'runs:/{run_id}/model'
loaded_model = mlflow.sklearn.load_model(model_uri)
loaded_model.predict_proba(X_tr)
[6]:
array([[0.62361146, 0.37638854],
       [0.93569093, 0.06430907],
       [0.16239568, 0.83760432],
       ...,
       [0.0796527 , 0.9203473 ],
       [0.90753055, 0.09246945],
       [0.0532899 , 0.9467101 ]])

We can also spawn hundreds of independent training jobs with different hyperparameters (without using scikit-learn’s grid search) and log each of them to MLflow.

[10]:
path_tr = './_temp/tr.csv'
path_te = './_temp/te.csv'

# Write each fold to disk with its labels in a 'y' column so the training
# jobs can read their data from a file path.
for features, labels, path in ((X_tr, y_tr, path_tr), (X_te, y_te, path_te)):
    features.assign(y=labels).to_csv(path, index=False)
[28]:
from joblib import Parallel, delayed

def do_learn(penalty, C, path_tr, path_te, tracking_uri='http://localhost:5001', experiment_name='test2'):
    """Train one logistic regression model and log it as an MLflow run.

    Args:
        penalty: Penalty passed to ``LogisticRegression`` (e.g. ``'l1'``, ``'l2'``).
        C: Value of ``C`` passed to ``LogisticRegression``.
        path_tr: Path of the training CSV; must contain a ``y`` label column.
        path_te: Path of the testing CSV; must contain a ``y`` label column.
        tracking_uri: URI of the MLflow tracking server.
        experiment_name: Name of the MLflow experiment to log under.

    Returns:
        The id of the MLflow run that was created.
    """
    def read_Xy(path):
        # Every column except 'y' is treated as a feature.
        df = pd.read_csv(path)
        y_col = 'y'
        X_cols = [c for c in df.columns if c != y_col]

        return df[X_cols], df[y_col]

    X_tr, y_tr = read_Xy(path_tr)
    X_te, y_te = read_Xy(path_te)

    model_params = {
        'solver': 'saga',
        'penalty': penalty,
        'C': C,
        'random_state': 37,
        'max_iter': 1_000
    }
    model = LogisticRegression(**model_params)

    # Set the tracking URI BEFORE looking up or creating the experiment so
    # those calls target the intended server. This function may run in a
    # worker process where no tracking URI has been configured yet.
    mlflow.set_tracking_uri(tracking_uri)

    if not mlflow.get_experiment_by_name(experiment_name):
        try:
            mlflow.create_experiment(experiment_name)
        except MlflowException as ex:
            # Best effort: creation can fail (for example when another job
            # created the experiment first); report it and carry on.
            print(f'{ex}')

    mlflow.set_experiment(experiment_name)

    with mlflow.start_run() as run:
        model.fit(X_tr, y_tr)

        signature = infer_signature(X_tr, model.predict_proba(X_tr))
        mlflow.sklearn.log_model(model, 'model', signature=signature)
        mlflow.log_params(model_params)

        perf_metrics = get_performances(X_tr, y_tr, X_te, y_te, model)
        mlflow.log_metrics(perf_metrics)

        run_id = run.info.run_id

    return run_id


# Randomly sample 100 (penalty, C) combinations.
size = 100
C = np.random.uniform(size=size)
P = np.random.uniform(size=size)
# A two-way choice is exactly what np.where does. np.select also carries an
# implicit integer default (0) that cannot be promoted together with string
# choices on recent NumPy releases.
P = np.where(P < 0.5, 'l1', 'l2')

# Train and log one model per sampled combination, using all available cores.
run_ids = Parallel(n_jobs=-1)(delayed(do_learn)(p, c, path_tr, path_te) for p, c in zip(P, C))
print(run_ids)
['4b44a825df3a4a479a413fa5969db73d', '3cf7ab9496c243bf99f7159e3570118a', '1953685bd99e4c6ea7c9ccb054eee7cd', '9f84a08095b04136bd41ec43a36c038e', '5c5e69606bfc4a17ad17951d9e71e699', 'cc82e30810864e3db167cd993a9a93a5', '94a947fd86184fa8af1d02a7bc927065', '514c43c415a84637b64e7e0e852831b7', '15a9d5de992140c68d59f76f3e960f37', '9727bba9d8f248578bd13c493217e17f', '86b33b2bdd274227902ae59194aa3247', 'e7130921455640388278f92ddf599a84', '3e3492151163469aa1448fa57109db1b', '48d997b5d3674708ab5cb66bce21f767', '0c6a5f1ec98c4064bae68a4037680434', '78ea7dbfff124762b19796527df7d3ea', 'c412f93eeeb74aa5881e122b44e5f066', '623bb2a26dd744cd8b9c06af959d0e68', '011555648e6e4349bfb0b56c605b769d', 'c74c8daebbeb4062b9c561e59dac7eda', '40dae8a8fb9a495e924e48b0c5cb1117', 'f56529fef394418ca7888d5e268783d3', '2edeeb59f35f40e9aec6e66d3abe04c9', '78b6afca2c6d4497a18b92cb4422c4bd', '1931d4963b6341d6b995fc245ff3d42b', 'a93c2f7173f94b549913b7d4f2ed17de', 'a5b84397b3f34e8bab6cce66f171df83', '9881e992ba754310a4c048a5fd0175c4', '686be437b66c48d19e7e8160074e00eb', 'c038b25c59ee43339a515f541395547d', '8d7cae8317994c2a9406bb030968dc9d', 'f7882052620041f1ababcc60b26c98c3', '61d45b4eb6e840a594875707d0898971', '69f50a246c244d74ad5cf6b85fe483f3', '0eadfd2c706540f380b38f249db02589', 'cb5ab1b6472d401b91c397146508ecf3', '24e4a94e1800493d83a278c6dcf58886', 'a8cc5ff4f0f94746be7b180021debcfe', 'ec501330bff24e03887c6625b9e4230d', '73d06fe293794a44a7b6d915bc22d3da', 'f26a62b764f347f1a6969c3ce6ee0a2b', 'abe276323a4d466887696b946ded3db5', '5162825bd3a7426eb42b51d01fb35e70', '7b6dfb12feb84c929c76690694d64d56', '0cffadeb99044a33a3422bd73a046304', '1430eb8474514a5fb226f81739104e3c', '51d24c85d9934988b555794e83280d1d', '0f95d47b188f4a25b55e82a59187fa9c', '9a021acae1284677974e1b26ef35a21c', '8ee617aa9faa40ee92eeb575f0f01bc4', '27e191d655d24baa9533e2dc1a563561', 'dd5491d0010a4d9aa4d3ff4faddc1f33', 'afba4863360143a1841725a5e265aa6b', '8c750046d3f146adbc2370eba35afa64', '863ea7e694304610a2e2c98128bf2da8', 
'de18890f47f246be80ee5b04767a2cfe', '569c3f719844453f831acf3b8626ac12', '59683d14ac9b439ebc411f681c2e9a67', '51573df0fb9a47ecb4c46e9bc9728902', 'fd34a5b4e7314611884e5b7677b79569', '230c156caa004a34b3f65793ad8c093e', '70e1020e91dd48b98e70a4dd9ac6185e', 'fccba53974a24eecbc2fd18438fdf551', '95c780fdb2a7406a837d0fc57634927a', 'c918e6b274a2420196765289ce4f9cec', '98d2124c64db46bc9717fb0d64afa602', 'c19f8545d8564975bdec4654a5dc1c11', '22ba3c251f724ca59a0a7ddb04c88479', 'd3f6924c0f5f410dbbb4baa3dd2de298', '4b751b0557a74103baa51f4d0bd8ec76', '2f392a19b07b402692b98f5ee9a39107', 'efef9b825e1746a883c0ae92b977db77', '2a898c0f68364e789375890ad54f6611', '2949a480136a4205899baad415cae1af', '7b063f4bcb874fd7b8a83e594ac52127', 'd9ba46081def41bc82a3478d299ceb10', 'b8b134632bbc476c97efc18376b9818e', '0c898f4713ee4a70a824d556e49c68ba', '8ad093566b474c35b6653e130278fe14', '85aa3749f0a948c8a90c11d86fec0987', 'a2f215e457ac4762a070b1843d39882e', 'fa059ec7116e482a854a0bdbd12c5692', 'a287b292fc67475290b525d8e9ff20e3', '61e43d866e7542a78bd9d2444c4a89b1', '806b00d52b634a139c51ab7362c9c7c4', '76aba9fbd1d64e63910dfcc881c79d77', '7df4d76ab7ab4b49a1efebc4ba0eb581', '64e5db3322a547b2976d2874e2e95ec8', '36fa6b0977394e44b34d2d8bf650dd0a', '4e5e4ffba90247ff9f215f274db80bda', '69546ea7050d4a53bde498b1725aaac6', '798fbb91e095417fad2ed86a5d2beea2', '21c23f487197471ea7826c0fe2df8d22', '553091345c5f4022b6603e0ca41b0f8d', 'ce67378fa92947ddbd395619fe5315de', 'bae016b49ade45bfb0ded139db5f8ba9', '1f9c2e5f46984272aa8fecf6800ae1c4', '1e033341ce5246618c36c9662be54b51', '71b1699097c94d5586455a83494afbb6', '1ef399eba3634096b981ca46f2c3e4f6']

We can then collect the parameters and performances and do modeling to see how the parameters impact the performances.

  • The regularization parameter C (the inverse of the regularization strength) seems to be very important for the average precision score and AUC.

  • The penalty type (encoded as l1=1 and l2=2) seems to be more important for the F1 score.

[42]:
# Fetch every run once and reuse it for both tables (the original fetched each
# run twice, once for the metrics and once for the params).
runs = [mlflow.get_run(i) for i in run_ids]

# O: one row of logged performance metrics per run.
O = pd.DataFrame([r.data.metrics for r in runs])

# I: one row of logged hyperparameters per run. MLflow returns parameter
# values as strings, so the penalty is encoded as 1 (l1) or 2 (l2) and C is
# cast back to a number explicitly.
I = pd.DataFrame([r.data.params for r in runs]) \
    .assign(
        penalty=lambda d: d['penalty'].apply(lambda v: 1 if v == 'l1' else 2),
        C=lambda d: d['C'].astype(float)
    ) \
    [['penalty', 'C']]
[50]:
from sklearn.ensemble import RandomForestRegressor

# Importance of each hyperparameter for the test average precision score.
# The forest is seeded so the reported importances are reproducible.
pd.Series(RandomForestRegressor(random_state=37).fit(I, O['aps_te']).feature_importances_, index=I.columns)
[50]:
penalty    0.265576
C          0.734424
dtype: float64
[51]:
# Importance of each hyperparameter for the test ROC AUC.
# The forest is seeded so the reported importances are reproducible.
pd.Series(RandomForestRegressor(random_state=37).fit(I, O['auc_te']).feature_importances_, index=I.columns)
[51]:
penalty    0.332347
C          0.667653
dtype: float64
[53]:
# Importance of each hyperparameter for the test F1 score.
# The forest is seeded so the reported importances are reproducible.
pd.Series(RandomForestRegressor(random_state=37).fit(I, O['f1_te']).feature_importances_, index=I.columns)
[53]:
penalty    0.698958
C          0.301042
dtype: float64
[ ]: