# Source code for pyomo.contrib.parmest.graphics

#  ___________________________________________________________________________
#
#  Pyomo: Python Optimization Modeling Objects
#  Copyright (c) 2008-2024
#  National Technology and Engineering Solutions of Sandia, LLC
#  Under the terms of Contract DE-NA0003525 with National Technology and
#  Engineering Solutions of Sandia, LLC, the U.S. Government retains certain
#  rights in this software.
#  This software is distributed under the 3-clause BSD License.
#  ___________________________________________________________________________

import itertools
from pyomo.common.dependencies import (
    matplotlib,
    matplotlib_available,
    numpy as np,
    numpy_available,
    pandas as pd,
    pandas_available,
    scipy,
    scipy_available,
    check_min_version,
    attempt_import,
)
from pyomo.common.dependencies.matplotlib import pyplot as plt
from pyomo.common.dependencies.scipy import stats

# Occasionally, dependent conda packages for older distributions
# (e.g. python 3.5) get released that are either broken or not
# compatible, resulting in a SyntaxError
sns, seaborn_available = attempt_import(
    "seaborn", catch_exceptions=(ImportError, SyntaxError)
)

imports_available = (
    numpy_available
    & scipy_available
    & pandas_available
    & matplotlib_available
    & seaborn_available
)


def _get_variables(ax, columns):
    sps = ax.get_subplotspec()
    # (nrows, ncols, index of this subplot) from the subplot geometry
    ny, nx, cell = sps.get_geometry()[:3]
    xloc = int(np.mod(cell, nx))
    yloc = int(np.mod((cell - xloc) / nx, ny))

    xvar = columns[xloc]
    yvar = columns[yloc]
    # print(sps.get_geometry(), cell, xloc, yloc, xvar, yvar)

    return xvar, yvar, (xloc, yloc)


def _get_XYgrid(x, y, ncells):
    xlin = np.linspace(
        min(x) - abs(max(x) - min(x)) / 2, max(x) + abs(max(x) - min(x)) / 2, ncells
    )
    ylin = np.linspace(
        min(y) - abs(max(y) - min(y)) / 2, max(y) + abs(max(y) - min(y)) / 2, ncells
    )
    X, Y = np.meshgrid(xlin, ylin)

    return X, Y


def _get_data_slice(xvar, yvar, columns, data, theta_star):
    search_ranges = {}
    for var in columns:
        if var in [xvar, yvar]:
            search_ranges[var] = data[var].unique()
        else:
            search_ranges[var] = [theta_star[var]]

    data_slice = pd.DataFrame(
        list(itertools.product(*search_ranges.values())), columns=search_ranges.keys()
    )

    # griddata will not work with linear interpolation if the data
    # values are constant in any dimension
    for col in data[columns].columns:
        cv = data[col].std() / data[col].mean()  # Coefficient of variation
        if cv < 1e-8:
            temp = data.copy()
            # Add variation (the interpolation is later scaled)
            if cv == 0:
                temp[col] = temp[col] + data[col].mean() / 10
            else:
                temp[col] = temp[col] + data[col].std()
            data = pd.concat([data, temp], ignore_index=True)

    data_slice["obj"] = scipy.interpolate.griddata(
        np.array(data[columns]),
        np.array(data[["obj"]]),
        np.array(data_slice[columns]),
        method="linear",
        rescale=True,
    )

    X = data_slice[xvar]
    Y = data_slice[yvar]
    Z = data_slice["obj"]

    return X, Y, Z
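
# Illustrative sketch (not part of the original module): _get_data_slice
# interpolates the "obj" surface over a 2D grid of (xvar, yvar) values,
# holding all other variables at theta_star. The toy data below is
# hypothetical:
#
#     data = pd.DataFrame({"k1": [1, 1, 2, 2], "k2": [3, 4, 3, 4],
#                          "obj": [10.0, 12.0, 11.0, 15.0]})
#     theta_star = pd.Series({"k1": 1.5, "k2": 3.5})
#     X, Y, Z = _get_data_slice("k1", "k2", ["k1", "k2"], data, theta_star)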


# Note: seaborn 0.11 no longer expects color and label to be passed to the
#       plotting functions. label is kept here for backward compatibility
def _add_scatter(x, y, color, columns, theta_star, label=None):
    ax = plt.gca()
    xvar, yvar, loc = _get_variables(ax, columns)

    ax.scatter(theta_star[xvar], theta_star[yvar], c=color, s=35)


def _add_rectangle_CI(x, y, color, columns, lower_bound, upper_bound, label=None):
    ax = plt.gca()
    xvar, yvar, loc = _get_variables(ax, columns)

    xmin = lower_bound[xvar]
    ymin = lower_bound[yvar]
    xmax = upper_bound[xvar]
    ymax = upper_bound[yvar]

    ax.plot([xmin, xmax], [ymin, ymin], color=color)
    ax.plot([xmax, xmax], [ymin, ymax], color=color)
    ax.plot([xmax, xmin], [ymax, ymax], color=color)
    ax.plot([xmin, xmin], [ymax, ymin], color=color)


def _add_scipy_dist_CI(
    x, y, color, columns, ncells, alpha, dist, theta_star, label=None
):
    ax = plt.gca()
    xvar, yvar, loc = _get_variables(ax, columns)

    X, Y = _get_XYgrid(x, y, ncells)

    data_slice = []

    if isinstance(dist, stats._multivariate.multivariate_normal_frozen):
        for var in theta_star.index:
            if var == xvar:
                data_slice.append(X)
            elif var == yvar:
                data_slice.append(Y)
            elif var not in [xvar, yvar]:
                data_slice.append(np.array([[theta_star[var]] * ncells] * ncells))
        data_slice = np.dstack(tuple(data_slice))

    elif isinstance(dist, stats.gaussian_kde):
        for var in theta_star.index:
            if var == xvar:
                data_slice.append(X.ravel())
            elif var == yvar:
                data_slice.append(Y.ravel())
            elif var not in [xvar, yvar]:
                data_slice.append(np.array([theta_star[var]] * ncells * ncells))
        data_slice = np.array(data_slice)
    else:
        return

    Z = dist.pdf(data_slice)
    Z = Z.reshape((ncells, ncells))

    ax.contour(X, Y, Z, levels=[alpha], colors=color)


def _add_obj_contour(x, y, color, columns, data, theta_star, label=None):
    ax = plt.gca()
    xvar, yvar, loc = _get_variables(ax, columns)

    try:
        X, Y, Z = _get_data_slice(xvar, yvar, columns, data, theta_star)

        triang = matplotlib.tri.Triangulation(X, Y)
        cmap = matplotlib.colormaps["Greys"]

        plt.tricontourf(triang, Z, cmap=cmap)
    except Exception:
        print("Objective contour plot for", xvar, yvar, "slice failed")


def _set_axis_limits(g, axis_limits, theta_vals, theta_star):
    if theta_star is not None:
        theta_vals = pd.concat([theta_vals, theta_star], ignore_index=True)

    if axis_limits is None:
        axis_limits = {}
        for col in theta_vals.columns:
            theta_range = np.abs(theta_vals[col].max() - theta_vals[col].min())
            if theta_range < 1e-10:
                theta_range = theta_vals[col].max() / 10
            axis_limits[col] = [
                theta_vals[col].min() - theta_range / 4,
                theta_vals[col].max() + theta_range / 4,
            ]
    for ax in g.fig.get_axes():
        xvar, yvar, (xloc, yloc) = _get_variables(ax, theta_vals.columns)
        if xloc != yloc:  # not on diagonal
            ax.set_ylim(axis_limits[yvar])
            ax.set_xlim(axis_limits[xvar])
        else:  # on diagonal
            ax.set_xlim(axis_limits[xvar])


def pairwise_plot(
    theta_values,
    theta_star=None,
    alpha=None,
    distributions=[],
    axis_limits=None,
    title=None,
    add_obj_contour=True,
    add_legend=True,
    filename=None,
):
    """
    Plot pairwise relationship for theta values, and optionally alpha-level
    confidence intervals and objective value contours

    Parameters
    ----------
    theta_values: DataFrame or tuple

        * If theta_values is a DataFrame, then it contains one column for
          each theta variable and (optionally) an objective value column
          ('obj') and columns that contain Boolean results from confidence
          interval tests (labeled using the alpha value). Each row is a
          sample.

          * Theta variables can be computed from ``theta_est_bootstrap``,
            ``theta_est_leaveNout``, and ``leaveNout_bootstrap_test``.
          * The objective value can be computed using the
            ``likelihood_ratio_test``.
          * Results from confidence interval tests can be computed using the
            ``leaveNout_bootstrap_test``, ``likelihood_ratio_test``, and
            ``confidence_region_test``.

        * If theta_values is a tuple, then it contains a mean, covariance,
          and number of samples (mean, cov, n) where mean is a dictionary or
          Series (indexed by variable name), covariance is a DataFrame
          (indexed by variable name, one column per variable name), and n is
          an integer. The mean and covariance are used to create a
          multivariate normal sample of n theta values. The covariance can
          be computed using ``theta_est(calc_cov=True)``.

    theta_star: dict or Series, optional
        Estimated value of theta. The dictionary or Series is indexed by
        variable name. Theta_star is used to slice higher dimensional
        contour intervals in 2D
    alpha: float, optional
        Confidence interval value. If an alpha value is given and the
        distributions list is empty, the data will be filtered by True/False
        values using the column name whose value equals alpha (see results
        from ``leaveNout_bootstrap_test``, ``likelihood_ratio_test``, and
        ``confidence_region_test``)
    distributions: list of strings, optional
        Statistical distribution used to define a confidence region, options
        = 'MVN' for multivariate_normal, 'KDE' for gaussian_kde, and 'Rect'
        for rectangular. Confidence interval is a 2D slice, using linear
        interpolation at theta_star.
    axis_limits: dict, optional
        Axis limits in the format {variable: [min, max]}
    title: string, optional
        Plot title
    add_obj_contour: bool, optional
        Add a contour plot using the column 'obj' in theta_values. Contour
        plot is a 2D slice, using linear interpolation at theta_star.
    add_legend: bool, optional
        Add a legend to the plot
    filename: string, optional
        Filename used to save the figure
    """
    assert isinstance(theta_values, (pd.DataFrame, tuple))
    assert isinstance(theta_star, (type(None), dict, pd.Series, pd.DataFrame))
    assert isinstance(alpha, (type(None), int, float))
    assert isinstance(distributions, list)
    assert set(distributions).issubset(set(["MVN", "KDE", "Rect"]))
    assert isinstance(axis_limits, (type(None), dict))
    assert isinstance(title, (type(None), str))
    assert isinstance(add_obj_contour, bool)
    assert isinstance(filename, (type(None), str))

    # If theta_values is a tuple containing (mean, cov, n), create a DataFrame of values
    if isinstance(theta_values, tuple):
        assert len(theta_values) == 3
        mean = theta_values[0]
        cov = theta_values[1]
        n = theta_values[2]
        if isinstance(mean, dict):
            mean = pd.Series(mean)
        theta_names = mean.index
        mvn_dist = stats.multivariate_normal(mean, cov)
        theta_values = pd.DataFrame(
            mvn_dist.rvs(n, random_state=1), columns=theta_names
        )

    assert theta_values.shape[0] > 0

    if isinstance(theta_star, dict):
        theta_star = pd.Series(theta_star)
    if isinstance(theta_star, pd.DataFrame):
        theta_star = theta_star.loc[0, :]

    theta_names = [
        col
        for col in theta_values.columns
        if (col not in ["obj"])
        and (not isinstance(col, float))
        and (not isinstance(col, int))
    ]

    # Filter data by alpha
    if (alpha in theta_values.columns) and (len(distributions) == 0):
        thetas = theta_values.loc[theta_values[alpha] == True, theta_names]
    else:
        thetas = theta_values[theta_names]

    if theta_star is not None:
        theta_star = theta_star[theta_names]

    legend_elements = []

    g = sns.PairGrid(thetas)

    # Plot histogram on the diagonal
    # Note: distplot is deprecated and will be removed in a future
    # version of seaborn, use histplot. distplot is kept for older
    # versions of python.
    if check_min_version(sns, "0.11"):
        g.map_diag(sns.histplot)
    else:
        g.map_diag(sns.distplot, kde=False, hist=True, norm_hist=False)

    # Plot filled contours using all theta values based on obj
    if "obj" in theta_values.columns and add_obj_contour:
        g.map_offdiag(
            _add_obj_contour,
            columns=theta_names,
            data=theta_values,
            theta_star=theta_star,
        )

    # Plot thetas
    g.map_offdiag(plt.scatter, s=10)
    legend_elements.append(
        matplotlib.lines.Line2D(
            [0],
            [0],
            marker="o",
            color="w",
            label="thetas",
            markerfacecolor="cadetblue",
            markersize=5,
        )
    )

    # Plot theta*
    if theta_star is not None:
        g.map_offdiag(
            _add_scatter, color="k", columns=theta_names, theta_star=theta_star
        )

        legend_elements.append(
            matplotlib.lines.Line2D(
                [0],
                [0],
                marker="o",
                color="w",
                label="theta*",
                markerfacecolor="k",
                markersize=6,
            )
        )

    # Plot confidence regions
    colors = ["r", "mediumblue", "darkgray"]
    if (alpha is not None) and (len(distributions) > 0):
        if theta_star is None:
            print(
                """theta_star is not defined, confidence region slice
                will be plotted at the mean value of theta"""
            )
            theta_star = thetas.mean()

        mvn_dist = None
        kde_dist = None
        for i, dist in enumerate(distributions):
            if dist == "Rect":
                lb, ub = fit_rect_dist(thetas, alpha)
                g.map_offdiag(
                    _add_rectangle_CI,
                    color=colors[i],
                    columns=theta_names,
                    lower_bound=lb,
                    upper_bound=ub,
                )
                legend_elements.append(
                    matplotlib.lines.Line2D([0], [0], color=colors[i], lw=1, label=dist)
                )

            elif dist == "MVN":
                mvn_dist = fit_mvn_dist(thetas)
                Z = mvn_dist.pdf(thetas)
                score = stats.scoreatpercentile(Z, (1 - alpha) * 100)
                g.map_offdiag(
                    _add_scipy_dist_CI,
                    color=colors[i],
                    columns=theta_names,
                    ncells=100,
                    alpha=score,
                    dist=mvn_dist,
                    theta_star=theta_star,
                )
                legend_elements.append(
                    matplotlib.lines.Line2D([0], [0], color=colors[i], lw=1, label=dist)
                )

            elif dist == "KDE":
                kde_dist = fit_kde_dist(thetas)
                Z = kde_dist.pdf(thetas.transpose())
                score = stats.scoreatpercentile(Z, (1 - alpha) * 100)
                g.map_offdiag(
                    _add_scipy_dist_CI,
                    color=colors[i],
                    columns=theta_names,
                    ncells=100,
                    alpha=score,
                    dist=kde_dist,
                    theta_star=theta_star,
                )
                legend_elements.append(
                    matplotlib.lines.Line2D([0], [0], color=colors[i], lw=1, label=dist)
                )

    _set_axis_limits(g, axis_limits, thetas, theta_star)

    for ax in g.axes.flatten():
        ax.ticklabel_format(style="sci", scilimits=(-2, 2), axis="both")

        if add_legend:
            xvar, yvar, loc = _get_variables(ax, theta_names)
            if loc == (len(theta_names) - 1, 0):
                ax.legend(handles=legend_elements, loc="best", prop={"size": 8})

    if title:
        g.fig.subplots_adjust(top=0.9)
        g.fig.suptitle(title)

    # Work in progress
    # Plot lower triangle graphics in separate figures, useful for presentations
    lower_triangle_only = False
    if lower_triangle_only:
        for ax in g.axes.flatten():
            xvar, yvar, (xloc, yloc) = _get_variables(ax, theta_names)
            if xloc < yloc:  # lower triangle
                ax.remove()

                ax.set_xlabel(xvar)
                ax.set_ylabel(yvar)

                fig = plt.figure()
                ax.figure = fig
                fig.axes.append(ax)
                fig.add_axes(ax)

                f, dummy = plt.subplots()
                bbox = dummy.get_position()
                ax.set_position(bbox)
                dummy.remove()
                plt.close(f)

                ax.tick_params(reset=True)

                if add_legend:
                    ax.legend(handles=legend_elements, loc="best", prop={"size": 8})

        plt.close(g.fig)

    if filename is None:
        plt.show()
    else:
        plt.savefig(filename)
        plt.close()
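
# Illustrative usage sketch (not part of the original module). The data,
# column names, and filename below are hypothetical:
#
#     import numpy as np
#     import pandas as pd
#     rng = np.random.default_rng(0)
#     theta_boot = pd.DataFrame(rng.normal([1.0, 2.0], 0.1, size=(50, 2)),
#                               columns=["k1", "k2"])
#     theta_star = pd.Series({"k1": 1.0, "k2": 2.0})
#     pairwise_plot(theta_boot, theta_star, alpha=0.9,
#                   distributions=["KDE"], filename="pairwise.png")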


def fit_rect_dist(theta_values, alpha):
    """
    Fit an alpha-level rectangular distribution to theta values

    Parameters
    ----------
    theta_values: DataFrame
        Theta values, columns = variable names
    alpha: float
        Confidence interval value

    Returns
    -------
    tuple containing lower bound and upper bound for each variable
    """
    assert isinstance(theta_values, pd.DataFrame)
    assert isinstance(alpha, (int, float))

    tval = stats.t.ppf(1 - (1 - alpha) / 2, len(theta_values) - 1)  # Two-tail
    m = theta_values.mean()
    s = theta_values.std()
    lower_bound = m - tval * s
    upper_bound = m + tval * s

    return lower_bound, upper_bound
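
# Illustrative usage sketch (not part of the original module), reusing the
# hypothetical theta_boot DataFrame from the pairwise_plot example above:
#
#     lb, ub = fit_rect_dist(theta_boot, 0.95)
#     # lb and ub are Series of per-variable t-interval bounds,
#     # indexed by variable name (e.g., lb["k1"], ub["k1"])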


def fit_mvn_dist(theta_values):
    """
    Fit a multivariate normal distribution to theta values

    Parameters
    ----------
    theta_values: DataFrame
        Theta values, columns = variable names

    Returns
    -------
    scipy.stats.multivariate_normal distribution
    """
    assert isinstance(theta_values, pd.DataFrame)

    dist = stats.multivariate_normal(
        theta_values.mean(), theta_values.cov(), allow_singular=True
    )
    return dist
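
# Illustrative usage sketch (not part of the original module), reusing the
# hypothetical theta_boot DataFrame from above; this mirrors how
# pairwise_plot selects the alpha-level density contour:
#
#     mvn = fit_mvn_dist(theta_boot)
#     Z = mvn.pdf(theta_boot)
#     score = stats.scoreatpercentile(Z, (1 - 0.95) * 100)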


def fit_kde_dist(theta_values):
    """
    Fit a Gaussian kernel-density distribution to theta values

    Parameters
    ----------
    theta_values: DataFrame
        Theta values, columns = variable names

    Returns
    -------
    scipy.stats.gaussian_kde distribution
    """
    assert isinstance(theta_values, pd.DataFrame)

    dist = stats.gaussian_kde(theta_values.transpose().values)

    return dist
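
# Illustrative usage sketch (not part of the original module), reusing the
# hypothetical theta_boot DataFrame from above; gaussian_kde expects
# variables in rows, so the DataFrame is transposed for evaluation:
#
#     kde = fit_kde_dist(theta_boot)
#     Z = kde.pdf(theta_boot.transpose())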


def _get_grouped_data(data1, data2, normalize, group_names):
    if normalize:
        data_median = data1.median()
        data_std = data1.std()
        data1 = (data1 - data_median) / data_std
        data2 = (data2 - data_median) / data_std

    # Combine data1 and data2 to create a grouped histogram
    data = pd.concat({group_names[0]: data1, group_names[1]: data2})
    data.reset_index(level=0, inplace=True)
    data.rename(columns={"level_0": "set"}, inplace=True)

    data = data.melt(id_vars="set", value_vars=data1.columns, var_name="columns")

    return data
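
# Illustrative sketch (not part of the original module): the helper returns
# a long-format DataFrame with columns "set", "columns", and "value". The
# d1/d2 DataFrames below are hypothetical:
#
#     d1 = pd.DataFrame({"k1": [1.0, 1.2], "k2": [2.0, 2.1]})
#     d2 = pd.DataFrame({"k1": [0.9, 1.1], "k2": [1.9, 2.2]})
#     data = _get_grouped_data(d1, d2, False, ["data1", "data2"])
#     # data.columns -> ["set", "columns", "value"]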


def grouped_boxplot(
    data1, data2, normalize=False, group_names=["data1", "data2"], filename=None
):
    """
    Plot a grouped boxplot to compare two datasets

    The datasets can be normalized by the median and standard deviation of data1.

    Parameters
    ----------
    data1: DataFrame
        Data set, columns = variable names
    data2: DataFrame
        Data set, columns = variable names
    normalize : bool, optional
        Normalize both datasets by the median and standard deviation of data1
    group_names : list, optional
        Names used in the legend
    filename: string, optional
        Filename used to save the figure
    """
    assert isinstance(data1, pd.DataFrame)
    assert isinstance(data2, pd.DataFrame)
    assert isinstance(normalize, bool)
    assert isinstance(group_names, list)
    assert isinstance(filename, (type(None), str))

    data = _get_grouped_data(data1, data2, normalize, group_names)

    plt.figure()
    sns.boxplot(data=data, hue="set", y="value", x="columns", order=data1.columns)

    plt.gca().legend().set_title("")
    plt.gca().set_xlabel("")
    plt.gca().set_ylabel("")

    if filename is None:
        plt.show()
    else:
        plt.savefig(filename)
        plt.close()
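
# Illustrative usage sketch (not part of the original module), reusing the
# hypothetical d1/d2 DataFrames from above; group names are arbitrary labels
# for the legend:
#
#     grouped_boxplot(d1, d2, normalize=True, group_names=["train", "test"])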


def grouped_violinplot(
    data1, data2, normalize=False, group_names=["data1", "data2"], filename=None
):
    """
    Plot a grouped violinplot to compare two datasets

    The datasets can be normalized by the median and standard deviation of data1.

    Parameters
    ----------
    data1: DataFrame
        Data set, columns = variable names
    data2: DataFrame
        Data set, columns = variable names
    normalize : bool, optional
        Normalize both datasets by the median and standard deviation of data1
    group_names : list, optional
        Names used in the legend
    filename: string, optional
        Filename used to save the figure
    """
    assert isinstance(data1, pd.DataFrame)
    assert isinstance(data2, pd.DataFrame)
    assert isinstance(normalize, bool)
    assert isinstance(group_names, list)
    assert isinstance(filename, (type(None), str))

    data = _get_grouped_data(data1, data2, normalize, group_names)

    plt.figure()
    sns.violinplot(
        data=data, hue="set", y="value", x="columns", order=data1.columns, split=True
    )

    plt.gca().legend().set_title("")
    plt.gca().set_xlabel("")
    plt.gca().set_ylabel("")

    if filename is None:
        plt.show()
    else:
        plt.savefig(filename)
        plt.close()
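
# Illustrative usage sketch (not part of the original module), reusing the
# hypothetical d1/d2 DataFrames from above; split=True draws the two groups
# as halves of each violin:
#
#     grouped_violinplot(d1, d2, group_names=["train", "test"])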