# ___________________________________________________________________________
#
# Pyomo: Python Optimization Modeling Objects
# Copyright (c) 2008-2024
# National Technology and Engineering Solutions of Sandia, LLC
# Under the terms of Contract DE-NA0003525 with National Technology and
# Engineering Solutions of Sandia, LLC, the U.S. Government retains certain
# rights in this software.
# This software is distributed under the 3-clause BSD License.
# ___________________________________________________________________________
import itertools
from pyomo.common.dependencies import (
matplotlib,
matplotlib_available,
numpy as np,
numpy_available,
pandas as pd,
pandas_available,
scipy,
scipy_available,
check_min_version,
attempt_import,
)
from pyomo.common.dependencies.matplotlib import pyplot as plt
from pyomo.common.dependencies.scipy import stats
# occasionally dependent conda packages for older distributions
# (e.g. python 3.5) get released that are either broken or not
# compatible, resulting in a SyntaxError when the module is imported;
# attempt_import defers and tolerates both failure modes
sns, seaborn_available = attempt_import(
    "seaborn", catch_exceptions=(ImportError, SyntaxError)
)
# True only when every optional dependency needed by this module resolved
imports_available = (
    numpy_available
    & scipy_available
    & pandas_available
    & matplotlib_available
    & seaborn_available
)
def _get_variables(ax, columns):
sps = ax.get_subplotspec()
nx = sps.get_geometry()[1]
ny = sps.get_geometry()[0]
cell = sps.get_geometry()[2]
xloc = int(np.mod(cell, nx))
yloc = int(np.mod((cell - xloc) / nx, ny))
xvar = columns[xloc]
yvar = columns[yloc]
# print(sps.get_geometry(), cell, xloc, yloc, xvar, yvar)
return xvar, yvar, (xloc, yloc)
def _get_XYgrid(x, y, ncells):
xlin = np.linspace(
min(x) - abs(max(x) - min(x)) / 2, max(x) + abs(max(x) - min(x)) / 2, ncells
)
ylin = np.linspace(
min(y) - abs(max(y) - min(y)) / 2, max(y) + abs(max(y) - min(y)) / 2, ncells
)
X, Y = np.meshgrid(xlin, ylin)
return X, Y
def _get_data_slice(xvar, yvar, columns, data, theta_star):
    """Build a 2D slice of the objective surface at theta_star.

    The slice varies xvar and yvar over the unique values observed in
    ``data`` while every other variable in ``columns`` is pinned to its
    theta_star value; the 'obj' column is then linearly interpolated onto
    that grid.

    Returns the X, Y, Z columns (pandas Series) of the interpolated slice.
    """
    # Cartesian-product search grid: full observed range for the two
    # plotted variables, a single theta_star value for all others
    search_ranges = {}
    for var in columns:
        if var in [xvar, yvar]:
            search_ranges[var] = data[var].unique()
        else:
            search_ranges[var] = [theta_star[var]]
    data_slice = pd.DataFrame(
        list(itertools.product(*search_ranges.values())), columns=search_ranges.keys()
    )
    # griddata will not work with linear interpolation if the data
    # values are constant in any dimension; duplicate the data with a
    # small perturbation in any near-constant column
    for col in data[columns].columns:
        cv = data[col].std() / data[col].mean()  # Coefficient of variation
        if cv < 1e-8:
            temp = data.copy()
            # Add variation (the interpolation is later scaled)
            if cv == 0:
                temp[col] = temp[col] + data[col].mean() / 10
            else:
                temp[col] = temp[col] + data[col].std()
            data = pd.concat([data, temp], ignore_index=True)
    # Linear interpolation of 'obj' onto the slice grid; rescale=True
    # normalizes each dimension before interpolating (see perturbation above)
    data_slice["obj"] = scipy.interpolate.griddata(
        np.array(data[columns]),
        np.array(data[["obj"]]),
        np.array(data_slice[columns]),
        method="linear",
        rescale=True,
    )
    X = data_slice[xvar]
    Y = data_slice[yvar]
    Z = data_slice["obj"]
    return X, Y, Z
# Note: seaborn 0.11 no longer expects color and label to be passed to the
# plotting functions. label is kept here for backward compatibility
def _add_scatter(x, y, color, columns, theta_star, label=None):
    """Overlay theta* as a single point on the current pairgrid axes.

    ``label`` is unused; it is kept for backward compatibility with
    seaborn versions older than 0.11.
    """
    axes = plt.gca()
    xvar, yvar, _ = _get_variables(axes, columns)
    axes.scatter(theta_star[xvar], theta_star[yvar], c=color, s=35)
def _add_rectangle_CI(x, y, color, columns, lower_bound, upper_bound, label=None):
    """Draw a rectangular confidence region on the current pairgrid axes.

    The rectangle spans [lower_bound, upper_bound] for the two variables
    shown on this subplot. ``label`` is unused; it is kept for backward
    compatibility with seaborn versions older than 0.11.
    """
    axes = plt.gca()
    xvar, yvar, _ = _get_variables(axes, columns)
    x0, x1 = lower_bound[xvar], upper_bound[xvar]
    y0, y1 = lower_bound[yvar], upper_bound[yvar]
    # Trace the rectangle edge by edge: bottom, right, top, left
    axes.plot([x0, x1], [y0, y0], color=color)
    axes.plot([x1, x1], [y0, y1], color=color)
    axes.plot([x1, x0], [y1, y1], color=color)
    axes.plot([x0, x0], [y1, y0], color=color)
def _add_scipy_dist_CI(
    x, y, color, columns, ncells, alpha, dist, theta_star, label=None
):
    """Draw a single pdf contour of a fitted scipy distribution.

    The contour is a 2D slice of the distribution: the two variables on
    the current pairgrid axes vary over an ncells x ncells grid while all
    remaining variables are fixed at theta_star. ``alpha`` here is a pdf
    value (a percentile score computed by the caller), not a probability
    level. ``dist`` must be a frozen multivariate normal or gaussian_kde;
    any other distribution is silently ignored. ``label`` is kept for
    backward compatibility with seaborn versions older than 0.11.
    """
    ax = plt.gca()
    xvar, yvar, loc = _get_variables(ax, columns)
    X, Y = _get_XYgrid(x, y, ncells)
    data_slice = []
    # multivariate_normal.pdf accepts a (ncells, ncells, nvar) stacked grid
    if isinstance(dist, stats._multivariate.multivariate_normal_frozen):
        for var in theta_star.index:
            if var == xvar:
                data_slice.append(X)
            elif var == yvar:
                data_slice.append(Y)
            elif var not in [xvar, yvar]:
                data_slice.append(np.array([[theta_star[var]] * ncells] * ncells))
        data_slice = np.dstack(tuple(data_slice))
    # gaussian_kde.pdf expects a (nvar, npoints) array of flattened points
    elif isinstance(dist, stats.gaussian_kde):
        for var in theta_star.index:
            if var == xvar:
                data_slice.append(X.ravel())
            elif var == yvar:
                data_slice.append(Y.ravel())
            elif var not in [xvar, yvar]:
                data_slice.append(np.array([theta_star[var]] * ncells * ncells))
        data_slice = np.array(data_slice)
    else:
        return
    Z = dist.pdf(data_slice)
    Z = Z.reshape((ncells, ncells))
    # Single contour line at pdf value alpha
    ax.contour(X, Y, Z, levels=[alpha], colors=color)
def _add_obj_contour(x, y, color, columns, data, theta_star, label=None):
    """Add filled contours of the objective ('obj' column) to the current
    pairgrid axes.

    The contour is a 2D slice of the objective at theta_star computed by
    ``_get_data_slice``. If interpolation or triangulation fails, a
    message is printed and this subplot is left without a contour so the
    remaining subplots still render. ``label`` is kept for backward
    compatibility with seaborn versions older than 0.11.
    """
    ax = plt.gca()
    xvar, yvar, loc = _get_variables(ax, columns)
    try:
        X, Y, Z = _get_data_slice(xvar, yvar, columns, data, theta_star)
        triang = matplotlib.tri.Triangulation(X, Y)
        cmap = matplotlib.colormaps["Greys"]
        plt.tricontourf(triang, Z, cmap=cmap)
    except Exception:
        # Was a bare 'except:', which also swallowed SystemExit and
        # KeyboardInterrupt; keep the best-effort behavior but only for
        # ordinary exceptions
        print("Objective contour plot for", xvar, yvar, "slice failed")
def _set_axis_limits(g, axis_limits, theta_vals, theta_star):
    """Apply axis limits to every subplot of a seaborn PairGrid.

    Parameters
    ----------
    g: seaborn PairGrid
    axis_limits: dict or None
        {variable: [min, max]}; if None, limits are computed from the
        data range padded by 25% on each side
    theta_vals: DataFrame
        Theta values, one column per variable
    theta_star: Series or None
        If given, included when computing the data range
    """
    if theta_star is not None:
        theta_vals = pd.concat([theta_vals, theta_star], ignore_index=True)
    if axis_limits is None:
        axis_limits = {}
        for col in theta_vals.columns:
            theta_range = np.abs(theta_vals[col].max() - theta_vals[col].min())
            # Guard against a (near-)constant column producing a zero range
            if theta_range < 1e-10:
                theta_range = theta_vals[col].max() / 10
            axis_limits[col] = [
                theta_vals[col].min() - theta_range / 4,
                theta_vals[col].max() + theta_range / 4,
            ]
    for ax in g.fig.get_axes():
        xvar, yvar, (xloc, yloc) = _get_variables(ax, theta_vals.columns)
        if xloc != yloc:  # not on diagonal
            ax.set_ylim(axis_limits[yvar])
            ax.set_xlim(axis_limits[xvar])
        else:  # on diagonal; only the x axis is meaningful for a histogram
            ax.set_xlim(axis_limits[xvar])
def pairwise_plot(
    theta_values,
    theta_star=None,
    alpha=None,
    distributions=None,
    axis_limits=None,
    title=None,
    add_obj_contour=True,
    add_legend=True,
    filename=None,
):
    """
    Plot pairwise relationship for theta values, and optionally alpha-level
    confidence intervals and objective value contours

    Parameters
    ----------
    theta_values: DataFrame or tuple

        * If theta_values is a DataFrame, then it contains one column for
          each theta variable and (optionally) an objective value column
          ('obj') and columns that contain Boolean results from confidence
          interval tests (labeled using the alpha value). Each row is a
          sample.

          * Theta variables can be computed from ``theta_est_bootstrap``,
            ``theta_est_leaveNout``, and ``leaveNout_bootstrap_test``.
          * The objective value can be computed using the
            ``likelihood_ratio_test``.
          * Results from confidence interval tests can be computed using
            the ``leaveNout_bootstrap_test``, ``likelihood_ratio_test``,
            and ``confidence_region_test``.

        * If theta_values is a tuple, then it contains a mean, covariance,
          and number of samples (mean, cov, n) where mean is a dictionary
          or Series (indexed by variable name), covariance is a DataFrame
          (indexed by variable name, one column per variable name), and n
          is an integer. The mean and covariance are used to create a
          multivariate normal sample of n theta values. The covariance can
          be computed using ``theta_est(calc_cov=True)``.
    theta_star: dict or Series, optional
        Estimated value of theta. The dictionary or Series is indexed by
        variable name. Theta_star is used to slice higher dimensional
        contour intervals in 2D
    alpha: float, optional
        Confidence interval value, if an alpha value is given and the
        distributions list is empty, the data will be filtered by True/False
        values using the column name whose value equals alpha (see results
        from ``leaveNout_bootstrap_test``, ``likelihood_ratio_test``, and
        ``confidence_region_test``)
    distributions: list of strings, optional
        Statistical distribution used to define a confidence region,
        options = 'MVN' for multivariate_normal, 'KDE' for gaussian_kde, and
        'Rect' for rectangular. Defaults to no distributions.
        Confidence interval is a 2D slice, using linear interpolation at
        theta_star.
    axis_limits: dict, optional
        Axis limits in the format {variable: [min, max]}
    title: string, optional
        Plot title
    add_obj_contour: bool, optional
        Add a contour plot using the column 'obj' in theta_values.
        Contour plot is a 2D slice, using linear interpolation at theta_star.
    add_legend: bool, optional
        Add a legend to the plot
    filename: string, optional
        Filename used to save the figure
    """
    # None replaces the old mutable default ([]); behavior is identical
    if distributions is None:
        distributions = []

    assert isinstance(theta_values, (pd.DataFrame, tuple))
    assert isinstance(theta_star, (type(None), dict, pd.Series, pd.DataFrame))
    assert isinstance(alpha, (type(None), int, float))
    assert isinstance(distributions, list)
    assert set(distributions).issubset(set(["MVN", "KDE", "Rect"]))
    assert isinstance(axis_limits, (type(None), dict))
    assert isinstance(title, (type(None), str))
    assert isinstance(add_obj_contour, bool)
    assert isinstance(filename, (type(None), str))

    # If theta_values is a tuple containing (mean, cov, n), create a
    # DataFrame of n multivariate normal samples (fixed seed for
    # reproducibility)
    if isinstance(theta_values, tuple):
        assert len(theta_values) == 3
        mean = theta_values[0]
        cov = theta_values[1]
        n = theta_values[2]
        if isinstance(mean, dict):
            mean = pd.Series(mean)
        theta_names = mean.index
        mvn_dist = stats.multivariate_normal(mean, cov)
        theta_values = pd.DataFrame(
            mvn_dist.rvs(n, random_state=1), columns=theta_names
        )

    assert theta_values.shape[0] > 0

    if isinstance(theta_star, dict):
        theta_star = pd.Series(theta_star)
    if isinstance(theta_star, pd.DataFrame):
        theta_star = theta_star.loc[0, :]

    # Theta variable columns: everything except 'obj' and the numeric
    # (alpha-labeled) confidence-test result columns
    theta_names = [
        col
        for col in theta_values.columns
        if (col not in ["obj"])
        and (not isinstance(col, float))
        and (not isinstance(col, int))
    ]

    # Filter data by alpha
    if (alpha in theta_values.columns) and (len(distributions) == 0):
        thetas = theta_values.loc[theta_values[alpha] == True, theta_names]
    else:
        thetas = theta_values[theta_names]

    if theta_star is not None:
        theta_star = theta_star[theta_names]

    legend_elements = []

    g = sns.PairGrid(thetas)

    # Plot histogram on the diagonal
    # Note: distplot is deprecated and will be removed in a future
    # version of seaborn, use histplot. distplot is kept for older
    # versions of python.
    if check_min_version(sns, "0.11"):
        g.map_diag(sns.histplot)
    else:
        g.map_diag(sns.distplot, kde=False, hist=True, norm_hist=False)

    # Plot filled contours using all theta values based on obj
    if "obj" in theta_values.columns and add_obj_contour:
        g.map_offdiag(
            _add_obj_contour,
            columns=theta_names,
            data=theta_values,
            theta_star=theta_star,
        )

    # Plot thetas
    g.map_offdiag(plt.scatter, s=10)
    legend_elements.append(
        matplotlib.lines.Line2D(
            [0],
            [0],
            marker="o",
            color="w",
            label="thetas",
            markerfacecolor="cadetblue",
            markersize=5,
        )
    )

    # Plot theta*
    if theta_star is not None:
        g.map_offdiag(
            _add_scatter, color="k", columns=theta_names, theta_star=theta_star
        )
        legend_elements.append(
            matplotlib.lines.Line2D(
                [0],
                [0],
                marker="o",
                color="w",
                label="theta*",
                markerfacecolor="k",
                markersize=6,
            )
        )

    # Plot confidence regions
    colors = ["r", "mediumblue", "darkgray"]
    if (alpha is not None) and (len(distributions) > 0):
        if theta_star is None:
            print(
                """theta_star is not defined, confidence region slice will be
                plotted at the mean value of theta"""
            )
            theta_star = thetas.mean()

        mvn_dist = None
        kde_dist = None
        for i, dist in enumerate(distributions):
            if dist == "Rect":
                lb, ub = fit_rect_dist(thetas, alpha)
                g.map_offdiag(
                    _add_rectangle_CI,
                    color=colors[i],
                    columns=theta_names,
                    lower_bound=lb,
                    upper_bound=ub,
                )
                legend_elements.append(
                    matplotlib.lines.Line2D([0], [0], color=colors[i], lw=1, label=dist)
                )
            elif dist == "MVN":
                mvn_dist = fit_mvn_dist(thetas)
                Z = mvn_dist.pdf(thetas)
                # pdf value at the alpha percentile defines the contour level
                score = stats.scoreatpercentile(Z, (1 - alpha) * 100)
                g.map_offdiag(
                    _add_scipy_dist_CI,
                    color=colors[i],
                    columns=theta_names,
                    ncells=100,
                    alpha=score,
                    dist=mvn_dist,
                    theta_star=theta_star,
                )
                legend_elements.append(
                    matplotlib.lines.Line2D([0], [0], color=colors[i], lw=1, label=dist)
                )
            elif dist == "KDE":
                kde_dist = fit_kde_dist(thetas)
                Z = kde_dist.pdf(thetas.transpose())
                score = stats.scoreatpercentile(Z, (1 - alpha) * 100)
                g.map_offdiag(
                    _add_scipy_dist_CI,
                    color=colors[i],
                    columns=theta_names,
                    ncells=100,
                    alpha=score,
                    dist=kde_dist,
                    theta_star=theta_star,
                )
                legend_elements.append(
                    matplotlib.lines.Line2D([0], [0], color=colors[i], lw=1, label=dist)
                )

    _set_axis_limits(g, axis_limits, thetas, theta_star)

    for ax in g.axes.flatten():
        ax.ticklabel_format(style="sci", scilimits=(-2, 2), axis="both")
        if add_legend:
            xvar, yvar, loc = _get_variables(ax, theta_names)
            # Place the legend on the bottom-left subplot only
            if loc == (len(theta_names) - 1, 0):
                ax.legend(handles=legend_elements, loc="best", prop={"size": 8})
    if title:
        g.fig.subplots_adjust(top=0.9)
        g.fig.suptitle(title)

    # Work in progress
    # Plot lower triangle graphics in separate figures, useful for presentations
    lower_triangle_only = False
    if lower_triangle_only:
        for ax in g.axes.flatten():
            xvar, yvar, (xloc, yloc) = _get_variables(ax, theta_names)
            if xloc < yloc:  # lower triangle
                ax.remove()
                ax.set_xlabel(xvar)
                ax.set_ylabel(yvar)
                fig = plt.figure()
                ax.figure = fig
                fig.axes.append(ax)
                fig.add_axes(ax)
                # Borrow the position of a dummy subplot, then discard it
                f, dummy = plt.subplots()
                bbox = dummy.get_position()
                ax.set_position(bbox)
                dummy.remove()
                plt.close(f)
                ax.tick_params(reset=True)
                if add_legend:
                    ax.legend(handles=legend_elements, loc="best", prop={"size": 8})
        plt.close(g.fig)

    if filename is None:
        plt.show()
    else:
        plt.savefig(filename)
        plt.close()
def fit_rect_dist(theta_values, alpha):
    """
    Fit an alpha-level rectangular distribution to theta values

    Parameters
    ----------
    theta_values: DataFrame
        Theta values, columns = variable names
    alpha: float
        Confidence interval value (e.g., 0.95)

    Returns
    ---------
    tuple containing lower bound and upper bound for each variable
    """
    assert isinstance(theta_values, pd.DataFrame)
    assert isinstance(alpha, (int, float))

    # Two-tailed t statistic at confidence level alpha with n-1 dof
    tval = stats.t.ppf(1 - (1 - alpha) / 2, len(theta_values) - 1)
    m = theta_values.mean()
    s = theta_values.std()
    lower_bound = m - tval * s
    upper_bound = m + tval * s
    return lower_bound, upper_bound
def fit_mvn_dist(theta_values):
    """
    Fit a multivariate normal distribution to theta values

    Parameters
    ----------
    theta_values: DataFrame
        Theta values, columns = variable names

    Returns
    ---------
    scipy.stats.multivariate_normal distribution
    """
    assert isinstance(theta_values, pd.DataFrame)

    # allow_singular=True tolerates a singular sample covariance (e.g.
    # perfectly correlated columns or fewer samples than variables)
    dist = stats.multivariate_normal(
        theta_values.mean(), theta_values.cov(), allow_singular=True
    )
    return dist
def fit_kde_dist(theta_values):
    """
    Fit a Gaussian kernel-density distribution to theta values

    Parameters
    ----------
    theta_values: DataFrame
        Theta values, columns = variable names

    Returns
    ---------
    scipy.stats.gaussian_kde distribution
    """
    assert isinstance(theta_values, pd.DataFrame)

    # gaussian_kde expects data with shape (n_variables, n_samples)
    dist = stats.gaussian_kde(theta_values.transpose().values)
    return dist
def _get_grouped_data(data1, data2, normalize, group_names):
if normalize:
data_median = data1.median()
data_std = data1.std()
data1 = (data1 - data_median) / data_std
data2 = (data2 - data_median) / data_std
# Combine data1 and data2 to create a grouped histogram
data = pd.concat({group_names[0]: data1, group_names[1]: data2})
data.reset_index(level=0, inplace=True)
data.rename(columns={"level_0": "set"}, inplace=True)
data = data.melt(id_vars="set", value_vars=data1.columns, var_name="columns")
return data
def grouped_boxplot(
    data1, data2, normalize=False, group_names=["data1", "data2"], filename=None
):
    """
    Plot a grouped boxplot to compare two datasets

    The datasets can be normalized by the median and standard deviation
    of data1.

    Parameters
    ----------
    data1: DataFrame
        Data set, columns = variable names
    data2: DataFrame
        Data set, columns = variable names
    normalize : bool, optional
        Normalize both datasets by the median and standard deviation of data1
    group_names : list, optional
        Names used in the legend
    filename: string, optional
        Filename used to save the figure
    """
    assert isinstance(data1, pd.DataFrame)
    assert isinstance(data2, pd.DataFrame)
    assert isinstance(normalize, bool)
    assert isinstance(group_names, list)
    assert isinstance(filename, (type(None), str))

    data = _get_grouped_data(data1, data2, normalize, group_names)

    plt.figure()
    sns.boxplot(data=data, hue="set", y="value", x="columns", order=data1.columns)
    # Strip default legend title and axis labels for a cleaner figure
    plt.gca().legend().set_title("")
    plt.gca().set_xlabel("")
    plt.gca().set_ylabel("")

    if filename is None:
        plt.show()
    else:
        plt.savefig(filename)
        plt.close()
def grouped_violinplot(
    data1, data2, normalize=False, group_names=["data1", "data2"], filename=None
):
    """
    Plot a grouped violinplot to compare two datasets

    The datasets can be normalized by the median and standard deviation
    of data1.

    Parameters
    ----------
    data1: DataFrame
        Data set, columns = variable names
    data2: DataFrame
        Data set, columns = variable names
    normalize : bool, optional
        Normalize both datasets by the median and standard deviation of data1
    group_names : list, optional
        Names used in the legend
    filename: string, optional
        Filename used to save the figure
    """
    assert isinstance(data1, pd.DataFrame)
    assert isinstance(data2, pd.DataFrame)
    assert isinstance(normalize, bool)
    assert isinstance(group_names, list)
    assert isinstance(filename, (type(None), str))

    data = _get_grouped_data(data1, data2, normalize, group_names)

    plt.figure()
    # split=True draws the two groups as halves of a single violin
    sns.violinplot(
        data=data, hue="set", y="value", x="columns", order=data1.columns, split=True
    )
    # Strip default legend title and axis labels for a cleaner figure
    plt.gca().legend().set_title("")
    plt.gca().set_xlabel("")
    plt.gca().set_ylabel("")

    if filename is None:
        plt.show()
    else:
        plt.savefig(filename)
        plt.close()