Source code for twaml.viz

# -*- coding: utf-8 -*-

"""twaml.viz

A module to aid visualizing our datasets

"""

from .data import dataset
import matplotlib as mpl

mpl.use("pdf")
import matplotlib.pyplot as plt
from typing import List, Optional, Tuple
import numpy as np
import math


def compare_distributions(
    dist1,
    dist2,
    bins: Optional["np.ndarray"] = None,
    titles: Optional[List[str]] = None,
    colors: Optional[List[str]] = None,
    ratio: bool = True,
    weight1: Optional["np.ndarray"] = None,
    weight2: Optional["np.ndarray"] = None,
    **subplots_kw,
):
    """Compare two histogrammed distributions with matplotlib

    Both distributions are drawn as step histograms on the same axes
    using identical bin edges (the edges produced for ``dist1`` are
    reused for ``dist2``). Optionally a second, smaller panel shows
    the bin-by-bin ratio ``dist1 / dist2``.

    Parameters
    ----------
    dist1
        any mpl-histogrammable object (``np.ndarray``, ``pd.Series``, etc.)
    dist2
        any mpl-histogrammable object (``np.ndarray``, ``pd.Series``, etc.)
    bins: np.ndarray
        define the bin edges
    titles: List[str]
        labels for the distributions; defaults to ``["dist1", "dist2"]``
    colors: List[str]
        colors for the distributions; defaults to ``["C0", "C1"]``
    ratio: bool
        add a ratio plot
    weight1: Optional[np.ndarray]
        weights associated with dist1
    weight2: Optional[np.ndarray]
        weights associated with dist2
    subplots_kw: Dict
        all additional keywords to send to ``matplotlib.pyplot.subplots``

    Returns
    -------
    fig : matplotlib.figure.Figure
    ax : matplotlib.axes.Axes or array of them
        With ``ratio=True`` this is an array of two Axes (main panel,
        ratio panel); otherwise it is whatever
        ``matplotlib.pyplot.subplots`` returns for ``subplots_kw``.
    h1
        the return of ``matplotlib.axes.Axes.hist`` for dist1
    h2
        the return of ``matplotlib.axes.Axes.hist`` for dist2

    """
    # Resolve defaults here rather than in the signature so that no
    # mutable list object is shared between calls.
    if titles is None:
        titles = ["dist1", "dist2"]
    if colors is None:
        colors = ["C0", "C1"]

    if ratio:
        fig, ax = plt.subplots(
            2,
            1,
            sharex=True,
            gridspec_kw=dict(height_ratios=[3, 1], hspace=0.025),
            **subplots_kw,
        )
        main_ax = ax[0]
    else:
        fig, ax = plt.subplots(**subplots_kw)
        main_ax = ax

    # Weights are applied in both modes; dist2 reuses the edges computed
    # for dist1 so the two histograms are directly comparable.
    h1 = main_ax.hist(
        dist1,
        bins=bins,
        histtype="step",
        label=titles[0],
        color=colors[0],
        weights=weight1,
    )
    h2 = main_ax.hist(
        dist2,
        bins=h1[1],
        histtype="step",
        label=titles[1],
        color=colors[1],
        weights=weight2,
    )
    edges = h1[1]

    if ratio:
        centers = (edges[:-1] + edges[1:]) / 2.0
        # Empty bins in dist2 yield inf/nan, which matplotlib simply does
        # not draw; silence the accompanying numpy warnings.
        with np.errstate(divide="ignore", invalid="ignore"):
            ratio_values = h1[0] / h2[0]
        ax[1].plot(centers, ratio_values, "k-")
        # Reference line at unity spanning the full width of the panel,
        # independent of the number of bins.
        ax[1].axhline(1.0, color="k", linestyle="--")
        ax[1].set_ylim([0, 2])
        ax[1].set_xlim([edges[0], edges[-1]])
    else:
        ax.set_xlim([edges[0], edges[-1]])

    return fig, ax, h1, h2


def compare_columns(
    ds1: dataset,
    ds2: dataset,
    columns: Optional[List[str]] = None,
    names: Optional[Tuple[str, str]] = None,
    colors: Optional[Tuple[str, str]] = None,
    density=True,
    **subplots_kw,
):
    """Generate a set of histograms comparing the distributions of a
    set of columns in two different datasets.

    One panel is drawn per column on a near-square grid and the figure
    is written to ``{ds1.name}_{ds2.name}.pdf`` in the current working
    directory. Nothing is returned.

    Parameters
    ----------
    ds1: twaml.data.dataset
        The first dataset
    ds2: twaml.data.dataset
        The second dataset
    columns: Optional[List[str]]
        Columns to plot; if None, plot all
    names: Optional[Tuple[str,str]]
        Names for the legend, if None use the dataset ``name`` attributes
    colors: Optional[Tuple[str,str]]
        Colors for the histograms, if None use ``("C0", "C1")``
    density: bool
        Feed to ``density`` parameter in ``matplotlib.pyplot.hist``
    subplots_kw: Dict
        all additional keywords to send to ``matplotlib.pyplot.subplots``
        (``squeeze`` is always forced to ``False`` internally)

    Raises
    ------
    AssertionError
        if ``columns`` is None and the datasets have different columns,
        or if a requested column is missing from either dataset
    ValueError
        if there are no columns to plot

    """
    col1 = list(ds1.df.columns)
    col2 = list(ds2.df.columns)
    if columns is None:
        assert col1 == col2, "different columns"
        cols = col1
    else:
        cols = list(columns)
        for c in cols:
            assert c in col1, f"{c} column not in ds1"
            assert c in col2, f"{c} column not in ds2"

    if not cols:
        raise ValueError("no columns to plot")

    if names is None:
        names = (ds1.name, ds2.name)
    if colors is None:
        colors = ("C0", "C1")

    # Smallest near-square grid that holds every column.
    ncols = math.ceil(math.sqrt(len(cols)))
    nrows = math.ceil(len(cols) / ncols)

    # squeeze=False guarantees a 2-D array of Axes even for a 1x1 grid,
    # so the flatten below is always valid.
    grid_kw = dict(subplots_kw)
    grid_kw["squeeze"] = False
    fig, axs = plt.subplots(nrows, ncols, **grid_kw)
    panels = axs.flatten()

    w1, w2 = ds1.weights, ds2.weights
    for col, a in zip(cols, panels):
        dist1 = ds1.df[col].to_numpy()
        dist2 = ds2.df[col].to_numpy()
        xmin = min(np.min(dist1), np.min(dist2))
        xmax = max(np.max(dist1), np.max(dist2))

        both_integer = np.issubdtype(dist1.dtype, np.integer) and np.issubdtype(
            dist2.dtype, np.integer
        )
        if both_integer:
            # One unit-width bin centred on every integer in [lo, hi].
            lo, hi = int(xmin), int(xmax)
            nbins = hi - lo + 1
            xmin, xmax = lo - 0.5, hi + 0.5
        else:
            nbins = 50
            if xmin == xmax:
                # Avoid zero-width bins for a constant-valued column.
                xmin, xmax = xmin - 0.5, xmax + 0.5

        a.hist(
            [dist1, dist2],
            bins=np.linspace(xmin, xmax, nbins + 1),
            weights=[w1, w2],
            density=density,
            histtype="step",
            label=list(names),
            color=list(colors),
        )
        a.set_xlabel(col)

    # A single legend is enough: every panel uses the same two entries.
    panels[0].legend()

    # Hide grid cells that have no column assigned.
    for unused in panels[len(cols):]:
        unused.axis("off")

    fig.savefig(f"{ds1.name}_{ds2.name}.pdf")