Source code for idq.classifiers.ovl

import multiprocessing as mp
import time as TIME

import numpy as np
import h5py

from ligo.segments import segment, segmentlist
from gwpy.segments import SegmentList, DataQualityFlag, DataQualityDict

from .. import exceptions
from .. import features
from .. import hookimpl
from .. import plots
from .. import utils
from .. import io
from ..io.reporters.hdf5 import HDF5Reporter
from ..series import SeriesInfo
from . import SupervisedClassifier, ClassifierModel, DEFAULT_DT


DEFAULT_NUM_PROC = 1
# how long we sleep between checking that mp.map_async results have finished
DEFAULT_MULTIPROCESSING_SLEEP = 0.001
DEFAULT_MULTIPROCESSING_CHUNKSIZE = None


[docs]class Vetolist(ClassifierModel):
    """
    a representation of the OVL and DOVL vetolist
    this is stored as the internal model within OVL and DOVL

    *WRITE ME*
        decribe protections of attributes and their meanings
            _default_thresholds
            _default_windows
            _table_dtypes
            _repr_head
            _repr_body
    """

    _default_thresholds = [15, 20, 25, 30, 35, 45, 55, 100, 500, 1000]
    _default_windows = [0.500, 0.100, 0.050, 0.025, 0.010, 0.005, 0.001]

    _table_dtypes = [
        ("channel", "|S75"),
        ("threshold", "float"),
        ("window", "float"),
        ("num_glitch", "float"),
        ("num_clean", "float"),  # like livetime
        ("num_aux", "float"),
        ("num_vetod_exp", "float"),
        ("num_vetod_gch", "float"),
        ("num_vetod_cln", "float"),  # like dsec
        ("poisson_signif", "float"),
        ("use_percentage", "float"),
        ("eff", "float"),
        ("fap", "float"),
        ("eff_fap", "float"),
        ("cnum_vetod_gch", "float"),
        ("cnum_vetod_cln", "float"),
        ("rank", "float"),
    ]

    _repr_body = " ".join(
        "%-40s" if "|S" in dtype[1] else "%15.9f" for dtype in _table_dtypes
    )  # NOTE: this may be fragile...
    _repr_head = (
        " ".join(
            "%-40s" if dtype[0] == "channel" else "%15s" for dtype in _table_dtypes
        )
    ) % tuple(
        dtype[0] for dtype in _table_dtypes
    )  # NOTE: this may be fragile...

    def __init__(
        self,
        start,
        end,
        segs=None,
        model_id=None,
        generate_id=False,
        channels=None,
        thresholds=None,
        windows=None,
        significance=None,
        **kwargs,
    ):
        ClassifierModel.__init__(
            self, start, end, segs=segs, model_id=model_id, generate_id=generate_id
        )

        if not channels:
            channels = []
        if not thresholds:
            thresholds = self._default_thresholds
        if not windows:
            windows = self._default_windows

        self._channels = channels
        self._thresholds = thresholds
        self._windows = windows
        self.significance = significance

        # set up table
        self.init_table()

        # select how features are extracted
        self.selector = features.Selector(
            self.relevant_channels,
            bounds={
                self.significance: (self.minimum_threshold, np.inf),
                **kwargs.get("bounds", {}),
            },
        )

    @property
    def channels(self):
        return self._channels

    @channels.setter
    def channels(self, channels):
        for chan in channels:
            assert isinstance(chan, str), "channels must be strings"
        self._channels = channels

    @property
    def relevant_channels(self):
        return set(channel.decode("utf-8") for channel in self._table["channel"])

    @property
    def thresholds(self):
        return self._thresholds

    @thresholds.setter
    def thresholds(self, thresholds):
        for thr in thresholds:
            assert isinstance(
                thr, (int, float)
            ), "thresholds must be integers or floats"
        self._thresholds = thresholds

    @property
    def relevant_thresholds(self):
        return set(self._table["threshold"])

    @property
    def minimum_threshold(self):
        return np.min(self["threshold"]) if len(self["threshold"]) else np.infty

    @property
    def windows(self):
        return self._windows

    @windows.setter
    def windows(self, windows):
        for win in windows:
            assert isinstance(win, (int, float)), "windows must be integers or floats"
        self._windows = windows

    @property
    def relevant_windows(self):
        return set(self._table["window"])

    def __iter__(self):
        """
        returns a generator that will iterate over the internal table
        """
        for line in self._table:
            yield line

    def __getitem__(self, index):
        return self._table[index]

    def __len__(self):
        return len(self._table)

    def __repr__(self):
        return str(self._table)

    def __str__(self):
        """
        makes a nice ASCII formatted table instead of just printing the
        structured array "as is", which is what __repr__ does
        """
        s = [self._repr_head]
        for configuration in self:
            s.append(self._repr_body % tuple(configuration))
        return "\n".join(s)

[docs]    def dump(self, path, nickname=None, **kwargs):
        """
        write this object into path should contain all info, not just the
        configuration order Uses hdf5 format, and asserts that the path must end
        in ".hdf5"

        we can add attributes to this file via the kwargs specified in this
        function's signature

        WARNING: we may want to add default attrs to the hdf5 file, like the
        code's version and the git repo location, etc. we also may want to add
        information about how this file was produced, but that may be optional
        and can be specified via kwargs

        NOTE: we'll likely use an instance of HDF5Reporter rather than this
        method
        """
        assert path.endswith(".hdf5"), "path=%s must end with .hdf5" % (path)

        if nickname is None:
            nickname = "vetolist"
        with h5py.File(path, "w") as file_obj:
            dataset = file_obj.create_dataset(nickname, data=self._table)
            dataset.attrs.update(kwargs)

[docs]    def load(self, path, nickname=None, verbose=True):
        """
        load the data from file into memory
        will load all info, not just hte configuration order
        Uses hdf5 format, and asserts that the path must end in ".hdf5"

        if verbose: print attrs from dataset

        NOTE:
            we'll likely use an instance of HDF5Reporter rather than this method
        """
        assert path.endswith(".hdf5"), "path=%s must end with .hdf5" % (path)

        with h5py.File(path, "r") as file_obj:
            if nickname is None:
                keys = list(file_obj.keys())
                assert (
                    len(keys) == 1
                ), "exactly 1 key required in the HDF5 file unless a nickname is chosen"
                # expect a single key and just use that this makes it a bit more
                # independent of who wrote this to disk
                dataset = file_obj[keys[0]]
            else:
                dataset = file_obj[nickname]
            # extract the numpy array from the data set
            self._table = dataset[...]
            # update which channels are present
            self.channels = set(
                channel.decode("utf-8") for channel in self.table["channel"]
            )
            self.windows = set(self.table["window"])
            self.thresholds = set(self.table["threshold"])

            if verbose:
                for k, v in dataset.attrs.items():
                    print("%s\t%s" % (k, v))

[docs]    def init_table(self):
        """
        instantiate the table used to store data internally
        """
        self._table = []
        for channel in self.channels:
            for threshold in self.thresholds:
                for window in self.windows:
                    self._table.append(
                        (channel, threshold, window)
                        + (np.nan,) * (len(self._table_dtypes) - 3)
                    )
        # NOTE: we don't delegate to set_table because we don't need to check
        # the table for consistency...
        self._table = np.array(self._table, dtype=self._table_dtypes)

    @property
    def table(self):
        return self._table

    @table.setter
    def table(self, table):
        """
        WARNING: do not use this unless you really know what you're doing!
        """
        assert isinstance(table, np.ndarray), "table is not a numpy.ndarray!"

        assert len(table.dtype) == len(
            self._table_dtypes
        ), "new table has the wrong number of columns!"
        for a, b in zip(table.dtype.names, self._table_dtypes):
            assert a == b[0], "ordering of new table is inconsistent with Vetolist!"
            assert table.dtype[a] == b[1], (
                "types of new table are inconsistent with Vetolist! "
                "%s (new) vs %s (old)" % (table.dtype[a], b[1])
            )

        channels = self.channels
        for channel in set(table["channel"]):
            assert (
                channel.decode("utf-8") in channels
            ), "new table has inconsistent channels with this Vetolist!"

        thresholds = self.thresholds
        for threshold in set(table["threshold"]):
            assert (
                threshold in thresholds
            ), "new table has inconsistent thresholds with this Vetolist!"

        windows = self.windows
        for window in set(table["window"]):
            assert (
                window in windows
            ), "new table has inconsistent windows with this Vetolist!"

        self._table = table

[docs]    def metric2rank(self, value, metric="eff_fap", **kwargs):
        """
        map our metric value into a rank we define a somewhat ad-hoc monotonic
        function with a separate scale for each metric
        """
        if metric == "use_percentage":
            return self._use_percentage_to_rank(value)
        elif metric == "poisson_signif":
            return self._poisson_signif_to_rank(value)
        elif metric == "eff_fap":
            return self._eff_fap_to_rank(value)
        else:
            raise ValueError("metric=%s not understood" % metric)

    @staticmethod
    def _use_percentage_to_rank(val):
        """
        map use_percentage metric to a rank between 0 and 1.
        """
        return val

    @staticmethod
    def _poisson_signif_to_rank(val):
        """
        map poisson_signif metric to a rank between 0 and 1.
        """
        score = 0.009 * np.power(val, 1.5)
        return score / (1 + score)

    @staticmethod
    def _eff_fap_to_rank(val):
        """
        map eff_fap metric to a rank between 0 and 1.
        """
        score = 0.02 * val
        return score / (1 + score)

[docs]    def reorder(self, metric="eff_fap", metric_weights=None, **kwargs):
        """
        re-order the vetolist according to this metric
        we always sort so that bigger numbers show up first
        """
        # assign ranks
        if metric == "combined":
            metric_names = ("eff_fap", "use_percentage", "poisson_signif")
            if metric_weights:
                weights = [metric_weights[metric] for metric in metric_names]
            else:
                # use equal weights
                weights = [1 / 3.0, 1 / 3.0, 1 / 3.0]
            metrics = np.array(
                [self.metric2rank(self.table[m], metric=m) for m in metric_names]
            )
            self.table["rank"] = np.average(metrics, axis=0, weights=weights)
        else:
            self.table["rank"] = self.metric2rank(self.table[metric], metric=metric)

        # reorder - sort by metric, then by threshold, then by window size
        # NOTE: the old implementation of OVL did not re-order based on windows;
        # it only used the thresholds and then the metric. this can produce
        # slightly different orderings, but they probably will be statistically
        # equivalent regarding performance
        order = [
            (ind, config["window"], config["threshold"], config["rank"])
            for ind, config in enumerate(self)
        ]
        order.sort(key=lambda x: (x[3], x[2], -x[1]), reverse=True)
        self.table = self.table[[veto[0] for veto in order]]

[docs]    def prune(self, minimum, metric="eff_fap", **kwargs):
        """
        remove all configurations with metric<minimum

        WARNING:

         * this will cause data to be forgotten in an unrecoverable way!
         * only use this if you are certain you know what you're doing!
        """
        self.table = self.table[self.table[metric] >= minimum]

[docs]    def redundancies(self, dataset, time, significance, verbose=False, Verbose=False):
        """
        computes the intersection and overlap of vetosegments for each possible
        configuration. This should contain all information necessary to
        determine which channels are redundant.

        we only return information based on the current model, which may have
        trained itself down to a subset of the total possible configurations

        returns table, headers table is a matrix of the livetime of the
        intersections of veto segments from each configuration pair headers is a
        list of the (channel, threshold, window) tuples, with the same order as
        the columns in table
        """
        verbose |= Verbose

        # set up the data product
        N = len(self)
        table = np.empty((N, N), dtype=float)
        headers = self[["channel", "threshold", "window"]]

        if verbose:
            print("caching triggers within dataset")
        if not dataset.is_configured():
            dataset.configure(self.selector)
        dataset.load_data()

        # iterate over configurations
        if verbose:
            print("computing aux segments")
        auxsegs = []
        for config in self:
            # compute segments once for each configuration to avoid repeating
            # work in the nested loop
            segs = self._configuration2segments(config, dataset, time, significance)
            auxsegs.append(dataset.segs & segs)

        if verbose:
            print("computing overlap between veto configurations")
        tmp = "%d - %d"
        for i, config1 in enumerate(self):
            if Verbose:
                print(tmp % (i, i))
            segs1 = auxsegs[i]
            # livetime of segments intersected with themselves
            table[i, i] = utils.livetime(segs1)

            # iterate over the rest of the configurations and fill in the
            # intersections
            for j, config2 in enumerate(self[i + 1 :], 1):
                if Verbose:
                    print(tmp % (i, j))
                segs2 = auxsegs[i + j]
                table[i, i + j] = table[i + j, i] = utils.livetime(segs1 & segs2)

        # return data
        return table, headers

[docs]    def nonredundant_segments(self, dataset, time, significance):
        """
        Checks which configurations are redundant and then returns
        the indices of the non-redundant configurations, and their segments
        """
        intersect_table, _ = self.redundancies(dataset, time, significance)
        segments = self.segments(dataset, time, significance)

        keep_inds = []
        for i, veto_segs in enumerate(segments):
            # live time is element (i,i) of intersect_table
            # check if veto was active
            if intersect_table[i, i] == 0:
                continue

            # check if redundant with any previous veto
            # redundant if same channel, and segments are a subset of other vetoes
            keep = True
            for j, other_segs in enumerate(segments[:i]):
                # check if channels are the same
                if self.table[i]["channel"] != self.table[j]["channel"]:
                    continue

                # veto i is redundant with veto j if livetime
                # of their intersection equals i's livetime
                if intersect_table[i, i] == intersect_table[i, j]:
                    keep = False
                    break

            if keep:
                keep_inds.append(i)

        return np.array(keep_inds), [segments[i] for i in keep_inds]

[docs]    def save_as_data_quality_flags(
        self, out_path, dataset, time, significance, remove_redundant=True
    ):
        """
        Saves the veto list as a set of data quality flags in an hdf5 file
        """

        if remove_redundant:
            inds, configuration_segments = self.nonredundant_segments(
                dataset, time, significance
            )
            if len(inds) > 0:
                configurations = self[inds]
            else:
                configurations = np.array([])
        else:
            configurations = self[:]
            configuration_segments = self.segments(dataset, time, significance)

        veto_dict = DataQualityDict()
        lookup_dict = {}
        pairs = zip(configurations, configuration_segments)
        for i, (configuration, aux_segs) in enumerate(pairs):
            channel = configuration["channel"].decode("utf-8")
            threshold = configuration["threshold"]
            window = configuration["window"]
            name = f"{channel}_{threshold}_{window}:1"
            active = SegmentList(aux_segs)
            known = dataset.segs
            dq_flag = DataQualityFlag(
                name=name, active=active, known=known, isgood=False
            )
            veto_dict[name] = dq_flag
            lookup_dict[name] = configuration

        veto_dict.write(out_path)

        with h5py.File(out_path, "r+") as f:
            for name, grp in f.items():
                configuration = lookup_dict[name]
                grp.attrs["channel"] = configuration["channel"].decode("utf-8")
                grp.attrs["threshold"] = configuration["threshold"]
                grp.attrs["window"] = configuration["window"]
                grp.attrs["rank"] = configuration["rank"]
                grp.attrs["eff"] = configuration["eff"]
                grp.attrs["fap"] = configuration["fap"]
                grp.attrs["use_percentage"] = configuration["use_percentage"]
                grp.attrs["poisson_signif"] = configuration["poisson_signif"]

[docs]    def feature_importance(self):
        """
        should essentially be an ordered list of veto configurations
        although this is really a structured numpy array
        """
        return self.table[["channel"], ["threshold"], ["window"]]

[docs]    def feature_importance_figure(
        self,
        dataset,
        start,
        end,
        t0,
        time,
        significance,
        colormap=plots.DEFAULT_COLORMAP,
        nonerow=True,
        nonecolor=None,
        verbose=False,
        **kwargs,
    ):
        """generate and return a figure demonstrating the feature importance
        based on the data within dataset; should return a figure
        object."""
        # go dig up data first because we dynamically size the figure based on
        # the number of rows grab what we need from the dataset factory
        if verbose:
            print("retrieving triggers")

        if not dataset.is_configured():
            dataset.configure(self.selector)
        dataset.load_data()

        if verbose:
            print("setting up plotting options")
        cmap = plots.plt.get_cmap(colormap)
        dylim = 0.10
        low = 0.01  # used for plotting
        height = 1 - 2 * low
        # yticklabel template
        tmp = (
            "%50s\n"
            + r"$\mathrm{"
            + significance
            + r"}\geq%.1f$, "
            + r"$|\Delta\mathrm{"
            + time
            + r"}|\leq%.3f s$, $\mathrm{rank}=%.3f$"
        )

        # map linewidth to time span
        figwidth = 10
        seg_linewidth = 0.25
        linewidth_t = seg_linewidth / ((figwidth * 72) / (end - start))

        rows = []
        ticklabels = []  # used for labeling
        ticks = []
        count = 0
        segs = segmentlist([segment(start, end)])
        none_segments = segmentlist(
            [segment(start, end)]
        )  # used to keep track of times when nothing was active
        if nonecolor is None:
            nonecolor = cmap(0)
        else:
            nonecolor = nonecolor

        if verbose:
            print("iterating over configurations")

        configuration_inds, configuration_segs = self.nonredundant_segments(
            dataset, time, significance
        )
        if len(configuration_inds) > 0:
            configurations = self[configuration_inds]
        else:
            configurations = np.array([])

        reverse_order = zip(configurations[::-1], configuration_segs[::-1])
        for configuration, aux_segs in reverse_order:
            # put most important things at the top of the figure
            if verbose:
                print("    %s" % configuration)
            # go find the segs for this channel
            aux_segs = segs & aux_segs
            if utils.livetime(aux_segs):
                # there is some time when this was active
                # expand segments to a minimum size t depending on linewidth
                aux_segs = utils.expand_segments(aux_segs, linewidth_t)

                if nonerow:
                    # there is a "None" row, so remove these from the none_segments
                    none_segments -= aux_segs
                else:
                    # no "None" row, so plot a background bar
                    rows.append(
                        (
                            (
                                [start - t0, end - t0],
                                [low + count] * 2,
                                [low + count + height] * 2,
                            ),
                            dict(color=nonecolor, edgecolor="w", linewidth=1.00),
                        )
                    )

                # plot the active segments
                color = cmap(configuration["rank"])
                for seg in aux_segs:
                    rows.append(
                        (
                            (
                                np.array(seg) - t0,
                                [low + count] * 2,
                                [low + count + height] * 2,
                            ),
                            dict(color=color, edgecolor=color, linewidth=seg_linewidth),
                        )
                    )
                    # NOTE: we play with edgecolors and linewidths here to make
                    # sure all segments are visible (linewidth is effectively a
                    # minimum width)

                ticklabels.append(
                    tmp
                    % (
                        configuration["channel"].decode("utf-8"),
                        configuration["threshold"],
                        configuration["window"],
                        configuration["rank"],
                    )
                )
                ticks.append(count + 0.5)
                count += 1

        if nonerow or (count == 0):
            # there is a None row or there are no active configs
            ticklabels.insert(0, "No Veto")
            ticks.insert(0, -0.5)
            # plot the none segments
            for seg in none_segments:
                rows.append(
                    (
                        (np.array(seg) - t0, [low - 1] * 2, [low - 1 + height] * 2),
                        dict(color=nonecolor, edgecolor="w", linewidth=0.25),
                    )
                )
                # NOTE: we set the edgecolor to white so that there is no
            # visible overlap between "none" and other configs
            ylim = (-1 - dylim, count + dylim)
            count += 1
        else:
            ylim = (-dylim, count + dylim)

        # now create the figure with dynamic sizing based on the number of rows
        rowheight = 0.3
        titleheight = 0.8
        labelheight = 0.6
        figheight = count * rowheight + titleheight + labelheight
        bottom = labelheight / figheight
        top = 1 - titleheight / figheight

        # should be enough to accomodate roughly our longest channel name...
        left = 0.4
        right = 0.9

        # NOTE: we always generate a new figure here because it doesn't make
        # sense to overlay these things for OVL...
        if verbose:
            print("generating figure")
        fig = plots.plt.figure(figsize=(figwidth, figheight))
        # place the main axes
        # NOTE: this could be fragile if someone mucks with the default settings
        ax = fig.add_axes([left, bottom, right - left, top - bottom])

        # place the colorbar
        pos = ax.get_position()
        d = 1 - pos.x1
        cbax = fig.add_axes([pos.x1 + 0.1 * d, pos.y0, 0.3 * d, pos.y1 - pos.y0])

        # add color bar
        if verbose:
            print("    plotting colorbar")
        cb = plots.matplotlib.colorbar.ColorbarBase(
            cbax,
            cmap=cmap,
            norm=plots.matplotlib.colors.Normalize(vmin=0.0, vmax=1.0),
            orientation="vertical",
        )
        if count <= 2:  # avoid overlapping rank labels
            cb.set_ticks([0, 0.5, 1.0])
        cbax.set_ylabel(plots.RANK_LABEL)  # label

        if verbose:
            print("    plotting rows")
        for args, kwargs in rows:
            ax.fill_between(*args, **kwargs)

        # add header/footer showing model provenance
        if verbose:
            print("    plotting header/footer")
        ymin, ymax = ylim
        color = "k"
        SEG = segmentlist([segment(start, end)])
        # shade the spots which were not included in training
        for seg in SEG - (SEG & self.segs):
            seg = np.array(seg) - t0
            ax.fill_between(
                seg,
                [ymin] * 2,
                [ymin + dylim] * 2,
                color=color,
                edgecolor="w",
                alpha=plots.DEFAULT_SHADE_ALPHA,
            )
            ax.fill_between(
                seg,
                [ymax - dylim] * 2,
                [ymax] * 2,
                color=color,
                edgecolor="w",
                alpha=plots.DEFAULT_SHADE_ALPHA,
            )

        # add darker shading to show where model stopped
        for seg in SEG - (SEG & segmentlist([segment(self.start, self.end)])):
            seg = np.array(seg) - t0
            ax.fill_between(
                seg,
                [ymin] * 2,
                [ymin + dylim] * 2,
                color=color,
                alpha=plots.DEFAULT_ALPHA,
            )
            ax.fill_between(
                seg,
                [ymax - dylim] * 2,
                [ymax] * 2,
                color=color,
                alpha=plots.DEFAULT_ALPHA,
            )

        # decorate
        if verbose:
            print("    decorating")
        ax.set_ylim(ylim)

        ax.set_yticks(ticks)
        ax.set_yticklabels(ticklabels, fontdict={"fontsize": 7.0})

        ax.set_xlim(xmin=start - t0, xmax=end - t0)
        ax.set_xlabel("Time [sec] relative to %.3f" % (t0))

        return fig

[docs]    def feature_importance_table(
        self, time, significance, dataset=None, map2str=True, **kwargs
    ):
        """should return (columns, data) compatible with the DQR's
        json.format_table (see use in idq/reports.py"""
        cols = [
            "channel",
            "threshold",
            "window",
            "poisson_signif",
            "use_percentage",
            "eff_fap",
            "rank",
            "cnum_vetod_gch",
            "cnum_vetod_cln",
        ]

        # needed because numpy might break without this...
        data = self.table[cols].copy()

        ngch = self.table["num_glitch"]
        if len(ngch):  # the first element will be non-zero
            data["cnum_vetod_gch"] /= ngch[0]
        cols[-2] = "efficiency"

        ncln = self.table["num_clean"]
        if len(ncln):  # the first element will be non-zero
            data["cnum_vetod_cln"] /= ncln[0]
        cols[-1] = "false alarm probability"

        if dataset:
            if not dataset.is_configured():
                dataset.configure(self.selector)
            dataset.load_data()

            configuration_inds, configuration_segs = self.nonredundant_segments(
                dataset, time, significance
            )
            if len(configuration_inds) > 0:
                data = data[configuration_inds]
            else:
                data = np.array([], dtype=self._table_dtypes)

        data = [list(_) for _ in data]
        cols = cols[:-2]
        if map2str:
            data = [[_[0]] + ["%.3f" % __ for __ in _[1:-2]] for _ in data]

        return cols, data

[docs]    def segments(self, dataset, time, significance, **kwargs):
        """
        generate veto segments for all configurations in the vetolist returns a
        list of segment lists in the order corresponding to the configurations
        within self.table
        """
        return [
            self._configuration2segments(configuration, dataset, time, significance)
            for configuration in self
        ]

    def _configuration2times(
        self, configuration, dataset, time, significance, **kwargs
    ):
        """
        get the times above threshold for this configuration
        """
        channel = configuration["channel"].decode("utf-8")  # extract parameters
        threshold = configuration["threshold"]
        try:
            trgs = dataset.features[channel].filter(
                f"{significance} >= {threshold}", f"{significance} <= {np.infty}"
            )
            # get times above threshold, we already filtered by threshold within
            # call to triggers
            return trgs[time]
        except KeyError:
            # no triggers
            return None

    def _configuration2segments(
        self, configuration, dataset, time, significance, **kwargs
    ):
        """
        generate veto segments for this configuration
        """
        window = configuration["window"]
        aux_times = self._configuration2times(
            configuration, dataset, time, significance
        )
        if aux_times is None:
            return segmentlist()
        else:
            return utils.times2segments(aux_times, window / 2.0)

[docs]    def timeseries(
        self,
        dataset,
        time,
        significance,
        metric="eff_fap",
        dt=DEFAULT_DT,
        segs=None,
        **kwargs,
    ):
        """
        generate timeseries for each segment in dataset.segments separately
        should be useful wihtin OVL.evaluate, vetolist2segments, etc

        iterates from the end of the table to the front, setting the values of
        the timeseries to the metric of the corresponding configuration at each
        step if more than one configuration is active at this time, the final
        timeseries will have the value corresponding to the configuration that's
        closer to the front of the list

          -> this makes sense if you assume the configurations are ordered by
          decending metric, which should be the case

        we extract the 'rank' column, which is set via calls to self.reorder.

        NOTE:

         * monotonic decrease in metric as you move down the table is not
         strictly guaranteed and will depend on whether the list has converged.
         * it is also possible that the list will never converge because of
         cycles (2 configurations flip back and forth forever with additional
         iterations) This may be a weakness of the algorithm!
        """
        if segs is None:
            segs = dataset.segs
        if not dataset.is_configured():
            dataset.configure(self.selector)
        dataset.load_data()

        ranks = [
            (np.zeros_like(t), t) for t in [np.arange(s, e, dt) for s, e in segs]
        ]  # set up zero'd arrays for the ranks
        for configuration in self[::-1]:  # work from the bottom to the top
            aux_segs = self._configuration2segments(
                configuration, dataset, time, significance
            )
            for rank, t in ranks:
                rank[utils.times_in_segments(t, aux_segs)] = configuration["rank"]
        return ranks


# --- OVL configuration performance helper functions

# used when estimating the poisson significance
_gammln_cof = [
    76.18009173,
    -86.50532033,
    24.01409822,
    -1.231739516e0,
    0.120858003e-2,
    -0.536382e-5,
]
_gammln_stp = 2.50662827465


def _gammpln(a, x):
    """
    lower incomplete gamma function
    """
    if a == 0:
        return 0.0  # np.log(1.)
    elif x < 0.0:
        raise ValueError("x<0")
    elif a < 0.0:
        raise ValueError("a<0")
    elif x == 0.0:
        if a == 0.0:
            return 0.0  # np.log(1.)
        else:
            return -np.infty  # np.log(0.)
    elif x < a + 1.0:
        return _gserln(a, x)[0]
    else:
        return np.log(1.0 - _gcf(a, x)[0])


def _gserln(a, x, itmax=10000, eps=3.0e-7):
    """
    Series approx'n to the incomplete gamma function.
    """
    gln = _gammln(a)
    if x < 0.0:
        raise ValueError("x<0")

    elif x == 0.0:
        return 0.0, gln

    logx = np.log(x)
    ap = a
    s = 1.0 / a
    delta = s
    n = 1
    while n <= itmax:
        ap += 1.0
        delta *= x / ap
        s += delta
        if abs(delta) < abs(s) * eps:
            return np.log(s) - x + a * logx - gln, gln
        n += 1
    raise exceptions.MaxIter(itmax)


def _gammln(xx):
    """
    Logarithm of the gamma function.
    """
    x = xx - 1.0
    tmp = x + 5.5
    tmp = (x + 0.5) * np.log(tmp) - tmp
    ser = 1.0
    for j in range(6):
        x += 1.0
        ser += _gammln_cof[j] / x
    return tmp + np.log(_gammln_stp * ser)


def _gcf(a, x, itmax=1000, eps=3.0e-7):
    """
    Continued fraction approx'n of the incomplete gamma function.
    """
    logx = np.log(x)
    gln = _gammln(a)
    gold = 0.0
    a0 = 1.0
    a1 = x
    b0 = 0.0
    b1 = 1.0
    fac = 1.0
    n = 1
    while n <= itmax:
        an = n
        ana = an - a
        a0 = (a1 + a0 * ana) * fac
        b0 = (b1 + b0 * ana) * fac
        anf = an * fac
        a1 = x * a0 + anf * a1
        b1 = x * b0 + anf * b1
        if a1 != 0.0:
            fac = 1.0 / a1
            g = b1 * fac
            if abs((g - gold) / g) < eps:
                return (g * np.exp(-x + a * logx - gln), gln)
            gold = g
        n += 1
    raise exceptions.MaxIter(itmax)


# defined here so that we can pickle this via multiprocessing
def _compute_dovl_configuration_performance_initializer(
    local_dataset, local_model, local_time, local_significance, local_FAPtype
):
    global dataset, model, time, significance, FAPtype
    dataset = local_dataset
    model = local_model
    time = local_time
    significance = local_significance
    FAPtype = local_FAPtype


def _compute_dovl_configuration_performance_wrapper(cind):
    _, statistics = _compute_dovl_configuration_performance(
        dataset, cind, model, time, significance, FAPtype
    )
    return statistics


def _compute_dovl_configuration_performance(
    dataset, ind, model, time, significance, FAPtype
):
    """
    computes this configuration's performance a convenient wrapper that is used
    within _recalculate and to avoid repeating code

    assumes both segments and gch_times are not empty (i.e.: there is data to
    veto)
    """
    num_glitch, num_clean = dataset.vectors2num()

    # figure out what is vetoed by this configuration
    num_vetod_gch = 0.0
    num_vetod_cln = 0.0
    num_aux = 0.0

    contained = []
    configuration = model[ind]
    seg = (
        np.array([-0.5, 0.5]) * configuration["window"]
    )  # will be used repeatedly within the loop
    for vector in dataset:
        # we reference the classifier data stored in each FeatureVector, which
        # is a bit of a cheat but should work... NOTE: if the dataset spans a
        # much larger amount of time than [vector.time - window/2, vector.time +
        # window/2] then the following is a substantial speed up compared to
        # generating a full list of segments repeatedly for each vector the
        # query to dataset may still be expensive (lots of I/O), so there's room
        # for improvement here
        aux_times = model._configuration2times(
            configuration,
            dataset,
            time,
            significance,
        )
        if aux_times is None:
            continue

        # count how many loud things remain nearby
        naux = len(aux_times[utils.times_in_segments(aux_times, [seg + vector.time])])

        if naux:  # at least one loud thing close by
            contained.append(vector)
            num_aux += naux
            num_vetod_gch += vector.label
            num_vetod_cln += 1 - vector.label

    # convert datatable to dataset
    if contained:
        dset = features.Dataset.from_datatable(
            features.DataTable(rows=contained, names=vector.colnames),
            start=dataset.start,
            end=dataset.end,
            segs=dataset.segs,
        )
    else:
        dset = features.Dataset(start=dataset.start, end=dataset.end, segs=dataset.segs)

    # compute ranking metrics
    # we assume num_glitch, num_clean > 0 or else this function would not be called
    eff = num_vetod_gch / num_glitch
    if FAPtype == "DOVL":
        fap = (num_vetod_cln + num_vetod_gch) / (num_clean + num_glitch)
    elif FAPtype == "DOVLfap":
        fap = num_vetod_cln / num_clean
    else:
        raise ValueError("FAPtype=%s not understood" % FAPtype)

    num_vetod_exp = num_glitch * fap
    poisson_signif = -_gammpln(num_vetod_gch, num_vetod_exp)
    # nothing vetod, so we give it a low ranking
    use_percentage = min(num_vetod_gch / num_aux, 1) if num_aux > 0 else 0.0
    # handle this carefully so we don't throw things out that have fap=0 but eff>0
    eff_fap = eff / fap if fap > 0 else np.infty if eff > 0 else 0.0

    # return
    statistics = (
        num_vetod_exp,
        num_vetod_gch,
        num_aux,
        num_vetod_cln,
        poisson_signif,
        use_percentage,
        eff,
        fap,
        eff_fap,
    )
    return dset, statistics


# --- OVL classes themselves


[docs]class DOVL(SupervisedClassifier):
    """
    Discrete OVL: a modified version of OVL that trains based on discrete
    samples to estimate the deadtime. We note that there is an extension of this
    that estimates the False Alarm Probability instead of the deadtime
    (DOVLfap), but still uses discrete samples. The actual implementation of the
    OVL algorithm itself is stored in a subclass (:class:idq.classifiers.OVL)
    because DOVL has standard training signatures while OVL does not

    *WRITE ME* describe what we "overwrite" from OVL (although we don't really
    have convenient access to these...) train (a trivial delegation, and we
    overwrite it simply because we want the signature to have a clearer variable
    name) _recalculate
    """

    _flavor = "dovl"
    _required_kwargs = [
        "incremental",
        "minima",
        "num_recalculate",
        "metric",
        "safe_channels_path",
        "time",
        "significance",
    ]

    _allowed_metrics = "eff_fap use_percentage poisson_signif combined".split()

    _default_incremental = 1000
    _default_minima = {
        "poisson_signif": 10,
        "eff_fap": 3,
    }

    def __init__(self, *args, **kwargs):
        SupervisedClassifier.__init__(self, *args, **kwargs)
        assert (
            self.kwargs["metric"] in self._allowed_metrics
        ), "metric=%s not allowed" % (self.kwargs["metric"])
        assert isinstance(self.kwargs["minima"], dict), "minima must be a dictionary"
        for k in self.kwargs["minima"].keys():
            assert k in self._allowed_metrics, "minima's key=%s not allowed" % (k)

        # record the column names for "time" and "significance"
        self._time = self.kwargs.pop("time")
        self._significance = self.kwargs.pop("significance")

    @property
    def time(self):
        return self._time

    @property
    def significance(self):
        return self._significance

[docs]    def evaluate(self, dataset):
        """
        sets the ranks for these feature vectors
        modifies the objects in place!
        """
        if not self.is_trained:
            raise exceptions.UntrainedError(
                "%s does not have an internal model" % self.flavor
            )

        verbose = self.kwargs.get("verbose", False)
        if verbose:
            template = (
                "--- %6d / "
                + ("%6d" % len(dataset))
                + " --- gps = %.6f --- label = %s ---"
            )

        # load/cache triggers
        model = self.model
        if not dataset.is_configured():
            dataset.configure(model.selector)
        dataset.load_data()

        # check required columns are available
        self._check_columns(dataset.features)

        # generate veto configuration
        veto_segs = [
            model._configuration2segments(c, dataset, self._time, self._significance)
            for c in model
        ]

        # evaluate one feature vector at a time
        for ind, feature_vector in enumerate(dataset):
            if verbose:
                print(template % (ind + 1, feature_vector.gps, feature_vector.label))

            # iterate over veto configurations
            for c_idx, configuration in enumerate(model):
                # check if configuration is active
                if utils.time_in_segments(feature_vector.time, veto_segs[c_idx]):
                    feature_vector["rank"] = configuration["rank"]
                    if verbose:
                        print(model._repr_body % tuple(configuration))
                    break

            else:
                # we found nothing active, so set rank to zero
                feature_vector["rank"] = 0
                if verbose:
                    print("not found")

            feature_vector["hash"] = self.model.hash

        return dataset

[docs]    def timeseries(self, info, dataset_factory, dt=DEFAULT_DT, segs=None, set_ok=None):
        """
        delegates to Vetolist.timeseries
        returns ranks
        """
        if not self.is_trained:
            raise exceptions.UntrainedError(
                "%s does not have an internal model" % self.flavor
            )
        dataset = dataset_factory.unlabeled(dt=dt, segs=segs)
        ranks = self.model.timeseries(
            dataset, self._time, self._significance, dt=dt, segs=segs, **self.kwargs
        )
        return [
            SeriesInfo.from_ranks(
                info, t[0], dt, rank, self.model, self.calibration_map, set_ok=set_ok
            )
            for rank, t in ranks
        ]

[docs]    def redundancies(self, dataset):
        """
        computes the intersection and overlap of vetosegments for each possible
        configuration. This should contain all information necessary to
        determine which channels are redundant.

        we only return information based on the current model, which may have
        trained itself down to a subset of the total possible configurations

        returns table, headers table is a matrix of the livetime of the
        intersections of veto segments from each configuration pair headers is a
        list of the (channel, threshold, window) tuples, with the same order as
        the columns in table
        """
        self._check_columns(dataset.features)

        if not self.is_trained:  # instantiate the Vetolist if needed
            self.model = Vetolist(
                dataset.start,
                dataset.end,
                segs=dataset.segs,
                model_id=self._model_id,
                channels=io.path2channels(self.kwargs["safe_channels_path"]),
                significance=self.significance,
                **self.kwargs,
            )

        return self.model.redundancies(dataset, self._time, self._significance)

[docs]    def feature_importance(self):
        """
        delegates to Vetolist.feature_importance
        """
        if not self.is_trained:
            raise exceptions.UntrainedError(
                "%s does not have an internal model" % self.flavor
            )
        return self.model.feature_importance()

[docs]    def feature_importance_figure(self, dataset, start, end, t0, **kwargs):
        """delegate to Vetolist.feature_importance_figure with a few extra things"""
        if not self.is_trained:
            raise exceptions.UntrainedError(
                "%s does not have an internal model" % self.flavor
            )
        return self.model.feature_importance_figure(
            dataset,
            start,
            end,
            t0,
            self._time,
            self._significance,
            colormap=self.kwargs.get("colormap", plots.DEFAULT_COLORMAP),
            # default to no colors
            nonerow=self.kwargs.get("nonerow", True),
            # default to use what is set by the colorbar
            nonecolor=self.kwargs.get("nonecolor", None),
            **kwargs,
        )

[docs]    def feature_importance_table(self, dataset=None, **kwargs):
        """delegate to Vetolist.feature_importance_table"""
        return self.model.feature_importance_table(
            self._time,
            self._significance,
            dataset=dataset,
            **kwargs,
        )

    def _check_columns(self, features):
        """
        assert that required_columns are present in features
        """
        for column in [self._time, self._significance]:
            for table in features.values():
                assert column in table.colnames, "column=%s required" % column

[docs]    def train(
        self, dataset
    ):  # NOTE: the input signature here is the same as for other classifiers
        """
        Instantiates a Vetolist and trains using the data within a dataset.

        Algorithmic parameters include:

          * channels
          * thresholds
          * windows
          * num_recalculate
          * incremental
          * minima (key, value pairs)

        and are specified through self.kwargs, set during instantiation.

        NOTE: We do not explicitly call `self._check_columns(datachunk)`
        and instead assume the user has already done this when constructing a dataset.
        """
        # set up how we'll report models
        reporter = HDF5Reporter(self.rootdir, dataset.start, dataset.end)
        vetolistnickname = self.nickname + "Vetolist"

        model = Vetolist(
            dataset.start,
            dataset.end,
            segs=dataset.segs,
            model_id=self._model_id,
            channels=io.path2channels(self.kwargs["safe_channels_path"]),
            significance=self.significance,
            **self.kwargs,
        )
        self.model = model  # set up the internal model

        verbose = self.kwargs.get("verbose", False)

        # cache data so we get all I/O out of the way once
        if verbose:
            print("loading triggers within dataset")

        # configure feature selection and load
        if not dataset.is_configured():
            dataset.configure(model.selector)
        dataset.load_data()

        # evaluate the first round without hierarchical removal
        # that's why we do all the jazz with incremental...

        if verbose:
            print("------------------- INITIAL RECALCULATE -----------------------")
            to = TIME.time()

        incremental = self.kwargs["incremental"]
        self.kwargs["incremental"] = 0
        self._recalculate(dataset, **self.kwargs)  # actually compute stuff
        self.kwargs["incremental"] = incremental

        if verbose:
            print("-------------------- %.6f sec" % (TIME.time() - to))

        # write first iteration to disk
        reporter.report(vetolistnickname + "Initial", model.table)

        # re-order the list and prune under-performing configurations
        for metric, minimum in self.kwargs["minima"].items():
            model.prune(minimum, metric=metric)
        model.reorder(**self.kwargs)

        # iterate for num_recalculate
        template = vetolistnickname + "Iter%d"
        for _ in range(self.kwargs["num_recalculate"]):
            if verbose:
                print(
                    "------------------- RECALCULATE : %d -----------------------" % _
                )
                to = TIME.time()

            self._recalculate(dataset, **self.kwargs)  # calculate stuff

            if verbose:
                print("------------------- %.6f sec" % (TIME.time() - to))

            # write iteration to disk
            reporter.report(template % (_), model.table)

            # re-order and prune
            for metric, minimum in self.kwargs["minima"].items():
                model.prune(minimum, metric=metric)
            model.reorder(**self.kwargs)

        # perform a final iteration and then write first iteration to disk
        if verbose:
            print("------------------- FINAL RECALCULATE -----------------------")
            to = TIME.time()
        self._recalculate(dataset, **self.kwargs)
        if verbose:
            print("------------------- %.6f sec" % (TIME.time() - to))

        # re-order and write to disk
        # NOTE: we do NOT prune at this point
        model.reorder(**self.kwargs)
        reporter.report(vetolistnickname + "Final", model.table)

        # return self.model for syntactic consistency with parent
        return self.model

[docs]    def calibrate(self, dataset, bounded=True, **kwargs):
        return super().calibrate(dataset, bounded=bounded, **kwargs)

    def _recalculate(
        self,
        dataset,
        incremental=None,
        minima=None,
        multiprocessing_sleep=DEFAULT_MULTIPROCESSING_SLEEP,
        multiprocessing_chunksize=DEFAULT_MULTIPROCESSING_CHUNKSIZE,
        **kwargs,
    ):
        """
        an implementation of the recalculate algorithm that uses clean samples
        instead of fractional deadtime Can we simply replace a "deadtime
        estimate" function call here and otherwise use the same logic as OVL
        itself? that may be complicated by

        NOTE: we expect each feature in the dataset to contain : columns =
        ['time', 'significance'] that is checked within train once instead of
        being checked here repeatedly
        """
        # make a copy, sharing references to FeatureVectors do this so we can
        # pop crap from working_dataset without affecting dataset (shared
        # reference)
        working_dataset = dataset.copy()

        # do this because specifying in kwargs turns out to be difficult
        # syntactically
        if incremental is None:
            incremental = self._default_incremental
        if minima is None:
            minima = self._default_minima

        num_glitch, num_clean = dataset.vectors2num()

        cnum_vetod_gch = 0.0
        cnum_vetod_cln = 0.0

        model = self.model
        verbose = kwargs.get("verbose", False)
        if verbose:
            print(model._repr_head)
            _repr_body = model._repr_body + "    %.6f sec"

        cind = 0
        len_model = len(model)
        num_proc = self.kwargs.get("num_proc", DEFAULT_NUM_PROC)
        while (cind < len_model) and (
            (incremental > 0) or (num_proc == 1)
        ):  # process serially
            configuration = model[cind]
            if num_glitch and num_clean:  # there's something to veto!
                if verbose:
                    to = TIME.time()

                # compute statistics
                contained, statistics = _compute_dovl_configuration_performance(
                    working_dataset,
                    cind,
                    model,
                    self._time,
                    self._significance,
                    self.__class__.__name__,
                )
                (
                    num_vetod_exp,
                    num_vetod_gch,
                    num_aux,
                    num_vetod_cln,
                    poisson_signif,
                    use_percentage,
                    eff,
                    fap,
                    eff_fap,
                ) = statistics

                # update configuration in place
                configuration["num_glitch"] = num_glitch
                configuration["num_clean"] = num_clean
                configuration["num_aux"] = num_aux
                configuration["num_vetod_exp"] = num_vetod_exp
                configuration["num_vetod_gch"] = num_vetod_gch
                configuration["num_vetod_cln"] = num_vetod_cln
                configuration["poisson_signif"] = poisson_signif
                configuration["use_percentage"] = use_percentage
                configuration["eff"] = eff
                configuration["fap"] = fap
                configuration["eff_fap"] = eff_fap

                # figure out if we'll keep this configuration moving forward
                if incremental:
                    keep = True
                    for metric, threshold in minima.items():
                        keep *= configuration[metric] >= threshold

                    if keep:
                        incremental -= 1

                        num_clean -= num_vetod_cln
                        cnum_vetod_cln += num_vetod_cln

                        num_glitch -= num_vetod_gch
                        cnum_vetod_gch += num_vetod_gch

                        working_dataset -= (
                            contained  # retain only the vectors that were not flagged
                        )
                        # NOTE: modifies dataset in place!

                # finish updating configuration
                configuration["cnum_vetod_gch"] = cnum_vetod_gch
                configuration["cnum_vetod_cln"] = cnum_vetod_cln

                if verbose:
                    print(_repr_body % (tuple(configuration) + (TIME.time() - to,)))

            else:
                # we're missing data of some kind, so we should just skip the
                # rest of the configurations -> zero their contributions
                for name in model._table.dtype.names:
                    if name not in [
                        "channel",
                        "threshold",
                        "window",
                        "num_glitch",
                        "num_clean",
                        "cnum_vetod_gch",
                        "cnum_vetod_cln",
                    ]:
                        model[cind:][name] = 0
                model[cind:]["num_glitch"] = num_glitch
                model[cind:]["num_clean"] = num_clean
                model[cind:]["cnum_vetod_gch"] = cnum_vetod_gch
                model[cind:]["cnum_vetod_cln"] = cnum_vetod_cln
                break

            cind += 1

        if (cind < len_model) and num_glitch and num_clean:
            # we did not finish the list, so there could be something to do
            if verbose:
                print(
                    "processing remaining configurations in parallel with %d processes"
                    % num_proc
                )
            pool = mp.Pool(
                num_proc,
                initializer=_compute_dovl_configuration_performance_initializer,
                initargs=(
                    working_dataset,
                    model,
                    self._time,
                    self._significance,
                    self.__class__.__name__,
                ),
            )

            if verbose:
                to = TIME.time()

            # set the things that will be the same for all remaning configurations
            model[cind:]["num_glitch"] = num_glitch
            model[cind:]["num_clean"] = num_clean
            model[cind:]["cnum_vetod_gch"] = cnum_vetod_gch
            model[cind:]["cnum_vetod_cln"] = cnum_vetod_cln

            # process the rest of the channel's performances in parallel
            result = pool.map_async(
                _compute_dovl_configuration_performance_wrapper,
                range(cind, len(model)),
                chunksize=multiprocessing_chunksize,
            )
            while not result.ready():
                TIME.sleep(multiprocessing_sleep)
            if not result.successful():
                raise RuntimeError("call to multiprocessing.pool.map_async failed")

            for ind, statistics in enumerate(result.get()):
                configuration = model[cind + ind]
                (
                    num_vetod_exp,
                    num_vetod_gch,
                    num_aux,
                    num_vetod_cln,
                    poisson_signif,
                    use_percentage,
                    eff,
                    fap,
                    eff_fap,
                ) = statistics

                # update configuration in place
                configuration["num_aux"] = num_aux
                configuration["num_vetod_exp"] = num_vetod_exp
                configuration["num_vetod_gch"] = num_vetod_gch
                configuration["num_vetod_cln"] = num_vetod_cln
                configuration["poisson_signif"] = poisson_signif
                configuration["use_percentage"] = use_percentage
                configuration["eff"] = eff
                configuration["fap"] = fap
                configuration["eff_fap"] = eff_fap

                if verbose:
                    print(_repr_body % (tuple(configuration) + (TIME.time() - to,)))


[docs]class DOVLfap(DOVL):
    """
    an extension of DOVL that estimates the FAP in a slightly different way.
    DOVL uses an approximation of the deadtime (comparable to what OVL does)

        fap=(num_vetod_gch+num_vetod_cln)/(num_gch+num_cln)

    DOVLfap uses an approximation close to the FAP

        fap=num_vetod_cln/num_clean

    while the FAP might appear more correct, the deadtime is likely to be more
    numerically stable no infinities and thresholds on poisson_signif may prune
    over-trained configureations

    this is controlled based on a conditional within
    _compute_dovl_configuration_performance
    """

    _flavor = "dovlfap"


# --- classic OVL rather than the discrete equivalent
def _compute_ovl_configuration_performance_initializer(
    local_segs,
    local_gch_times,
    local_model,
    local_dataset,
    local_time,
    local_significance,
):
    """should NOT be used outside of multiprocessing"""
    global segs, gch_times, model, dataset, time, significance
    segs = local_segs
    gch_times = local_gch_times
    model = local_model
    dataset = local_dataset
    time = local_time
    significance = local_significance


def _compute_ovl_configuration_performance_wrapper(cind):
    """should NOT be used outside of multiprocessing"""
    _, _, statistics = _compute_ovl_configuration_performance(
        segs, gch_times, cind, model, dataset, time, significance
    )
    return statistics


def _compute_ovl_configuration_performance(
    segs, gch_times, cind, model, dataset, time, significance
):
    """
    computes this configuration's performance
    a convenient wrapper that is used within _recalculate to avoid repeating code

    assumes both segments and gch_times are not empty (i.e.: there is data to veto)
    """
    configuration = model[cind]
    window = configuration["window"]

    # NOTE: we do not filter by segs because that's all handled within
    # _compute_configuration_performance
    aux_segs = model._configuration2segments(configuration, dataset, time, significance)
    aux_segs &= segs

    num_glitch = len(gch_times)
    livetime = utils.livetime(segs)

    # NOTE: the old implementation of OVL did something like this num_aux =
    # np.sum(utils.times_in_segments(aux_times, segs)) # filter by segments to
    # get the count which could lead to the case where there was non-zero
    # livetime in the aux_segs while num_aux=0 instead, we now define num_aux as
    # the "effective number of triggers required to obtain the same amount of
    # aux livetime this could lead to slightly different performance compared to
    # the old implementation
    num_aux = utils.livetime(aux_segs) / window

    # figure out what is removed by this configuration
    # and which times are contained in the segments
    contained = utils.times_in_segments(gch_times, aux_segs)
    num_vetod_gch = 1.0 * np.sum(contained)
    aux_livetime = utils.livetime(aux_segs)

    # compute ranking metrics
    eff = num_vetod_gch / num_glitch
    fap = aux_livetime / livetime

    num_vetod_exp = num_glitch * fap  # (num_glitch/livetime)*aux_livetime
    poisson_signif = -_gammpln(num_vetod_gch, num_vetod_exp)
    # nothing vetod, so we give it a low ranking
    use_percentage = min(num_vetod_gch / num_aux, 1) if num_aux > 0 else 0.0
    # nothing vetod, so we give it a low ranking
    # note, we cannot have eff>0 with fap==0 within OVL, unlike within DOVL
    # return
    eff_fap = eff / fap if fap > 0 else 0.0
    statistics = (
        num_vetod_exp,
        num_vetod_gch,
        num_aux,
        aux_livetime,
        poisson_signif,
        use_percentage,
        eff,
        fap,
        eff_fap,
    )
    return contained, aux_segs, statistics


[docs]class OVL(DOVL):
    """
    a wrapper for the Ordered Veto List (OVL) algorithm published in Essick et
    al, CQG 30, 15 (2013) (DOI: 10.1088/0264-9381/30/15/155010) This algorithm
    estimates False Alarm Probability based on measures of the deadtime
    associated with segments generated around auxiliary events.

    *WRITE ME* set this up so it takes in a path to an output directory and then
    writes the Vetolist, data objects into that directory with appropriate names
    (extract start, end from data objects)

        describe inheritence for the extra attributes not declared within
        SupervisedClassifier _allowed_metrics _default_incremental
        _default_minima _gammln_cof _gammln_stp and the associated methods
        _recalculate redundancies _check_columns _gcf _gammln _gserln _gammpln

    """

    _flavor = "ovl"

[docs]    def train(self, dataset):
        """
        Instantiates a Vetolist and trains using the data within a dataset.

        Algorithmic parameters include:

          * channels
          * thresholds
          * windows
          * num_recalculate
          * incremental
          * minima (key, value pairs)

        and are specified through self.kwargs, set during instantiation.
        """
        # sanity check training data
        num_glitch, _ = dataset.vectors2num()
        if num_glitch == 0:
            raise ValueError(
                "training OVL-based classifiers not allowed with zero target times. "
                "this may indicate an issue with data discovery or not requesting "
                "enough time for training."
            )

        # set up how we'll report models
        reporter = HDF5Reporter(self.rootdir, dataset.start, dataset.end)
        vetolistnickname = self.nickname + "Vetolist"

        model = Vetolist(
            dataset.start,
            dataset.end,
            segs=dataset.segs,
            model_id=self._model_id,
            channels=io.path2channels(self.kwargs["safe_channels_path"]),
            significance=self.significance,
            **self.kwargs,
        )
        self.model = model  # set up the internal model

        verbose = self.kwargs.get("verbose", False)
        if verbose:
            print("loading triggers within dataset")

        # configure feature selection and load
        if not dataset.is_configured():
            dataset.configure(model.selector)
        dataset.load_data()

        # check required columns are available
        self._check_columns(dataset.features)

        # evaluate the first round without hierarchical removal
        # that's why we do all the jazz with incremental...
        if verbose:
            print("------------------- INITIAL RECALCULATE -----------------------")
            to = TIME.time()

        incremental = self.kwargs["incremental"]
        self.kwargs["incremental"] = 0
        self._recalculate(dataset, **self.kwargs)  # actually compute stuff
        self.kwargs["incremental"] = incremental

        if verbose:
            print("-------------------- %.6f sec" % (TIME.time() - to))

        # write first iteration to disk
        reporter.report(vetolistnickname + "Initial", model.table)

        # re-order the list and prune under-performing configurations
        for metric, minimum in self.kwargs["minima"].items():
            model.prune(minimum, metric=metric)
        model.reorder(**self.kwargs)

        # iterate for num_recalculate
        template = vetolistnickname + "Iter%d"
        for _ in range(self.kwargs["num_recalculate"]):
            if verbose:
                print(
                    "------------------- RECALCULATE : %d -----------------------" % _
                )
                to = TIME.time()

            self._recalculate(dataset, **self.kwargs)  # calculate stuff

            if verbose:
                print("------------------- %.6f sec" % (TIME.time() - to))

            # write iteration to disk
            reporter.report(template % (_), model.table)

            # re-order and prune
            for metric, minimum in self.kwargs["minima"].items():
                model.prune(minimum, metric=metric)
            model.reorder(**self.kwargs)

        # iterate once more and write final iteration to disk
        if verbose:
            print("------------------- FINAL RECALCULATE -----------------------")
            to = TIME.time()
        self._recalculate(dataset, **self.kwargs)  # actually compute stuff
        if verbose:
            print("-------------------- %.6f sec" % (TIME.time() - to))

        # NOTE: we do NOT prune at this point!
        model.reorder(**self.kwargs)
        reporter.report(vetolistnickname + "Final", model.table)

        # return self.model for syntactic consistency with parent
        return self.model

    def _recalculate(
        self,
        dataset,
        incremental=None,
        minima=None,
        memory_light=False,
        multiprocessing_sleep=DEFAULT_MULTIPROCESSING_SLEEP,
        multiprocessing_chunksize=DEFAULT_MULTIPROCESSING_CHUNKSIZE,
        **kwargs,
    ):
        """
        the main learning calculation associated with OVL calculate the
        performance of a set of ordered veto configurations on data does NOT
        re-order or prune the list for you; you must do that by hand assumes
        self.model already points to a Vetolist object

        NOTE: we expect the dataset to contain : columns = [self._time,
        self._significance] that is checked within train once instead of being
        checked here repeatedly
        """
        # do this because specifying in kwargs turns out to be difficult syntactically
        if incremental is None:
            incremental = self._default_incremental
        if minima is None:
            minima = self._default_minima

        gch, _ = dataset.vectors2classes()
        gch_times = np.array(gch.times)  # make a copy for internal consumption
        num_glitch = len(gch_times)
        cnum_vetod_gch = 0

        # list of segments representing remaining time, will be built up
        # throughout the loop for hierarchical application of configurations
        segs = segmentlist(
            dataset.segs
        )  # make a copy so we don't update a shared reference accidentally
        livetime = utils.livetime(segs)
        cveto_livetime = 0

        model = self.model
        verbose = kwargs.get("verbose", False)
        if verbose:
            print(model._repr_head)
            _repr_body = model._repr_body + "    %.6f sec"

        cind = 0
        len_model = len(model)
        num_proc = self.kwargs.get("num_proc", DEFAULT_NUM_PROC)
        while (cind < len_model) and ((incremental > 0) or (num_proc == 1)):
            # process serially, iterate over vetolist
            configuration = model[cind]
            if livetime and num_glitch:
                if verbose:
                    to = TIME.time()

                # compute statistics
                (
                    contained,
                    aux_segs,
                    statistics,
                ) = _compute_ovl_configuration_performance(
                    segs,
                    gch_times,
                    cind,
                    model,
                    dataset,
                    self._time,
                    self._significance,
                )
                (
                    num_vetod_exp,
                    num_vetod_gch,
                    num_aux,
                    aux_livetime,
                    poisson_signif,
                    use_percentage,
                    eff,
                    fap,
                    eff_fap,
                ) = statistics

                # update configuration in place
                configuration["num_glitch"] = num_glitch
                configuration["num_clean"] = livetime
                configuration["num_aux"] = num_aux
                configuration["num_vetod_exp"] = num_vetod_exp
                configuration["num_vetod_gch"] = num_vetod_gch
                configuration["num_vetod_cln"] = aux_livetime
                configuration["poisson_signif"] = poisson_signif
                configuration["use_percentage"] = use_percentage
                configuration["eff"] = eff
                configuration["fap"] = fap
                configuration["eff_fap"] = eff_fap

                # figure out if we'll keep this configuration moving forward
                if incremental:
                    keep = True
                    for metric, threshold in minima.items():
                        keep *= configuration[metric] >= threshold

                    if keep:
                        incremental -= 1

                        livetime -= aux_livetime
                        cveto_livetime += aux_livetime

                        num_glitch -= num_vetod_gch
                        cnum_vetod_gch += num_vetod_gch

                        segs -= aux_segs  # remove the vetod segments
                        gch_times = gch_times[
                            np.logical_not(contained)
                        ]  # retain only the triggers that were not flagged

                # finish updating configuration
                configuration["cnum_vetod_gch"] = cnum_vetod_gch
                configuration["cnum_vetod_cln"] = cveto_livetime

                # pop this channel to free up memory
                if memory_light:
                    dataset.features.pop(
                        configuration["channel"]
                    )  # garbage collector should gobble up the returned value

                if verbose:
                    print(_repr_body % (tuple(configuration) + (TIME.time() - to,)))

            else:
                # we're missing data of some kind, so we should just skip the
                # rest of the configurations -> zero their contributions
                for name in model._table.dtype.names:
                    if name not in [
                        "channel",
                        "threshold",
                        "window",
                        "num_glitch",
                        "num_clean",
                        "cnum_vetod_gch",
                        "cnum_vetod_cln",
                    ]:
                        model[cind:][name] = 0
                model[cind:]["num_glitch"] = num_glitch
                model[cind:]["num_clean"] = livetime
                model[cind:]["cnum_vetod_gch"] = cnum_vetod_gch
                model[cind:]["cnum_vetod_cln"] = cveto_livetime
                break

            cind += 1  # increment the counter

        if (cind < len_model) and num_glitch and livetime:
            # we did not finish the list before incremental ran out and we want
            # to run on more than one core
            if verbose:
                print(
                    "processing remaining configurations in parallel with %d processes"
                    % num_proc
                )
            pool = mp.Pool(
                num_proc,
                initializer=_compute_ovl_configuration_performance_initializer,
                initargs=(
                    segs,
                    gch_times,
                    model,
                    dataset,
                    self._time,
                    self._significance,
                ),
            )

            if verbose:
                to = TIME.time()

            # set the things that will be the same for all remaning configurations
            model[cind:]["num_glitch"] = num_glitch
            model[cind:]["num_clean"] = livetime
            model[cind:]["cnum_vetod_gch"] = cnum_vetod_gch
            model[cind:]["cnum_vetod_cln"] = cveto_livetime

            # process the rest of the channel's performances in parallel
            result = pool.map_async(
                _compute_ovl_configuration_performance_wrapper, range(cind, len(model))
            )
            while not result.ready():
                TIME.sleep(multiprocessing_sleep)
            if not result.successful():
                raise RuntimeError("call to multiprocessing.pool.map_async failed")

            for ind, statistics in enumerate(result.get()):
                configuration = model[cind + ind]
                (
                    num_vetod_exp,
                    num_vetod_gch,
                    num_aux,
                    aux_livetime,
                    poisson_signif,
                    use_percentage,
                    eff,
                    fap,
                    eff_fap,
                ) = statistics

                # update configuration in place
                configuration["num_aux"] = num_aux
                configuration["num_vetod_exp"] = num_vetod_exp
                configuration["num_vetod_gch"] = num_vetod_gch
                configuration["num_vetod_cln"] = aux_livetime
                configuration["poisson_signif"] = poisson_signif
                configuration["use_percentage"] = use_percentage
                configuration["eff"] = eff
                configuration["fap"] = fap
                configuration["eff_fap"] = eff_fap

                # pop this channel to free up memory
                if memory_light:
                    dataset.features.pop(
                        configuration["channel"]
                    )  # garbage collector should gobble up the returned value

                if verbose:
                    print(_repr_body % (tuple(configuration) + (TIME.time() - to,)))


@hookimpl
def get_classifiers():
    return {
        "ovl": OVL,
        "dovl": DOVL,
        "dovl-fap": DOVLfap,
    }