Source code for idq.classifiers.ovl

import multiprocessing as mp
import time as TIME

import numpy as np
import h5py

from ligo.segments import segment, segmentlist
from gwpy.segments import SegmentList, DataQualityFlag, DataQualityDict

from .. import exceptions
from .. import features
from .. import hookimpl
from .. import plots
from .. import utils
from .. import io
from ..io.reporters.hdf5 import HDF5Reporter
from ..series import SeriesInfo
from . import SupervisedClassifier, ClassifierModel, DEFAULT_DT


DEFAULT_NUM_PROC = 1
# how long we sleep between checking that mp.map_async results have finished
DEFAULT_MULTIPROCESSING_SLEEP = 0.001
DEFAULT_MULTIPROCESSING_CHUNKSIZE = None


[docs]class Vetolist(ClassifierModel): """ a representation of the OVL and DOVL vetolist this is stored as the internal model within OVL and DOVL *WRITE ME* decribe protections of attributes and their meanings _default_thresholds _default_windows _table_dtypes _repr_head _repr_body """ _default_thresholds = [15, 20, 25, 30, 35, 45, 55, 100, 500, 1000] _default_windows = [0.500, 0.100, 0.050, 0.025, 0.010, 0.005, 0.001] _table_dtypes = [ ("channel", "|S75"), ("threshold", "float"), ("window", "float"), ("num_glitch", "float"), ("num_clean", "float"), # like livetime ("num_aux", "float"), ("num_vetod_exp", "float"), ("num_vetod_gch", "float"), ("num_vetod_cln", "float"), # like dsec ("poisson_signif", "float"), ("use_percentage", "float"), ("eff", "float"), ("fap", "float"), ("eff_fap", "float"), ("cnum_vetod_gch", "float"), ("cnum_vetod_cln", "float"), ("rank", "float"), ] _repr_body = " ".join( "%-40s" if "|S" in dtype[1] else "%15.9f" for dtype in _table_dtypes ) # NOTE: this may be fragile... _repr_head = ( " ".join( "%-40s" if dtype[0] == "channel" else "%15s" for dtype in _table_dtypes ) ) % tuple( dtype[0] for dtype in _table_dtypes ) # NOTE: this may be fragile... def __init__( self, start, end, segs=None, model_id=None, generate_id=False, channels=None, thresholds=None, windows=None, significance=None, **kwargs, ): ClassifierModel.__init__( self, start, end, segs=segs, model_id=model_id, generate_id=generate_id ) if not channels: channels = [] if not thresholds: thresholds = self._default_thresholds if not windows: windows = self._default_windows self._channels = channels self._thresholds = thresholds self._windows = windows self.significance = significance # set up table self.init_table() # select how features are extracted self.selector = features.Selector( self.relevant_channels, bounds={ self.significance: (self.minimum_threshold, np.inf), **kwargs.get("bounds", {}), }, ) @property def channels(self): return self._channels @channels.setter def channels(self, channels): for chan in channels: assert isinstance(chan, str), "channels must be strings" self._channels = channels @property def relevant_channels(self): return set(channel.decode("utf-8") for channel in self._table["channel"]) @property def thresholds(self): return self._thresholds @thresholds.setter def thresholds(self, thresholds): for thr in thresholds: assert isinstance( thr, (int, float) ), "thresholds must be integers or floats" self._thresholds = thresholds @property def relevant_thresholds(self): return set(self._table["threshold"]) @property def minimum_threshold(self): return np.min(self["threshold"]) if len(self["threshold"]) else np.infty @property def windows(self): return self._windows @windows.setter def windows(self, windows): for win in windows: assert isinstance(win, (int, float)), "windows must be integers or floats" self._windows = windows @property def relevant_windows(self): return set(self._table["window"]) def __iter__(self): """ returns a generator that will iterate over the internal table """ for line in self._table: yield line def __getitem__(self, index): return self._table[index] def __len__(self): return len(self._table) def __repr__(self): return str(self._table) def __str__(self): """ makes a nice ASCII formatted table instead of just printing the structured array "as is", which is what __repr__ does """ s = [self._repr_head] for configuration in self: s.append(self._repr_body % tuple(configuration)) return "\n".join(s)
[docs] def dump(self, path, nickname=None, **kwargs): """ write this object into path should contain all info, not just the configuration order Uses hdf5 format, and asserts that the path must end in ".hdf5" we can add attributes to this file via the kwargs specified in this function's signature WARNING: we may want to add default attrs to the hdf5 file, like the code's version and the git repo location, etc. we also may want to add information about how this file was produced, but that may be optional and can be specified via kwargs NOTE: we'll likely use an instance of HDF5Reporter rather than this method """ assert path.endswith(".hdf5"), "path=%s must end with .hdf5" % (path) if nickname is None: nickname = "vetolist" with h5py.File(path, "w") as file_obj: dataset = file_obj.create_dataset(nickname, data=self._table) dataset.attrs.update(kwargs)
[docs] def load(self, path, nickname=None, verbose=True): """ load the data from file into memory will load all info, not just hte configuration order Uses hdf5 format, and asserts that the path must end in ".hdf5" if verbose: print attrs from dataset NOTE: we'll likely use an instance of HDF5Reporter rather than this method """ assert path.endswith(".hdf5"), "path=%s must end with .hdf5" % (path) with h5py.File(path, "r") as file_obj: if nickname is None: keys = list(file_obj.keys()) assert ( len(keys) == 1 ), "exactly 1 key required in the HDF5 file unless a nickname is chosen" # expect a single key and just use that this makes it a bit more # independent of who wrote this to disk dataset = file_obj[keys[0]] else: dataset = file_obj[nickname] # extract the numpy array from the data set self._table = dataset[...] # update which channels are present self.channels = set( channel.decode("utf-8") for channel in self.table["channel"] ) self.windows = set(self.table["window"]) self.thresholds = set(self.table["threshold"]) if verbose: for k, v in dataset.attrs.items(): print("%s\t%s" % (k, v))
[docs] def init_table(self): """ instantiate the table used to store data internally """ self._table = [] for channel in self.channels: for threshold in self.thresholds: for window in self.windows: self._table.append( (channel, threshold, window) + (np.nan,) * (len(self._table_dtypes) - 3) ) # NOTE: we don't delegate to set_table because we don't need to check # the table for consistency... self._table = np.array(self._table, dtype=self._table_dtypes)
@property def table(self): return self._table @table.setter def table(self, table): """ WARNING: do not use this unless you really know what you're doing! """ assert isinstance(table, np.ndarray), "table is not a numpy.ndarray!" assert len(table.dtype) == len( self._table_dtypes ), "new table has the wrong number of columns!" for a, b in zip(table.dtype.names, self._table_dtypes): assert a == b[0], "ordering of new table is inconsistent with Vetolist!" assert table.dtype[a] == b[1], ( "types of new table are inconsistent with Vetolist! " "%s (new) vs %s (old)" % (table.dtype[a], b[1]) ) channels = self.channels for channel in set(table["channel"]): assert ( channel.decode("utf-8") in channels ), "new table has inconsistent channels with this Vetolist!" thresholds = self.thresholds for threshold in set(table["threshold"]): assert ( threshold in thresholds ), "new table has inconsistent thresholds with this Vetolist!" windows = self.windows for window in set(table["window"]): assert ( window in windows ), "new table has inconsistent windows with this Vetolist!" self._table = table
[docs] def metric2rank(self, value, metric="eff_fap", **kwargs): """ map our metric value into a rank we define a somewhat ad-hoc monotonic function with a separate scale for each metric """ if metric == "use_percentage": return self._use_percentage_to_rank(value) elif metric == "poisson_signif": return self._poisson_signif_to_rank(value) elif metric == "eff_fap": return self._eff_fap_to_rank(value) else: raise ValueError("metric=%s not understood" % metric)
@staticmethod def _use_percentage_to_rank(val): """ map use_percentage metric to a rank between 0 and 1. """ return val @staticmethod def _poisson_signif_to_rank(val): """ map poisson_signif metric to a rank between 0 and 1. """ score = 0.009 * np.power(val, 1.5) return score / (1 + score) @staticmethod def _eff_fap_to_rank(val): """ map eff_fap metric to a rank between 0 and 1. """ score = 0.02 * val return score / (1 + score)
[docs] def reorder(self, metric="eff_fap", metric_weights=None, **kwargs): """ re-order the vetolist according to this metric we always sort so that bigger numbers show up first """ # assign ranks if metric == "combined": metric_names = ("eff_fap", "use_percentage", "poisson_signif") if metric_weights: weights = [metric_weights[metric] for metric in metric_names] else: # use equal weights weights = [1 / 3.0, 1 / 3.0, 1 / 3.0] metrics = np.array( [self.metric2rank(self.table[m], metric=m) for m in metric_names] ) self.table["rank"] = np.average(metrics, axis=0, weights=weights) else: self.table["rank"] = self.metric2rank(self.table[metric], metric=metric) # reorder - sort by metric, then by threshold, then by window size # NOTE: the old implementation of OVL did not re-order based on windows; # it only used the thresholds and then the metric. this can produce # slightly different orderings, but they probably will be statistically # equivalent regarding performance order = [ (ind, config["window"], config["threshold"], config["rank"]) for ind, config in enumerate(self) ] order.sort(key=lambda x: (x[3], x[2], -x[1]), reverse=True) self.table = self.table[[veto[0] for veto in order]]
[docs] def prune(self, minimum, metric="eff_fap", **kwargs): """ remove all configurations with metric<minimum WARNING: * this will cause data to be forgotten in an unrecoverable way! * only use this if you are certain you know what you're doing! """ self.table = self.table[self.table[metric] >= minimum]
[docs] def redundancies(self, dataset, time, significance, verbose=False, Verbose=False): """ computes the intersection and overlap of vetosegments for each possible configuration. This should contain all information necessary to determine which channels are redundant. we only return information based on the current model, which may have trained itself down to a subset of the total possible configurations returns table, headers table is a matrix of the livetime of the intersections of veto segments from each configuration pair headers is a list of the (channel, threshold, window) tuples, with the same order as the columns in table """ verbose |= Verbose # set up the data product N = len(self) table = np.empty((N, N), dtype=float) headers = self[["channel", "threshold", "window"]] if verbose: print("caching triggers within dataset") if not dataset.is_configured(): dataset.configure(self.selector) dataset.load_data() # iterate over configurations if verbose: print("computing aux segments") auxsegs = [] for config in self: # compute segments once for each configuration to avoid repeating # work in the nested loop segs = self._configuration2segments(config, dataset, time, significance) auxsegs.append(dataset.segs & segs) if verbose: print("computing overlap between veto configurations") tmp = "%d - %d" for i, config1 in enumerate(self): if Verbose: print(tmp % (i, i)) segs1 = auxsegs[i] # livetime of segments intersected with themselves table[i, i] = utils.livetime(segs1) # iterate over the rest of the configurations and fill in the # intersections for j, config2 in enumerate(self[i + 1 :], 1): if Verbose: print(tmp % (i, j)) segs2 = auxsegs[i + j] table[i, i + j] = table[i + j, i] = utils.livetime(segs1 & segs2) # return data return table, headers
[docs] def nonredundant_segments(self, dataset, time, significance): """ Checks which configurations are redundant and then returns the indices of the non-redundant configurations, and their segments """ intersect_table, _ = self.redundancies(dataset, time, significance) segments = self.segments(dataset, time, significance) keep_inds = [] for i, veto_segs in enumerate(segments): # live time is element (i,i) of intersect_table # check if veto was active if intersect_table[i, i] == 0: continue # check if redundant with any previous veto # redundant if same channel, and segments are a subset of other vetoes keep = True for j, other_segs in enumerate(segments[:i]): # check if channels are the same if self.table[i]["channel"] != self.table[j]["channel"]: continue # veto i is redundant with veto j if livetime # of their intersection equals i's livetime if intersect_table[i, i] == intersect_table[i, j]: keep = False break if keep: keep_inds.append(i) return np.array(keep_inds), [segments[i] for i in keep_inds]
[docs] def save_as_data_quality_flags( self, out_path, dataset, time, significance, remove_redundant=True ): """ Saves the veto list as a set of data quality flags in an hdf5 file """ if remove_redundant: inds, configuration_segments = self.nonredundant_segments( dataset, time, significance ) if len(inds) > 0: configurations = self[inds] else: configurations = np.array([]) else: configurations = self[:] configuration_segments = self.segments(dataset, time, significance) veto_dict = DataQualityDict() lookup_dict = {} pairs = zip(configurations, configuration_segments) for i, (configuration, aux_segs) in enumerate(pairs): channel = configuration["channel"].decode("utf-8") threshold = configuration["threshold"] window = configuration["window"] name = f"{channel}_{threshold}_{window}:1" active = SegmentList(aux_segs) known = dataset.segs dq_flag = DataQualityFlag( name=name, active=active, known=known, isgood=False ) veto_dict[name] = dq_flag lookup_dict[name] = configuration veto_dict.write(out_path) with h5py.File(out_path, "r+") as f: for name, grp in f.items(): configuration = lookup_dict[name] grp.attrs["channel"] = configuration["channel"].decode("utf-8") grp.attrs["threshold"] = configuration["threshold"] grp.attrs["window"] = configuration["window"] grp.attrs["rank"] = configuration["rank"] grp.attrs["eff"] = configuration["eff"] grp.attrs["fap"] = configuration["fap"] grp.attrs["use_percentage"] = configuration["use_percentage"] grp.attrs["poisson_signif"] = configuration["poisson_signif"]
[docs] def feature_importance(self): """ should essentially be an ordered list of veto configurations although this is really a structured numpy array """ return self.table[["channel"], ["threshold"], ["window"]]
[docs] def feature_importance_figure( self, dataset, start, end, t0, time, significance, colormap=plots.DEFAULT_COLORMAP, nonerow=True, nonecolor=None, verbose=False, **kwargs, ): """generate and return a figure demonstrating the feature importance based on the data within dataset; should return a figure object.""" # go dig up data first because we dynamically size the figure based on # the number of rows grab what we need from the dataset factory if verbose: print("retrieving triggers") if not dataset.is_configured(): dataset.configure(self.selector) dataset.load_data() if verbose: print("setting up plotting options") cmap = plots.plt.get_cmap(colormap) dylim = 0.10 low = 0.01 # used for plotting height = 1 - 2 * low # yticklabel template tmp = ( "%50s\n" + r"$\mathrm{" + significance + r"}\geq%.1f$, " + r"$|\Delta\mathrm{" + time + r"}|\leq%.3f s$, $\mathrm{rank}=%.3f$" ) # map linewidth to time span figwidth = 10 seg_linewidth = 0.25 linewidth_t = seg_linewidth / ((figwidth * 72) / (end - start)) rows = [] ticklabels = [] # used for labeling ticks = [] count = 0 segs = segmentlist([segment(start, end)]) none_segments = segmentlist( [segment(start, end)] ) # used to keep track of times when nothing was active if nonecolor is None: nonecolor = cmap(0) else: nonecolor = nonecolor if verbose: print("iterating over configurations") configuration_inds, configuration_segs = self.nonredundant_segments( dataset, time, significance ) if len(configuration_inds) > 0: configurations = self[configuration_inds] else: configurations = np.array([]) reverse_order = zip(configurations[::-1], configuration_segs[::-1]) for configuration, aux_segs in reverse_order: # put most important things at the top of the figure if verbose: print(" %s" % configuration) # go find the segs for this channel aux_segs = segs & aux_segs if utils.livetime(aux_segs): # there is some time when this was active # expand segments to a minimum size t depending on linewidth aux_segs = utils.expand_segments(aux_segs, linewidth_t) if nonerow: # there is a "None" row, so remove these from the none_segments none_segments -= aux_segs else: # no "None" row, so plot a background bar rows.append( ( ( [start - t0, end - t0], [low + count] * 2, [low + count + height] * 2, ), dict(color=nonecolor, edgecolor="w", linewidth=1.00), ) ) # plot the active segments color = cmap(configuration["rank"]) for seg in aux_segs: rows.append( ( ( np.array(seg) - t0, [low + count] * 2, [low + count + height] * 2, ), dict(color=color, edgecolor=color, linewidth=seg_linewidth), ) ) # NOTE: we play with edgecolors and linewidths here to make # sure all segments are visible (linewidth is effectively a # minimum width) ticklabels.append( tmp % ( configuration["channel"].decode("utf-8"), configuration["threshold"], configuration["window"], configuration["rank"], ) ) ticks.append(count + 0.5) count += 1 if nonerow or (count == 0): # there is a None row or there are no active configs ticklabels.insert(0, "No Veto") ticks.insert(0, -0.5) # plot the none segments for seg in none_segments: rows.append( ( (np.array(seg) - t0, [low - 1] * 2, [low - 1 + height] * 2), dict(color=nonecolor, edgecolor="w", linewidth=0.25), ) ) # NOTE: we set the edgecolor to white so that there is no # visible overlap between "none" and other configs ylim = (-1 - dylim, count + dylim) count += 1 else: ylim = (-dylim, count + dylim) # now create the figure with dynamic sizing based on the number of rows rowheight = 0.3 titleheight = 0.8 labelheight = 0.6 figheight = count * rowheight + titleheight + labelheight bottom = labelheight / figheight top = 1 - titleheight / figheight # should be enough to accomodate roughly our longest channel name... left = 0.4 right = 0.9 # NOTE: we always generate a new figure here because it doesn't make # sense to overlay these things for OVL... if verbose: print("generating figure") fig = plots.plt.figure(figsize=(figwidth, figheight)) # place the main axes # NOTE: this could be fragile if someone mucks with the default settings ax = fig.add_axes([left, bottom, right - left, top - bottom]) # place the colorbar pos = ax.get_position() d = 1 - pos.x1 cbax = fig.add_axes([pos.x1 + 0.1 * d, pos.y0, 0.3 * d, pos.y1 - pos.y0]) # add color bar if verbose: print(" plotting colorbar") cb = plots.matplotlib.colorbar.ColorbarBase( cbax, cmap=cmap, norm=plots.matplotlib.colors.Normalize(vmin=0.0, vmax=1.0), orientation="vertical", ) if count <= 2: # avoid overlapping rank labels cb.set_ticks([0, 0.5, 1.0]) cbax.set_ylabel(plots.RANK_LABEL) # label if verbose: print(" plotting rows") for args, kwargs in rows: ax.fill_between(*args, **kwargs) # add header/footer showing model provenance if verbose: print(" plotting header/footer") ymin, ymax = ylim color = "k" SEG = segmentlist([segment(start, end)]) # shade the spots which were not included in training for seg in SEG - (SEG & self.segs): seg = np.array(seg) - t0 ax.fill_between( seg, [ymin] * 2, [ymin + dylim] * 2, color=color, edgecolor="w", alpha=plots.DEFAULT_SHADE_ALPHA, ) ax.fill_between( seg, [ymax - dylim] * 2, [ymax] * 2, color=color, edgecolor="w", alpha=plots.DEFAULT_SHADE_ALPHA, ) # add darker shading to show where model stopped for seg in SEG - (SEG & segmentlist([segment(self.start, self.end)])): seg = np.array(seg) - t0 ax.fill_between( seg, [ymin] * 2, [ymin + dylim] * 2, color=color, alpha=plots.DEFAULT_ALPHA, ) ax.fill_between( seg, [ymax - dylim] * 2, [ymax] * 2, color=color, alpha=plots.DEFAULT_ALPHA, ) # decorate if verbose: print(" decorating") ax.set_ylim(ylim) ax.set_yticks(ticks) ax.set_yticklabels(ticklabels, fontdict={"fontsize": 7.0}) ax.set_xlim(xmin=start - t0, xmax=end - t0) ax.set_xlabel("Time [sec] relative to %.3f" % (t0)) return fig
[docs] def feature_importance_table( self, time, significance, dataset=None, map2str=True, **kwargs ): """should return (columns, data) compatible with the DQR's json.format_table (see use in idq/reports.py""" cols = [ "channel", "threshold", "window", "poisson_signif", "use_percentage", "eff_fap", "rank", "cnum_vetod_gch", "cnum_vetod_cln", ] # needed because numpy might break without this... data = self.table[cols].copy() ngch = self.table["num_glitch"] if len(ngch): # the first element will be non-zero data["cnum_vetod_gch"] /= ngch[0] cols[-2] = "efficiency" ncln = self.table["num_clean"] if len(ncln): # the first element will be non-zero data["cnum_vetod_cln"] /= ncln[0] cols[-1] = "false alarm probability" if dataset: if not dataset.is_configured(): dataset.configure(self.selector) dataset.load_data() configuration_inds, configuration_segs = self.nonredundant_segments( dataset, time, significance ) if len(configuration_inds) > 0: data = data[configuration_inds] else: data = np.array([], dtype=self._table_dtypes) data = [list(_) for _ in data] cols = cols[:-2] if map2str: data = [[_[0]] + ["%.3f" % __ for __ in _[1:-2]] for _ in data] return cols, data
[docs] def segments(self, dataset, time, significance, **kwargs): """ generate veto segments for all configurations in the vetolist returns a list of segment lists in the order corresponding to the configurations within self.table """ return [ self._configuration2segments(configuration, dataset, time, significance) for configuration in self ]
def _configuration2times( self, configuration, dataset, time, significance, **kwargs ): """ get the times above threshold for this configuration """ channel = configuration["channel"].decode("utf-8") # extract parameters threshold = configuration["threshold"] try: trgs = dataset.features[channel].filter( f"{significance} >= {threshold}", f"{significance} <= {np.infty}" ) # get times above threshold, we already filtered by threshold within # call to triggers return trgs[time] except KeyError: # no triggers return None def _configuration2segments( self, configuration, dataset, time, significance, **kwargs ): """ generate veto segments for this configuration """ window = configuration["window"] aux_times = self._configuration2times( configuration, dataset, time, significance ) if aux_times is None: return segmentlist() else: return utils.times2segments(aux_times, window / 2.0)
[docs] def timeseries( self, dataset, time, significance, metric="eff_fap", dt=DEFAULT_DT, segs=None, **kwargs, ): """ generate timeseries for each segment in dataset.segments separately should be useful wihtin OVL.evaluate, vetolist2segments, etc iterates from the end of the table to the front, setting the values of the timeseries to the metric of the corresponding configuration at each step if more than one configuration is active at this time, the final timeseries will have the value corresponding to the configuration that's closer to the front of the list -> this makes sense if you assume the configurations are ordered by decending metric, which should be the case we extract the 'rank' column, which is set via calls to self.reorder. NOTE: * monotonic decrease in metric as you move down the table is not strictly guaranteed and will depend on whether the list has converged. * it is also possible that the list will never converge because of cycles (2 configurations flip back and forth forever with additional iterations) This may be a weakness of the algorithm! """ if segs is None: segs = dataset.segs if not dataset.is_configured(): dataset.configure(self.selector) dataset.load_data() ranks = [ (np.zeros_like(t), t) for t in [np.arange(s, e, dt) for s, e in segs] ] # set up zero'd arrays for the ranks for configuration in self[::-1]: # work from the bottom to the top aux_segs = self._configuration2segments( configuration, dataset, time, significance ) for rank, t in ranks: rank[utils.times_in_segments(t, aux_segs)] = configuration["rank"] return ranks
# --- OVL configuration performance helper functions # used when estimating the poisson significance _gammln_cof = [ 76.18009173, -86.50532033, 24.01409822, -1.231739516e0, 0.120858003e-2, -0.536382e-5, ] _gammln_stp = 2.50662827465 def _gammpln(a, x): """ lower incomplete gamma function """ if a == 0: return 0.0 # np.log(1.) elif x < 0.0: raise ValueError("x<0") elif a < 0.0: raise ValueError("a<0") elif x == 0.0: if a == 0.0: return 0.0 # np.log(1.) else: return -np.infty # np.log(0.) elif x < a + 1.0: return _gserln(a, x)[0] else: return np.log(1.0 - _gcf(a, x)[0]) def _gserln(a, x, itmax=10000, eps=3.0e-7): """ Series approx'n to the incomplete gamma function. """ gln = _gammln(a) if x < 0.0: raise ValueError("x<0") elif x == 0.0: return 0.0, gln logx = np.log(x) ap = a s = 1.0 / a delta = s n = 1 while n <= itmax: ap += 1.0 delta *= x / ap s += delta if abs(delta) < abs(s) * eps: return np.log(s) - x + a * logx - gln, gln n += 1 raise exceptions.MaxIter(itmax) def _gammln(xx): """ Logarithm of the gamma function. """ x = xx - 1.0 tmp = x + 5.5 tmp = (x + 0.5) * np.log(tmp) - tmp ser = 1.0 for j in range(6): x += 1.0 ser += _gammln_cof[j] / x return tmp + np.log(_gammln_stp * ser) def _gcf(a, x, itmax=1000, eps=3.0e-7): """ Continued fraction approx'n of the incomplete gamma function. """ logx = np.log(x) gln = _gammln(a) gold = 0.0 a0 = 1.0 a1 = x b0 = 0.0 b1 = 1.0 fac = 1.0 n = 1 while n <= itmax: an = n ana = an - a a0 = (a1 + a0 * ana) * fac b0 = (b1 + b0 * ana) * fac anf = an * fac a1 = x * a0 + anf * a1 b1 = x * b0 + anf * b1 if a1 != 0.0: fac = 1.0 / a1 g = b1 * fac if abs((g - gold) / g) < eps: return (g * np.exp(-x + a * logx - gln), gln) gold = g n += 1 raise exceptions.MaxIter(itmax) # defined here so that we can pickle this via multiprocessing def _compute_dovl_configuration_performance_initializer( local_dataset, local_model, local_time, local_significance, local_FAPtype ): global dataset, model, time, significance, FAPtype dataset = local_dataset model = local_model time = local_time significance = local_significance FAPtype = local_FAPtype def _compute_dovl_configuration_performance_wrapper(cind): _, statistics = _compute_dovl_configuration_performance( dataset, cind, model, time, significance, FAPtype ) return statistics def _compute_dovl_configuration_performance( dataset, ind, model, time, significance, FAPtype ): """ computes this configuration's performance a convenient wrapper that is used within _recalculate and to avoid repeating code assumes both segments and gch_times are not empty (i.e.: there is data to veto) """ num_glitch, num_clean = dataset.vectors2num() # figure out what is vetoed by this configuration num_vetod_gch = 0.0 num_vetod_cln = 0.0 num_aux = 0.0 contained = [] configuration = model[ind] seg = ( np.array([-0.5, 0.5]) * configuration["window"] ) # will be used repeatedly within the loop for vector in dataset: # we reference the classifier data stored in each FeatureVector, which # is a bit of a cheat but should work... NOTE: if the dataset spans a # much larger amount of time than [vector.time - window/2, vector.time + # window/2] then the following is a substantial speed up compared to # generating a full list of segments repeatedly for each vector the # query to dataset may still be expensive (lots of I/O), so there's room # for improvement here aux_times = model._configuration2times( configuration, dataset, time, significance, ) if aux_times is None: continue # count how many loud things remain nearby naux = len(aux_times[utils.times_in_segments(aux_times, [seg + vector.time])]) if naux: # at least one loud thing close by contained.append(vector) num_aux += naux num_vetod_gch += vector.label num_vetod_cln += 1 - vector.label # convert datatable to dataset if contained: dset = features.Dataset.from_datatable( features.DataTable(rows=contained, names=vector.colnames), start=dataset.start, end=dataset.end, segs=dataset.segs, ) else: dset = features.Dataset(start=dataset.start, end=dataset.end, segs=dataset.segs) # compute ranking metrics # we assume num_glitch, num_clean > 0 or else this function would not be called eff = num_vetod_gch / num_glitch if FAPtype == "DOVL": fap = (num_vetod_cln + num_vetod_gch) / (num_clean + num_glitch) elif FAPtype == "DOVLfap": fap = num_vetod_cln / num_clean else: raise ValueError("FAPtype=%s not understood" % FAPtype) num_vetod_exp = num_glitch * fap poisson_signif = -_gammpln(num_vetod_gch, num_vetod_exp) # nothing vetod, so we give it a low ranking use_percentage = min(num_vetod_gch / num_aux, 1) if num_aux > 0 else 0.0 # handle this carefully so we don't throw things out that have fap=0 but eff>0 eff_fap = eff / fap if fap > 0 else np.infty if eff > 0 else 0.0 # return statistics = ( num_vetod_exp, num_vetod_gch, num_aux, num_vetod_cln, poisson_signif, use_percentage, eff, fap, eff_fap, ) return dset, statistics # --- OVL classes themselves
[docs]class DOVL(SupervisedClassifier): """ Discrete OVL: a modified version of OVL that trains based on discrete samples to estimate the deadtime. We note that there is an extension of this that estimates the False Alarm Probability instead of the deadtime (DOVLfap), but still uses discrete samples. The actual implementation of the OVL algorithm itself is stored in a subclass (:class:idq.classifiers.OVL) because DOVL has standard training signatures while OVL does not *WRITE ME* describe what we "overwrite" from OVL (although we don't really have convenient access to these...) train (a trivial delegation, and we overwrite it simply because we want the signature to have a clearer variable name) _recalculate """ _flavor = "dovl" _required_kwargs = [ "incremental", "minima", "num_recalculate", "metric", "safe_channels_path", "time", "significance", ] _allowed_metrics = "eff_fap use_percentage poisson_signif combined".split() _default_incremental = 1000 _default_minima = { "poisson_signif": 10, "eff_fap": 3, } def __init__(self, *args, **kwargs): SupervisedClassifier.__init__(self, *args, **kwargs) assert ( self.kwargs["metric"] in self._allowed_metrics ), "metric=%s not allowed" % (self.kwargs["metric"]) assert isinstance(self.kwargs["minima"], dict), "minima must be a dictionary" for k in self.kwargs["minima"].keys(): assert k in self._allowed_metrics, "minima's key=%s not allowed" % (k) # record the column names for "time" and "significance" self._time = self.kwargs.pop("time") self._significance = self.kwargs.pop("significance") @property def time(self): return self._time @property def significance(self): return self._significance
[docs] def evaluate(self, dataset): """ sets the ranks for these feature vectors modifies the objects in place! """ if not self.is_trained: raise exceptions.UntrainedError( "%s does not have an internal model" % self.flavor ) verbose = self.kwargs.get("verbose", False) if verbose: template = ( "--- %6d / " + ("%6d" % len(dataset)) + " --- gps = %.6f --- label = %s ---" ) # load/cache triggers model = self.model if not dataset.is_configured(): dataset.configure(model.selector) dataset.load_data() # check required columns are available self._check_columns(dataset.features) # generate veto configuration veto_segs = [ model._configuration2segments(c, dataset, self._time, self._significance) for c in model ] # evaluate one feature vector at a time for ind, feature_vector in enumerate(dataset): if verbose: print(template % (ind + 1, feature_vector.gps, feature_vector.label)) # iterate over veto configurations for c_idx, configuration in enumerate(model): # check if configuration is active if utils.time_in_segments(feature_vector.time, veto_segs[c_idx]): feature_vector["rank"] = configuration["rank"] if verbose: print(model._repr_body % tuple(configuration)) break else: # we found nothing active, so set rank to zero feature_vector["rank"] = 0 if verbose: print("not found") feature_vector["hash"] = self.model.hash return dataset
[docs] def timeseries(self, info, dataset_factory, dt=DEFAULT_DT, segs=None, set_ok=None): """ delegates to Vetolist.timeseries returns ranks """ if not self.is_trained: raise exceptions.UntrainedError( "%s does not have an internal model" % self.flavor ) dataset = dataset_factory.unlabeled(dt=dt, segs=segs) ranks = self.model.timeseries( dataset, self._time, self._significance, dt=dt, segs=segs, **self.kwargs ) return [ SeriesInfo.from_ranks( info, t[0], dt, rank, self.model, self.calibration_map, set_ok=set_ok ) for rank, t in ranks ]
[docs] def redundancies(self, dataset): """ computes the intersection and overlap of vetosegments for each possible configuration. This should contain all information necessary to determine which channels are redundant. we only return information based on the current model, which may have trained itself down to a subset of the total possible configurations returns table, headers table is a matrix of the livetime of the intersections of veto segments from each configuration pair headers is a list of the (channel, threshold, window) tuples, with the same order as the columns in table """ self._check_columns(dataset.features) if not self.is_trained: # instantiate the Vetolist if needed self.model = Vetolist( dataset.start, dataset.end, segs=dataset.segs, model_id=self._model_id, channels=io.path2channels(self.kwargs["safe_channels_path"]), significance=self.significance, **self.kwargs, ) return self.model.redundancies(dataset, self._time, self._significance)
[docs] def feature_importance(self): """ delegates to Vetolist.feature_importance """ if not self.is_trained: raise exceptions.UntrainedError( "%s does not have an internal model" % self.flavor ) return self.model.feature_importance()
[docs] def feature_importance_figure(self, dataset, start, end, t0, **kwargs): """delegate to Vetolist.feature_importance_figure with a few extra things""" if not self.is_trained: raise exceptions.UntrainedError( "%s does not have an internal model" % self.flavor ) return self.model.feature_importance_figure( dataset, start, end, t0, self._time, self._significance, colormap=self.kwargs.get("colormap", plots.DEFAULT_COLORMAP), # default to no colors nonerow=self.kwargs.get("nonerow", True), # default to use what is set by the colorbar nonecolor=self.kwargs.get("nonecolor", None), **kwargs, )
[docs] def feature_importance_table(self, dataset=None, **kwargs): """delegate to Vetolist.feature_importance_table""" return self.model.feature_importance_table( self._time, self._significance, dataset=dataset, **kwargs, )
def _check_columns(self, features): """ assert that required_columns are present in features """ for column in [self._time, self._significance]: for table in features.values(): assert column in table.colnames, "column=%s required" % column
[docs] def train( self, dataset ): # NOTE: the input signature here is the same as for other classifiers """ Instantiates a Vetolist and trains using the data within a dataset. Algorithmic parameters include: * channels * thresholds * windows * num_recalculate * incremental * minima (key, value pairs) and are specified through self.kwargs, set during instantiation. NOTE: We do not explicitly call `self._check_columns(datachunk)` and instead assume the user has already done this when constructing a dataset. """ # set up how we'll report models reporter = HDF5Reporter(self.rootdir, dataset.start, dataset.end) vetolistnickname = self.nickname + "Vetolist" model = Vetolist( dataset.start, dataset.end, segs=dataset.segs, model_id=self._model_id, channels=io.path2channels(self.kwargs["safe_channels_path"]), significance=self.significance, **self.kwargs, ) self.model = model # set up the internal model verbose = self.kwargs.get("verbose", False) # cache data so we get all I/O out of the way once if verbose: print("loading triggers within dataset") # configure feature selection and load if not dataset.is_configured(): dataset.configure(model.selector) dataset.load_data() # evaluate the first round without hierarchical removal # that's why we do all the jazz with incremental... if verbose: print("------------------- INITIAL RECALCULATE -----------------------") to = TIME.time() incremental = self.kwargs["incremental"] self.kwargs["incremental"] = 0 self._recalculate(dataset, **self.kwargs) # actually compute stuff self.kwargs["incremental"] = incremental if verbose: print("-------------------- %.6f sec" % (TIME.time() - to)) # write first iteration to disk reporter.report(vetolistnickname + "Initial", model.table) # re-order the list and prune under-performing configurations for metric, minimum in self.kwargs["minima"].items(): model.prune(minimum, metric=metric) model.reorder(**self.kwargs) # iterate for num_recalculate template = vetolistnickname + "Iter%d" for _ in range(self.kwargs["num_recalculate"]): if verbose: print( "------------------- RECALCULATE : %d -----------------------" % _ ) to = TIME.time() self._recalculate(dataset, **self.kwargs) # calculate stuff if verbose: print("------------------- %.6f sec" % (TIME.time() - to)) # write iteration to disk reporter.report(template % (_), model.table) # re-order and prune for metric, minimum in self.kwargs["minima"].items(): model.prune(minimum, metric=metric) model.reorder(**self.kwargs) # perform a final iteration and then write first iteration to disk if verbose: print("------------------- FINAL RECALCULATE -----------------------") to = TIME.time() self._recalculate(dataset, **self.kwargs) if verbose: print("------------------- %.6f sec" % (TIME.time() - to)) # re-order and write to disk # NOTE: we do NOT prune at this point model.reorder(**self.kwargs) reporter.report(vetolistnickname + "Final", model.table) # return self.model for syntactic consistency with parent return self.model
[docs] def calibrate(self, dataset, bounded=True, **kwargs): return super().calibrate(dataset, bounded=bounded, **kwargs)
def _recalculate( self, dataset, incremental=None, minima=None, multiprocessing_sleep=DEFAULT_MULTIPROCESSING_SLEEP, multiprocessing_chunksize=DEFAULT_MULTIPROCESSING_CHUNKSIZE, **kwargs, ): """ an implementation of the recalculate algorithm that uses clean samples instead of fractional deadtime Can we simply replace a "deadtime estimate" function call here and otherwise use the same logic as OVL itself? that may be complicated by NOTE: we expect each feature in the dataset to contain : columns = ['time', 'significance'] that is checked within train once instead of being checked here repeatedly """ # make a copy, sharing references to FeatureVectors do this so we can # pop crap from working_dataset without affecting dataset (shared # reference) working_dataset = dataset.copy() # do this because specifying in kwargs turns out to be difficult # syntactically if incremental is None: incremental = self._default_incremental if minima is None: minima = self._default_minima num_glitch, num_clean = dataset.vectors2num() cnum_vetod_gch = 0.0 cnum_vetod_cln = 0.0 model = self.model verbose = kwargs.get("verbose", False) if verbose: print(model._repr_head) _repr_body = model._repr_body + " %.6f sec" cind = 0 len_model = len(model) num_proc = self.kwargs.get("num_proc", DEFAULT_NUM_PROC) while (cind < len_model) and ( (incremental > 0) or (num_proc == 1) ): # process serially configuration = model[cind] if num_glitch and num_clean: # there's something to veto! if verbose: to = TIME.time() # compute statistics contained, statistics = _compute_dovl_configuration_performance( working_dataset, cind, model, self._time, self._significance, self.__class__.__name__, ) ( num_vetod_exp, num_vetod_gch, num_aux, num_vetod_cln, poisson_signif, use_percentage, eff, fap, eff_fap, ) = statistics # update configuration in place configuration["num_glitch"] = num_glitch configuration["num_clean"] = num_clean configuration["num_aux"] = num_aux configuration["num_vetod_exp"] = num_vetod_exp configuration["num_vetod_gch"] = num_vetod_gch configuration["num_vetod_cln"] = num_vetod_cln configuration["poisson_signif"] = poisson_signif configuration["use_percentage"] = use_percentage configuration["eff"] = eff configuration["fap"] = fap configuration["eff_fap"] = eff_fap # figure out if we'll keep this configuration moving forward if incremental: keep = True for metric, threshold in minima.items(): keep *= configuration[metric] >= threshold if keep: incremental -= 1 num_clean -= num_vetod_cln cnum_vetod_cln += num_vetod_cln num_glitch -= num_vetod_gch cnum_vetod_gch += num_vetod_gch working_dataset -= ( contained # retain only the vectors that were not flagged ) # NOTE: modifies dataset in place! # finish updating configuration configuration["cnum_vetod_gch"] = cnum_vetod_gch configuration["cnum_vetod_cln"] = cnum_vetod_cln if verbose: print(_repr_body % (tuple(configuration) + (TIME.time() - to,))) else: # we're missing data of some kind, so we should just skip the # rest of the configurations -> zero their contributions for name in model._table.dtype.names: if name not in [ "channel", "threshold", "window", "num_glitch", "num_clean", "cnum_vetod_gch", "cnum_vetod_cln", ]: model[cind:][name] = 0 model[cind:]["num_glitch"] = num_glitch model[cind:]["num_clean"] = num_clean model[cind:]["cnum_vetod_gch"] = cnum_vetod_gch model[cind:]["cnum_vetod_cln"] = cnum_vetod_cln break cind += 1 if (cind < len_model) and num_glitch and num_clean: # we did not finish the list, so there could be something to do if verbose: print( "processing remaining configurations in parallel with %d processes" % num_proc ) pool = mp.Pool( num_proc, initializer=_compute_dovl_configuration_performance_initializer, initargs=( working_dataset, model, self._time, self._significance, self.__class__.__name__, ), ) if verbose: to = TIME.time() # set the things that will be the same for all remaning configurations model[cind:]["num_glitch"] = num_glitch model[cind:]["num_clean"] = num_clean model[cind:]["cnum_vetod_gch"] = cnum_vetod_gch model[cind:]["cnum_vetod_cln"] = cnum_vetod_cln # process the rest of the channel's performances in parallel result = pool.map_async( _compute_dovl_configuration_performance_wrapper, range(cind, len(model)), chunksize=multiprocessing_chunksize, ) while not result.ready(): TIME.sleep(multiprocessing_sleep) if not result.successful(): raise RuntimeError("call to multiprocessing.pool.map_async failed") for ind, statistics in enumerate(result.get()): configuration = model[cind + ind] ( num_vetod_exp, num_vetod_gch, num_aux, num_vetod_cln, poisson_signif, use_percentage, eff, fap, eff_fap, ) = statistics # update configuration in place configuration["num_aux"] = num_aux configuration["num_vetod_exp"] = num_vetod_exp configuration["num_vetod_gch"] = num_vetod_gch configuration["num_vetod_cln"] = num_vetod_cln configuration["poisson_signif"] = poisson_signif configuration["use_percentage"] = use_percentage configuration["eff"] = eff configuration["fap"] = fap configuration["eff_fap"] = eff_fap if verbose: print(_repr_body % (tuple(configuration) + (TIME.time() - to,)))
[docs]class DOVLfap(DOVL): """ an extension of DOVL that estimates the FAP in a slightly different way. DOVL uses an approximation of the deadtime (comparable to what OVL does) fap=(num_vetod_gch+num_vetod_cln)/(num_gch+num_cln) DOVLfap uses an approximation close to the FAP fap=num_vetod_cln/num_clean while the FAP might appear more correct, the deadtime is likely to be more numerically stable no infinities and thresholds on poisson_signif may prune over-trained configureations this is controlled based on a conditional within _compute_dovl_configuration_performance """ _flavor = "dovlfap"
# --- classic OVL rather than the discrete equivalent def _compute_ovl_configuration_performance_initializer( local_segs, local_gch_times, local_model, local_dataset, local_time, local_significance, ): """should NOT be used outside of multiprocessing""" global segs, gch_times, model, dataset, time, significance segs = local_segs gch_times = local_gch_times model = local_model dataset = local_dataset time = local_time significance = local_significance def _compute_ovl_configuration_performance_wrapper(cind): """should NOT be used outside of multiprocessing""" _, _, statistics = _compute_ovl_configuration_performance( segs, gch_times, cind, model, dataset, time, significance ) return statistics def _compute_ovl_configuration_performance( segs, gch_times, cind, model, dataset, time, significance ): """ computes this configuration's performance a convenient wrapper that is used within _recalculate to avoid repeating code assumes both segments and gch_times are not empty (i.e.: there is data to veto) """ configuration = model[cind] window = configuration["window"] # NOTE: we do not filter by segs because that's all handled within # _compute_configuration_performance aux_segs = model._configuration2segments(configuration, dataset, time, significance) aux_segs &= segs num_glitch = len(gch_times) livetime = utils.livetime(segs) # NOTE: the old implementation of OVL did something like this num_aux = # np.sum(utils.times_in_segments(aux_times, segs)) # filter by segments to # get the count which could lead to the case where there was non-zero # livetime in the aux_segs while num_aux=0 instead, we now define num_aux as # the "effective number of triggers required to obtain the same amount of # aux livetime this could lead to slightly different performance compared to # the old implementation num_aux = utils.livetime(aux_segs) / window # figure out what is removed by this configuration # and which times are contained in the segments contained = utils.times_in_segments(gch_times, aux_segs) num_vetod_gch = 1.0 * np.sum(contained) aux_livetime = utils.livetime(aux_segs) # compute ranking metrics eff = num_vetod_gch / num_glitch fap = aux_livetime / livetime num_vetod_exp = num_glitch * fap # (num_glitch/livetime)*aux_livetime poisson_signif = -_gammpln(num_vetod_gch, num_vetod_exp) # nothing vetod, so we give it a low ranking use_percentage = min(num_vetod_gch / num_aux, 1) if num_aux > 0 else 0.0 # nothing vetod, so we give it a low ranking # note, we cannot have eff>0 with fap==0 within OVL, unlike within DOVL # return eff_fap = eff / fap if fap > 0 else 0.0 statistics = ( num_vetod_exp, num_vetod_gch, num_aux, aux_livetime, poisson_signif, use_percentage, eff, fap, eff_fap, ) return contained, aux_segs, statistics
[docs]class OVL(DOVL): """ a wrapper for the Ordered Veto List (OVL) algorithm published in Essick et al, CQG 30, 15 (2013) (DOI: 10.1088/0264-9381/30/15/155010) This algorithm estimates False Alarm Probability based on measures of the deadtime associated with segments generated around auxiliary events. *WRITE ME* set this up so it takes in a path to an output directory and then writes the Vetolist, data objects into that directory with appropriate names (extract start, end from data objects) describe inheritence for the extra attributes not declared within SupervisedClassifier _allowed_metrics _default_incremental _default_minima _gammln_cof _gammln_stp and the associated methods _recalculate redundancies _check_columns _gcf _gammln _gserln _gammpln """ _flavor = "ovl"
[docs] def train(self, dataset): """ Instantiates a Vetolist and trains using the data within a dataset. Algorithmic parameters include: * channels * thresholds * windows * num_recalculate * incremental * minima (key, value pairs) and are specified through self.kwargs, set during instantiation. """ # sanity check training data num_glitch, _ = dataset.vectors2num() if num_glitch == 0: raise ValueError( "training OVL-based classifiers not allowed with zero target times. " "this may indicate an issue with data discovery or not requesting " "enough time for training." ) # set up how we'll report models reporter = HDF5Reporter(self.rootdir, dataset.start, dataset.end) vetolistnickname = self.nickname + "Vetolist" model = Vetolist( dataset.start, dataset.end, segs=dataset.segs, model_id=self._model_id, channels=io.path2channels(self.kwargs["safe_channels_path"]), significance=self.significance, **self.kwargs, ) self.model = model # set up the internal model verbose = self.kwargs.get("verbose", False) if verbose: print("loading triggers within dataset") # configure feature selection and load if not dataset.is_configured(): dataset.configure(model.selector) dataset.load_data() # check required columns are available self._check_columns(dataset.features) # evaluate the first round without hierarchical removal # that's why we do all the jazz with incremental... if verbose: print("------------------- INITIAL RECALCULATE -----------------------") to = TIME.time() incremental = self.kwargs["incremental"] self.kwargs["incremental"] = 0 self._recalculate(dataset, **self.kwargs) # actually compute stuff self.kwargs["incremental"] = incremental if verbose: print("-------------------- %.6f sec" % (TIME.time() - to)) # write first iteration to disk reporter.report(vetolistnickname + "Initial", model.table) # re-order the list and prune under-performing configurations for metric, minimum in self.kwargs["minima"].items(): model.prune(minimum, metric=metric) model.reorder(**self.kwargs) # iterate for num_recalculate template = vetolistnickname + "Iter%d" for _ in range(self.kwargs["num_recalculate"]): if verbose: print( "------------------- RECALCULATE : %d -----------------------" % _ ) to = TIME.time() self._recalculate(dataset, **self.kwargs) # calculate stuff if verbose: print("------------------- %.6f sec" % (TIME.time() - to)) # write iteration to disk reporter.report(template % (_), model.table) # re-order and prune for metric, minimum in self.kwargs["minima"].items(): model.prune(minimum, metric=metric) model.reorder(**self.kwargs) # iterate once more and write final iteration to disk if verbose: print("------------------- FINAL RECALCULATE -----------------------") to = TIME.time() self._recalculate(dataset, **self.kwargs) # actually compute stuff if verbose: print("-------------------- %.6f sec" % (TIME.time() - to)) # NOTE: we do NOT prune at this point! model.reorder(**self.kwargs) reporter.report(vetolistnickname + "Final", model.table) # return self.model for syntactic consistency with parent return self.model
def _recalculate( self, dataset, incremental=None, minima=None, memory_light=False, multiprocessing_sleep=DEFAULT_MULTIPROCESSING_SLEEP, multiprocessing_chunksize=DEFAULT_MULTIPROCESSING_CHUNKSIZE, **kwargs, ): """ the main learning calculation associated with OVL calculate the performance of a set of ordered veto configurations on data does NOT re-order or prune the list for you; you must do that by hand assumes self.model already points to a Vetolist object NOTE: we expect the dataset to contain : columns = [self._time, self._significance] that is checked within train once instead of being checked here repeatedly """ # do this because specifying in kwargs turns out to be difficult syntactically if incremental is None: incremental = self._default_incremental if minima is None: minima = self._default_minima gch, _ = dataset.vectors2classes() gch_times = np.array(gch.times) # make a copy for internal consumption num_glitch = len(gch_times) cnum_vetod_gch = 0 # list of segments representing remaining time, will be built up # throughout the loop for hierarchical application of configurations segs = segmentlist( dataset.segs ) # make a copy so we don't update a shared reference accidentally livetime = utils.livetime(segs) cveto_livetime = 0 model = self.model verbose = kwargs.get("verbose", False) if verbose: print(model._repr_head) _repr_body = model._repr_body + " %.6f sec" cind = 0 len_model = len(model) num_proc = self.kwargs.get("num_proc", DEFAULT_NUM_PROC) while (cind < len_model) and ((incremental > 0) or (num_proc == 1)): # process serially, iterate over vetolist configuration = model[cind] if livetime and num_glitch: if verbose: to = TIME.time() # compute statistics ( contained, aux_segs, statistics, ) = _compute_ovl_configuration_performance( segs, gch_times, cind, model, dataset, self._time, self._significance, ) ( num_vetod_exp, num_vetod_gch, num_aux, aux_livetime, poisson_signif, use_percentage, eff, fap, eff_fap, ) = statistics # update configuration in place configuration["num_glitch"] = num_glitch configuration["num_clean"] = livetime configuration["num_aux"] = num_aux configuration["num_vetod_exp"] = num_vetod_exp configuration["num_vetod_gch"] = num_vetod_gch configuration["num_vetod_cln"] = aux_livetime configuration["poisson_signif"] = poisson_signif configuration["use_percentage"] = use_percentage configuration["eff"] = eff configuration["fap"] = fap configuration["eff_fap"] = eff_fap # figure out if we'll keep this configuration moving forward if incremental: keep = True for metric, threshold in minima.items(): keep *= configuration[metric] >= threshold if keep: incremental -= 1 livetime -= aux_livetime cveto_livetime += aux_livetime num_glitch -= num_vetod_gch cnum_vetod_gch += num_vetod_gch segs -= aux_segs # remove the vetod segments gch_times = gch_times[ np.logical_not(contained) ] # retain only the triggers that were not flagged # finish updating configuration configuration["cnum_vetod_gch"] = cnum_vetod_gch configuration["cnum_vetod_cln"] = cveto_livetime # pop this channel to free up memory if memory_light: dataset.features.pop( configuration["channel"] ) # garbage collector should gobble up the returned value if verbose: print(_repr_body % (tuple(configuration) + (TIME.time() - to,))) else: # we're missing data of some kind, so we should just skip the # rest of the configurations -> zero their contributions for name in model._table.dtype.names: if name not in [ "channel", "threshold", "window", "num_glitch", "num_clean", "cnum_vetod_gch", "cnum_vetod_cln", ]: model[cind:][name] = 0 model[cind:]["num_glitch"] = num_glitch model[cind:]["num_clean"] = livetime model[cind:]["cnum_vetod_gch"] = cnum_vetod_gch model[cind:]["cnum_vetod_cln"] = cveto_livetime break cind += 1 # increment the counter if (cind < len_model) and num_glitch and livetime: # we did not finish the list before incremental ran out and we want # to run on more than one core if verbose: print( "processing remaining configurations in parallel with %d processes" % num_proc ) pool = mp.Pool( num_proc, initializer=_compute_ovl_configuration_performance_initializer, initargs=( segs, gch_times, model, dataset, self._time, self._significance, ), ) if verbose: to = TIME.time() # set the things that will be the same for all remaning configurations model[cind:]["num_glitch"] = num_glitch model[cind:]["num_clean"] = livetime model[cind:]["cnum_vetod_gch"] = cnum_vetod_gch model[cind:]["cnum_vetod_cln"] = cveto_livetime # process the rest of the channel's performances in parallel result = pool.map_async( _compute_ovl_configuration_performance_wrapper, range(cind, len(model)) ) while not result.ready(): TIME.sleep(multiprocessing_sleep) if not result.successful(): raise RuntimeError("call to multiprocessing.pool.map_async failed") for ind, statistics in enumerate(result.get()): configuration = model[cind + ind] ( num_vetod_exp, num_vetod_gch, num_aux, aux_livetime, poisson_signif, use_percentage, eff, fap, eff_fap, ) = statistics # update configuration in place configuration["num_aux"] = num_aux configuration["num_vetod_exp"] = num_vetod_exp configuration["num_vetod_gch"] = num_vetod_gch configuration["num_vetod_cln"] = aux_livetime configuration["poisson_signif"] = poisson_signif configuration["use_percentage"] = use_percentage configuration["eff"] = eff configuration["fap"] = fap configuration["eff_fap"] = eff_fap # pop this channel to free up memory if memory_light: dataset.features.pop( configuration["channel"] ) # garbage collector should gobble up the returned value if verbose: print(_repr_body % (tuple(configuration) + (TIME.time() - to,)))
@hookimpl def get_classifiers(): return { "ovl": OVL, "dovl": DOVL, "dovl-fap": DOVLfap, }