# Source code for idq.classifiers

import numpy as np
import pluggy

from ligo.segments import segment, segmentlist

from .. import calibration
from .. import exceptions
from .. import names
from .. import utils


# the default time step between samples (1/256 s, i.e. a 256 Hz sample rate)
# used by all calls to SupervisedClassifier.timeseries
DEFAULT_DT = 1.0 / 256  # sec


class ClassifierModel(object):
    """
    Parent class defining the basic provenance attributes every trained model
    must carry: the [start, end) time span and segments the model was trained
    over, plus an identifier used to locate the model on disk.

    Individual classifiers are expected to extend this class for their own
    purposes.
    """

    def __init__(self, start, end, segs=None, model_id=None, generate_id=False):
        self._start = start
        self._end = end
        # default to a single segment covering the full [start, end) span
        self._segs = segmentlist([segment(start, end)]) if segs is None else segs
        # either mint a fresh unique id or take the one handed to us (may be None)
        self._model_id = utils.generate_unique_id() if generate_id else model_id
        self._hash = None

    @property
    def start(self):
        return self._start

    @start.setter
    def start(self, value):
        # moving start forward may push end along with it; segments are clipped
        # to the resulting span
        self._start = value
        self._end = max(self._end, value)
        self._segs &= segmentlist([segment(self._start, self._end)])

    @property
    def end(self):
        return self._end

    @end.setter
    def end(self, value):
        # moving end backward may pull start along with it; segments are clipped
        # to the resulting span
        self._start = min(self._start, value)
        self._end = value
        self._segs &= segmentlist([segment(self._start, self._end)])

    @property
    def segs(self):
        return self._segs

    @segs.setter
    def segs(self, value):
        # replacing segments may widen the [start, end) span to cover them
        self._segs = segmentlist(value)
        if self._segs:
            self._start = min(self._start, self._segs[0][0])
            self._end = max(self._end, self._segs[-1][1])

    @property
    def model_id(self):
        try:
            return self._model_id
        except AttributeError:
            pass
        try:
            # classifiers with v0.5 metadata
            return f"run{self._run_id}"
        except AttributeError:
            # pre v0.5 classifiers without any metadata
            return None

    @property
    def hash(self):
        """the identifier used to locate this model."""
        if self._hash is None:
            return names.times_id2hash(self._start, self._end, self.model_id)
        return self._hash

    @hash.setter
    def hash(self, value):
        self._hash = value

    def feature_importance_figure(self, dataset, start, end, t0, **kwargs):
        """generate and return a figure demonstrating the feature importance
        based on the data within dataset; should return a figure object."""
        raise NotImplementedError

    def feature_importance_table(self, dataset, **kwargs):
        """should return (columns, data) compatible with the DQR's
        json.format_table (see use in idq/reports.py)"""
        raise NotImplementedError
# -------------------------------------------------
# Supervised Classification Objects
# -------------------------------------------------
[docs]class SupervisedClassifier(object): """ a parent class for classifiers. Children should overwrite methods as necessary. This classifier will support everything required syntactically for the pipeline to function, but will assign random ranks to all events. """ _flavor = "supervised_classifier" _required_kwargs = [] # set during instantiation, but "declared" here for clarity kwargs = dict() def __init__(self, nickname, rootdir=".", model_id=None, **kwargs): self._nickname = nickname # stored in case the object wants to write intermediate data products to # disk, etc self.rootdir = rootdir self._model_id = model_id for kwarg in self._required_kwargs: assert kwarg in kwargs, "kwarg=%s required" % kwarg self.kwargs = kwargs # set up calibration map self._calibration_map = None # set up model self._model = None @property def nickname(self): """ this is a "private" variable because I don't ever want a user to muck with this once it is set upon instantiation """ return self._nickname @property def flavor(self): """ this is a "private" variable because I don't ever want a user to muck with this. I also want each child to have to declare this for themselves. this should be considered like a "type" but may be easier to deal with a string instead of a Type object """ return self._flavor @property def is_trained(self): return self._model is not None @property def is_calibrated(self): return self._calibration_map is not None @property def model(self): if not self.is_trained: raise exceptions.UntrainedError return self._model @model.setter def model(self, model): """ update the model with a stored description. WARNING: you should only use this if you really know what you're doing! 
""" self._model = model self._model_id = model.model_id @property def calibration_map(self): if not self.is_calibrated: raise exceptions.UncalibratedError return self._calibration_map @calibration_map.setter def calibration_map(self, calibration_map): """ update the calibration_map with a stored description WARNING: you should only use this if you really know what you're doing! """ self._calibration_map = calibration_map
[docs] def calibrate(self, dataset, **kwargs): """ calibrate this algorithm based on the dataset of feature vectors. requires all FeatureVectors in the dataset to have been evaluated This should update self._calibration_map """ assert dataset.is_evaluated(), "dataset has not yet been evaluated!" if not self.is_calibrated: # no previous calibration map if self.kwargs.get( "discrete_calibration", False ): # use a discrete calibration map self.calibration_map = calibration.DiscreteCalibrationMap( dataset, model_id=self._model_id, **kwargs ) else: self.calibration_map = calibration.CalibrationMap( dataset, model_id=self._model_id, **kwargs ) self.calibration_map.optimize(**kwargs) else: # add observations incrementally and rely on calibration_map to # auto_optimize as needed self.calibration_map.add_and_flush(dataset, **kwargs) return self.calibration_map
[docs] def train(self, dataset): """ This classifier does NOT use data to make predictions. Instead, it supports this method for syntactic completeness. Note: this does NOT update self._model, and therefore self.feature_importance will continue to raise exceptions """ # should point to the actual model, but we just set it to True so we can # identify that this classifier has been trained self.model = ClassifierModel( dataset.start, dataset.end, dataset.segs, model_id=self._model_id ) return self.model
[docs] def evaluate(self, dataset): """ This classifier assigns random ranks to all events independent of training data set. data should have the shape (Nsamples, Nfeatures) return an 1D array with length Nsamples representing the ranks assigned to each sample in data WARNING: this needs to be highly efficient if we're to use it to build time-series! """ if not self.is_trained: # not strictly necessary because we don't use a model here, but this # should mimic the behavior of real classifiers raise exceptions.UntrainedError( "%s does not have an internal model" % self.flavor ) for rank, feature_vector in zip(np.random.rand(len(dataset)), dataset): feature_vector.rank = rank feature_vector.hash = self.model.hash return dataset
[docs] def timeseries(self, info, dataset_factory, dt=DEFAULT_DT, segs=None, set_ok=None): """ returns ranks """ if segs is None: segs = dataset_factory.classifier_data.segs return [ (np.random.random(len(t)), t[0], dt) for t in utils.segs2times(segs, dt) ]
[docs] def feature_importance(self): """ return a ranked list of important features within the trained model will raise an UntrainedException if we do not have a trained model stored internally """ if not self.is_trained: raise exceptions.UntrainedError( "%s does not have an internal model" % self.flavor ) return [] # place-holder. We don't use any features for this random Classifier
[docs] def feature_importance_figure(self, *args, **kwargs): """generate and return a figure demonstrating the feature importance based on the data within dataset factory; should return a figure object.""" if not self.is_trained: raise exceptions.UntrainedError( "%s does not have an internal model" % self.flavor ) raise NotImplementedError
[docs] def feature_importance_table(self, *args, **kwargs): """should return (columns, data) compatible with the DQR's json.format_table (see use in idq/reports.py""" if not self.is_trained: raise exceptions.UntrainedError( "%s does not have an internal model" % self.flavor ) raise NotImplementedError
class IncrementalSupervisedClassifier(SupervisedClassifier):
    """
    An extension of SupervisedClassifier that is meant to re-train itself
    incrementally instead of a series of batch jobs (starting from scratch)

    should be able to inherit much of the functionality from
    SupervisedClassifier
    """

    # NOTE(review): "suprvised" is a typo, but the flavor string may be used
    # as a persistent identifier elsewhere, so it is preserved as-is
    _flavor = "incremental_suprvised_classifier"

    def train(self, dataset):
        """
        this should incrementally update the internal model. Otherwise, the
        classifier's behavior should be the same as SupervisedClassifier
        """
        # BUGFIX: previously assigned "self.model = True", but the inherited
        # model setter reads model.model_id, so a bare True raised
        # AttributeError. Store a minimal ClassifierModel carrying the
        # dataset's provenance instead, matching SupervisedClassifier.train
        self.model = ClassifierModel(
            dataset.start, dataset.end, dataset.segs, model_id=self._model_id
        )
        return self.model
# marker used to declare the pluggy hook specifications below; plugins
# implement these hooks under the "iDQ" project name
hookspec = pluggy.HookspecMarker("iDQ")
@hookspec
def get_classifiers():
    """
    This hook is used to return SupervisedClassifiers in the form:

        {"type[:specifier]": Classifier}

    where the specifier (optional) refers to a flavor of that particular
    classifier for more specificity
    """
@hookspec
def get_incremental_classifiers():
    """
    This hook is used to return IncrementalSupervisedClassifiers in the form:

        {"type[:specifier]": Classifier}

    where the specifier (optional) refers to a flavor of that particular
    classifier for more specificity
    """