import numpy as np
import pluggy
from ligo.segments import segment, segmentlist
from .. import calibration
from .. import exceptions
from .. import names
from .. import utils
# the default spacing between successive samples for all calls to
# SupervisedClassifier.timeseries
DEFAULT_DT = 1.0 / 256  # sec (i.e., a 256 Hz sample rate)
class ClassifierModel(object):
    """
    a parent class that defines the basic attributes all trained models must
    have in order to track data provenance. each classifier will likely extend
    this class for its own purposes
    """
def __init__(self, start, end, segs=None, model_id=None, generate_id=False):
self._start = start
self._end = end
if segs is None:
segs = segmentlist([segment(start, end)])
self._segs = segs
if generate_id:
self._model_id = utils.generate_unique_id()
else:
self._model_id = model_id
self._hash = None
@property
def start(self):
return self._start
@start.setter
def start(self, new):
self._start = new
self._end = max(self._end, new)
self._segs &= segmentlist([segment(self._start, self._end)])
@property
def end(self):
return self._end
@end.setter
def end(self, new):
self._start = min(self._start, new)
self._end = new
self._segs &= segmentlist([segment(self._start, self._end)])
@property
def segs(self):
return self._segs
@segs.setter
def segs(self, new):
self._segs = segmentlist(new)
if len(self._segs):
self._start = min(self._start, self._segs[0][0])
self._end = max(self._end, self._segs[-1][1])
@property
def model_id(self):
try:
return self._model_id
except AttributeError:
try:
# classifiers with v0.5 metadata
return f"run{self._run_id}"
except AttributeError:
# pre v0.5 classifiers without any metadata
return None
@property
def hash(self):
"""the identifier used to locate this model."""
if self._hash is not None:
return self._hash
else:
return names.times_id2hash(self._start, self._end, self.model_id)
@hash.setter
def hash(self, new):
self._hash = new
    def feature_importance_table(self, dataset, **kwargs):
"""should return (columns, data) compatible with the DQR's
json.format_table (see use in idq/reports.py)"""
raise NotImplementedError
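
# A minimal usage sketch of ClassifierModel (illustrative only; the GPS times
# are arbitrary). note that the start/end setters only ever *intersect*
# self.segs with the new span, so shrinking the span clips segs:
#
#     model = ClassifierModel(1000000000, 1000000100, generate_id=True)
#     model.end = 1000000050  # shrinking the span clips segs
#     model.segs              # -> [segment(1000000000, 1000000050)]
#     model.hash              # derived from start, end, and model_id
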
# -------------------------------------------------
# Supervised Classification Objects
# -------------------------------------------------
class SupervisedClassifier(object):
    """
    a parent class for classifiers. Children should override methods as
    necessary. This classifier supports everything required syntactically
    for the pipeline to function, but assigns random ranks to all events.
    """
_flavor = "supervised_classifier"
_required_kwargs = []
# set during instantiation, but "declared" here for clarity
kwargs = dict()
def __init__(self, nickname, rootdir=".", model_id=None, **kwargs):
self._nickname = nickname
# stored in case the object wants to write intermediate data products to
# disk, etc
self.rootdir = rootdir
self._model_id = model_id
for kwarg in self._required_kwargs:
assert kwarg in kwargs, "kwarg=%s required" % kwarg
self.kwargs = kwargs
# set up calibration map
self._calibration_map = None
# set up model
self._model = None
@property
def nickname(self):
"""
this is a "private" variable because I don't ever want a user to muck
with this once it is set upon instantiation
"""
return self._nickname
@property
def flavor(self):
"""
this is a "private" variable because I don't ever want a user to muck
with this. I also want each child to have to declare this for
        themselves. this should be considered like a "type", but a string may
        be easier to deal with than a Type object
"""
return self._flavor
@property
def is_trained(self):
return self._model is not None
@property
def is_calibrated(self):
return self._calibration_map is not None
@property
def model(self):
if not self.is_trained:
raise exceptions.UntrainedError
return self._model
@model.setter
def model(self, model):
"""
update the model with a stored description.
WARNING:
you should only use this if you really know what you're doing!
"""
self._model = model
self._model_id = model.model_id
@property
def calibration_map(self):
if not self.is_calibrated:
raise exceptions.UncalibratedError
return self._calibration_map
@calibration_map.setter
def calibration_map(self, calibration_map):
"""
update the calibration_map with a stored description
WARNING:
you should only use this if you really know what you're doing!
"""
self._calibration_map = calibration_map
    def calibrate(self, dataset, **kwargs):
        """
        calibrate this algorithm based on the dataset of feature vectors.
        requires all FeatureVectors in the dataset to have been evaluated.
        this should update self._calibration_map
        """
assert dataset.is_evaluated(), "dataset has not yet been evaluated!"
if not self.is_calibrated: # no previous calibration map
if self.kwargs.get(
"discrete_calibration", False
): # use a discrete calibration map
self.calibration_map = calibration.DiscreteCalibrationMap(
dataset, model_id=self._model_id, **kwargs
)
else:
self.calibration_map = calibration.CalibrationMap(
dataset, model_id=self._model_id, **kwargs
)
self.calibration_map.optimize(**kwargs)
else:
# add observations incrementally and rely on calibration_map to
# auto_optimize as needed
self.calibration_map.add_and_flush(dataset, **kwargs)
return self.calibration_map
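    # For example, passing discrete_calibration=True at instantiation selects
    # the DiscreteCalibrationMap branch on the first call to calibrate (a
    # sketch; `dataset` stands in for a real, already-evaluated DataSet):
    #
    #     clf = SupervisedClassifier("demo", discrete_calibration=True)
    #     clf.train(dataset)
    #     clf.evaluate(dataset)
    #     clf.calibrate(dataset)  # -> calibration.DiscreteCalibrationMap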
    def train(self, dataset):
        """
        This classifier does NOT use data to make predictions. Instead, it
        supports this method for syntactic completeness. Note: it only stores
        a provenance-tracking placeholder model so the classifier registers as
        trained
        """
        # the model does not describe a real fit; it only records provenance
        # (start, end, segs, model_id) so we can identify that this classifier
        # has been trained
self.model = ClassifierModel(
dataset.start, dataset.end, dataset.segs, model_id=self._model_id
)
return self.model
    def evaluate(self, dataset):
        """
        This classifier assigns random ranks to all events, independent of any
        training data. assigns a rank and the model's hash to each
        FeatureVector in the dataset and returns the dataset
        WARNING: this needs to be highly efficient if we're to use it to build
        time-series!
        """
if not self.is_trained:
            # raise early for clarity; accessing self.model.hash below would
            # raise the same UntrainedError via the model property
raise exceptions.UntrainedError(
"%s does not have an internal model" % self.flavor
)
for rank, feature_vector in zip(np.random.rand(len(dataset)), dataset):
feature_vector.rank = rank
feature_vector.hash = self.model.hash
return dataset
    def timeseries(self, info, dataset_factory, dt=DEFAULT_DT, segs=None, set_ok=None):
        """
        returns a list of (ranks, start, dt) tuples, one per contiguous
        segment, where ranks is a 1D array of random ranks sampled every dt
        seconds
        """
if segs is None:
segs = dataset_factory.classifier_data.segs
return [
(np.random.random(len(t)), t[0], dt) for t in utils.segs2times(segs, dt)
]
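    # For instance, a single 4-second segment sampled at DEFAULT_DT would come
    # back as a one-element list (values illustrative; the exact number of
    # samples depends on how utils.segs2times handles segment endpoints):
    #
    #     [(ranks, t0, 1.0 / 256)]  with len(ranks) ~ 4 * 256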
    def feature_importance(self):
        """
        returns a ranked list of the important features within the trained
        model. raises an UntrainedError if we do not have a trained model
        stored internally
        """
if not self.is_trained:
raise exceptions.UntrainedError(
"%s does not have an internal model" % self.flavor
)
return [] # place-holder. We don't use any features for this random Classifier
    def feature_importance_table(self, *args, **kwargs):
        """should return (columns, data) compatible with the DQR's
        json.format_table (see use in idq/reports.py)"""
if not self.is_trained:
raise exceptions.UntrainedError(
"%s does not have an internal model" % self.flavor
)
raise NotImplementedError
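
# A rough end-to-end sketch of the intended workflow (`dataset`, `info`, and
# `factory` are hypothetical stand-ins for real iDQ objects):
#
#     clf = SupervisedClassifier("random", rootdir="/tmp", model_id="demo")
#     clf.train(dataset)             # stores a provenance-only model
#     clf.evaluate(dataset)          # assigns a random rank to each vector
#     clf.calibrate(dataset)         # builds/updates the calibration map
#     clf.timeseries(info, factory)  # -> [(ranks, start, dt), ...]
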
class IncrementalSupervisedClassifier(SupervisedClassifier):
    """
    an extension of SupervisedClassifier that is meant to re-train itself
    incrementally instead of via a series of batch jobs (each starting from
    scratch). should be able to inherit much of the functionality from
    SupervisedClassifier
    """
_flavor = "incremental_suprvised_classifier"
    def train(self, dataset):
        """
        this should incrementally update the internal model. otherwise, the
        classifier's behavior should be the same as SupervisedClassifier
        """
        # should point to an incrementally-updated model; as in the parent
        # class, we store a provenance-only placeholder so the classifier
        # registers as trained (assigning a bare True would break the model
        # setter, which reads model.model_id)
        self.model = ClassifierModel(
            dataset.start, dataset.end, dataset.segs, model_id=self._model_id
        )
        return self.model
hookspec = pluggy.HookspecMarker("iDQ")
@hookspec
def get_classifiers():
    """
    This hook is used to return SupervisedClassifiers in the form:
        {"type[:specifier]": Classifier}
    where the optional specifier refers to a particular flavor of that
    classifier
    """
@hookspec
def get_incremental_classifiers():
    """
    This hook is used to return IncrementalSupervisedClassifiers in the form:
        {"type[:specifier]": Classifier}
    where the optional specifier refers to a particular flavor of that
    classifier
    """