import logging
import timeit
import warnings
import numpy as np
from scipy.stats import randint, reciprocal, uniform
from ligo.segments import segmentlist
from sklearn import linear_model as sklearn_linear_model
from sklearn import naive_bayes as sklearn_naive_bayes
from sklearn import svm as sklearn_svm
from sklearn import neural_network as sklearn_neural_network
from sklearn import pipeline as sklearn_pipeline
from sklearn import model_selection as sklearn_model_selection
from sklearn import preprocessing as sklearn_preprocessing
from sklearn import kernel_approximation as sklearn_kernel_approximation
from sklearn.utils import class_weight
# catch weight boosting deprecation warning
with warnings.catch_warnings():
warnings.simplefilter("ignore", DeprecationWarning)
from sklearn import ensemble as sklearn_ensemble
from .. import exceptions
from .. import hookimpl
from .. import features
from .. import io
from ..series import SeriesInfo
from . import ClassifierModel, SupervisedClassifier, IncrementalSupervisedClassifier
from . import DEFAULT_DT
logger = logging.getLogger("idq")
DEFAULT_NUM_CV_PROC = 1 # sets number of parallel jobs for CV
DEFAULT_CV_SCORING = "roc_auc" # sets scoring metric for CV
DEFAULT_NUM_CV_FOLDS = 5 # sets number of folds used for CV
DEFAULT_NUM_SAMPLES = 10 # sets number of parameters to sample for randomized CV
DEFAULT_MAX_TRAIN_RANK_SCALE = (
0.99 # sets max value of ranks for scaling found in training
)
DEFAULT_COLUMN_VALUE = 0
DEFAULT_WHITENER = "standard"
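# The DEFAULT_* values above are fallbacks for per-classifier keyword arguments.
# An illustrative (hypothetical) configuration snippet, not defined in this module,
# might override them like:
#
#     kwargs = {
#         "num_cv_proc": 4,         # parallel jobs for cross validation
#         "num_cv_folds": 3,        # folds used by GridSearchCV/RandomizedSearchCV
#         "cv_scoring": "roc_auc",  # any sklearn scoring string
#         "whitener": "robust",     # "standard" (default) or "robust"
#     }
#
# these keys are read via self.kwargs.get(...) in SupervisedSklearnClassifier below.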
# -------------------------------------------------
# classifier implementations
class SklearnModel(ClassifierModel):
"""a base model class for all sklearn classifiers"""
def __init__(
self,
start,
end,
pipeline,
channels,
downselector,
transformer,
time=features.DEFAULT_TIME_NAME,
bounds=None,
segs=None,
model_id=None,
generate_id=False,
):
ClassifierModel.__init__(
self, start, end, segs=segs, model_id=model_id, generate_id=generate_id
)
# ensure the classifier returns probabilistic estimates or
# some mechanism for returning a continuous range
if not (
hasattr(pipeline.named_steps["classifier"], "predict_proba")
or hasattr(pipeline.named_steps["classifier"], "decision_function")
):
            raise AttributeError(
                "%s has no predict_proba or decision_function attribute"
                % self.flavor
            )
self._scale_ranks = not hasattr(
pipeline.named_steps["classifier"], "predict_proba"
)
self._sklearn = pipeline
# set up how features are extracted and transformed
if not bounds:
bounds = {}
self._selector = features.Selector(
channels=channels,
time=time,
bounds=bounds,
downselector=downselector,
transformer=transformer,
)
@property
def sklearn(self):
return self._sklearn
@property
def channels(self):
return self._selector.channels
@property
def time(self):
return self._selector.time
@property
def selector(self):
return self._selector
@property
def hyperparameters(self):
return self._sklearn.get_params()
def fit(self, data, labels, **kwargs):
self.sklearn.fit(data, labels)
# if using CV, only grab estimator with best hyperparameters
if "search" in kwargs:
self._sklearn = self.sklearn.best_estimator_
# if rank scaling needs to be done, also train with output from
# classifier as well
if self._scale_ranks:
unscaled_ranks = self.sklearn.decision_function(data)
self._scaler = sklearn_preprocessing.FunctionTransformer(
func=_rank_scaler,
kw_args={"scale": _rank_scaler_scale(unscaled_ranks)},
)
self._scaler.fit(unscaled_ranks.reshape(-1, 1)) # reshape to 2d array
def ranks(self, data):
# decide how to evaluate model based on defined attributes
if self._scale_ranks:
unscaled_ranks = self.sklearn.decision_function(data)
ranks = self._scaler.transform(
unscaled_ranks.reshape(-1, 1)
) # reshape to 2d array
ranks = ranks[:, 0] # pull out the scaled ranks
else:
ranks = self.sklearn.predict_proba(data)
ranks = ranks[:, 1] # only pull out the "class-1" rank
return ranks
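    # A minimal usage sketch (toy, assumed inputs; not executed here): any pipeline
    # whose final step is named "classifier" can back a SklearnModel, e.g.
    #
    #     pipe = sklearn_pipeline.Pipeline([
    #         ("whitener", sklearn_preprocessing.StandardScaler()),
    #         ("classifier", sklearn_linear_model.Perceptron()),
    #     ])
    #     model = SklearnModel(start, end, pipe, channels, downselector, transformer)
    #     model.fit(X, y)          # X: (n_samples, n_features) array, y: 0/1 labels
    #     ranks = model.ranks(X)   # values in (0, 1)
    #
    # Perceptron exposes decision_function but not predict_proba, so _scale_ranks is
    # True and ranks() is scaled through the tanh-based _rank_scaler defined at the
    # bottom of this module.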
class SupervisedSklearnClassifier(SupervisedClassifier):
"""
a base class for supervised sk-learn classifiers, which
contains simple implementations for supervised classifier methods.
Note: not to be used as a standalone classifier.
"""
_flavor = "sklearn_supervised_classifier"
_required_kwargs = ["safe_channels_path", "window", "time", "significance"]
def __init__(self, *args, **kwargs):
SupervisedClassifier.__init__(self, *args, **kwargs)
    def train(self, dataset):
"""
Trains a supervised scikit-learn model to feature data using a labeled
dataset.
"""
verbose = self.kwargs.get("verbose", False)
# sanity check training data
num_glitch, num_clean = dataset.vectors2num()
if num_glitch == 0 or num_clean == 0:
            raise ValueError(
                "training scikit-based classifiers not allowed with zero target "
                "or clean (random) times. this may indicate an issue with data "
                "discovery or not requesting enough time for training."
            )
self.class_weights = class_weight.compute_class_weight(
"balanced", classes=np.array([0, 1]), y=dataset.labels
)
# create and train model
self.model = self._set_up_model(dataset, verbose=verbose)
start_time = timeit.default_timer()
logger.info("generating dataset")
# configure feature selection and load
if not dataset.is_configured():
dataset.configure(self.model.selector)
dataset.load_data(verbose=verbose)
elapsed = timeit.default_timer() - start_time
logger.debug("quiver generation took %.2f sec" % elapsed)
start_time = timeit.default_timer()
logger.info("training model")
self.model.fit(
dataset.features.as_unstructured(), dataset.labels, **self.kwargs
)
elapsed = timeit.default_timer() - start_time
logger.debug("model training took %.2f sec" % elapsed)
logger.debug("model hyperparameters:")
pipeline_steps = [
est.__class__.__name__ for _, est in self.model.hyperparameters["steps"]
]
pipeline_names = [name for name, _ in self.model.hyperparameters["steps"]]
for param, val in self.model.hyperparameters.items():
if param == "steps":
pipeline_str = "[" + ", ".join(pipeline_steps) + "]"
logger.debug(" pipeline: %s" % pipeline_str)
elif param not in (["memory"] + pipeline_names):
logger.debug(" {param}: {val}".format(param=param, val=val))
return self.model
    def evaluate(self, dataset):
"""
Applies a supervised scikit-learn model to feature data. Takes in an unlabeled
dataset and produces ranks for each feature vector within that dataset.
"""
verbose = self.kwargs.get("verbose", False)
if not self.is_trained:
raise exceptions.UntrainedError(
"%s does not have an internal model" % self.flavor
)
start_time = timeit.default_timer()
logger.info("generating dataset")
# configure feature selection and load
if not dataset.is_configured():
dataset.configure(self.model.selector)
dataset.load_data(verbose=verbose)
elapsed = timeit.default_timer() - start_time
logger.debug("quiver generation took %.2f sec" % elapsed)
start_time = timeit.default_timer()
logger.info("evaluating model")
# evaluate dataset
ranks = self.model.ranks(dataset.features.as_unstructured())
dataset.evaluate(ranks, hashes=self.model.hash)
elapsed = timeit.default_timer() - start_time
logger.debug("model evaluation took %.2f sec" % elapsed)
return dataset
    def timeseries(self, info, dataset_factory, dt=DEFAULT_DT, segs=None, set_ok=None):
"""
Generate a time series of predictions based on predicted model probabilities.
"""
# check if model has been trained
if not self.is_trained:
raise exceptions.UntrainedError(
"%s does not have an internal model" % self.flavor
)
if segs is None:
segs = dataset_factory.classifier_data.segs
ranks = []
for seg in segs:
dataset = dataset_factory.unlabeled(dt=dt, segs=segmentlist([seg]))
if len(dataset):
# append ranks if dataset isn't empty
ranks.append(
SeriesInfo.from_ranks(
info,
dataset.times[0],
dt,
self.evaluate(dataset).ranks,
self.model,
self.calibration_map,
set_ok=set_ok,
)
)
return ranks
    def feature_importance(self):
"""
Retrieve feature importances from a classifier if it has such a method.
"""
# check if model has been trained
if not self.is_trained:
raise exceptions.UntrainedError(
"%s does not have an internal model" % self.flavor
)
# check if sklearn classifier either has feature_importances_ or coef_
# properties
        importances = getattr(self.model.sklearn, "feature_importances_", None)
        if importances is None and hasattr(self.model.sklearn, "coef_"):
            if self.model.sklearn.coef_.ndim == 1:
                importances = np.abs(self.model.sklearn.coef_)
            else:
                importances = np.sum(np.abs(self.model.sklearn.coef_), axis=0)
        elif importances is None:
            raise ValueError(
                "feature_importance is not implemented for %s" % self.flavor
            )
return importances
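    # For reference (standard sklearn attribute names, not defined in this module):
    # tree ensembles such as RandomForestClassifier expose feature_importances_,
    # linear models such as LinearSVC expose coef_ with shape (1, n_features) or
    # (n_classes, n_features), which is collapsed above by summing absolute values
    # over rows, and estimators exposing neither (e.g. GaussianNB, MLPClassifier)
    # fall through to the ValueError branch.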
def _create_model(self, dataset, channels):
return SklearnModel(
dataset.start,
dataset.end,
pipeline=self._create_pipeline(**self.kwargs.get("params", {})),
channels=channels,
downselector=features.DownselectLoudest(**self.kwargs),
transformer=features.DeltaTimeTransformer(**self.kwargs),
bounds=self.kwargs["bounds"],
time=self.kwargs.get("time", features.DEFAULT_TIME_NAME),
segs=dataset.segs,
model_id=self._model_id,
)
def _set_up_model(self, dataset, verbose=False):
"""
Sets up a scikit-learn classifier model, using keyword arguments as necessary.
Used internally
"""
# set defaults for missing values if not set
self.kwargs["default"] = self.kwargs.get("default", DEFAULT_COLUMN_VALUE)
self.kwargs["default_delta_time"] = self.kwargs.get(
"default_delta_time", -self.kwargs["window"]
)
# set quantities used for calculating layer sizes
channels = io.path2channels(self.kwargs["safe_channels_path"])
self.num_channels = len(channels)
self.num_columns = len(dataset._dataloader.columns)
self.num_features = self.num_channels * self.num_columns
# create the model
model = self._create_model(dataset, channels)
logger.debug("model parameters:")
pipeline_steps = [
est.__class__.__name__ for _, est in model.sklearn.get_params()["steps"]
]
pipeline_names = [name for name, _ in model.sklearn.get_params()["steps"]]
for param, val in model.sklearn.get_params().items():
if param == "steps":
pipeline_str = "[" + ", ".join(pipeline_steps) + "]"
logger.info("model pipeline: %s" % pipeline_str)
elif param not in (["memory"] + pipeline_names):
logger.debug(" {param}: {val}".format(param=param, val=val))
# use cross-validation if specified
if "search" in self.kwargs and self.kwargs["search"]["type"] == "grid":
# create hyperparameter grid given ranges specified
param_grid = {}
for param, conf in self.kwargs["search"]["params"].items():
lo, hi = conf["range"]
num_samp = conf["num_samples"]
dist = conf["type"]
type_ = "discrete" if conf["discrete"] else "continuous"
param_grid[param] = self._generate_grid(lo, hi, num_samp, dist, type_)
logger.info("grid-based hyperparameter search over:")
for hyperparam, val in param_grid.items():
logger.info(
" {param}: {val}".format(param=hyperparam, val=repr(val))
)
logger.info(
" for %d total searches"
% np.prod([len(val) for val in param_grid.values()])
)
model._sklearn = sklearn_model_selection.GridSearchCV(
model._sklearn,
n_jobs=self.kwargs.get("num_cv_proc", DEFAULT_NUM_CV_PROC),
cv=self.kwargs.get("num_cv_folds", DEFAULT_NUM_CV_FOLDS),
scoring=self.kwargs.get("cv_scoring", DEFAULT_CV_SCORING),
param_grid=param_grid,
refit=True,
verbose=int(verbose),
)
if "search" in self.kwargs and self.kwargs["search"]["type"] == "random":
# create distributions for random sampling
num_samples = self.kwargs["search"].get("num_samples", DEFAULT_NUM_SAMPLES)
param_dist = {}
for param, conf in self.kwargs["search"]["params"].items():
lo, hi = conf["range"]
dist = conf["type"]
type_ = "discrete" if conf["discrete"] else "continuous"
param_dist[param] = self._generate_distribution(lo, hi, dist, type_)
logger.info("random sampling-based hyperparameter search over:")
for hyperparam, val in param_dist.items():
logger.info(
" {param}: {val}".format(param=hyperparam, val=val.dist.name)
)
logger.info(" for %d total searches" % num_samples)
model._sklearn = sklearn_model_selection.RandomizedSearchCV(
model._sklearn,
n_iter=num_samples,
n_jobs=self.kwargs.get("num_cv_proc", DEFAULT_NUM_CV_PROC),
cv=self.kwargs.get("num_cv_folds", DEFAULT_NUM_CV_FOLDS),
scoring=self.kwargs.get("cv_scoring", DEFAULT_CV_SCORING),
random_state=self.kwargs.get("random_state", None),
param_distributions=param_dist,
refit=True,
verbose=int(verbose),
)
return model
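    # An illustrative "search" block as consumed above (keys grounded in this method,
    # parameter names hypothetical):
    #
    #     kwargs["search"] = {
    #         "type": "random",               # or "grid"
    #         "num_samples": 20,              # random search only
    #         "params": {
    #             "classifier__alpha": {
    #                 "range": [1e-5, 1e-1],
    #                 "type": "log_uniform",  # or "uniform"
    #                 "discrete": False,
    #                 "num_samples": 5,       # used by grid search only
    #             },
    #         },
    #     }
    #
    # num_cv_proc, num_cv_folds, cv_scoring and random_state are read from kwargs the
    # same way, falling back to the DEFAULT_* values at the top of the module.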
def _create_pipeline(self, **kwargs):
# build all parts of pipeline needed in the correct order
pipeparts = self.preprocessor()
classifier = self.classifier()
pipeparts.extend(classifier)
# filter kwargs to pass in specific kwargs only
classifier_kwargs = {}
for name, _ in classifier:
estimator_kwargs = {
kwarg: value
for kwarg, value in kwargs.items()
if "%s__" % name in kwarg
}
classifier_kwargs.update(estimator_kwargs)
# build pipeline, set parameters
pipe = sklearn_pipeline.Pipeline(pipeparts)
pipe.set_params(**classifier_kwargs)
# set global random state if one is provided
if "random_state" in kwargs:
random_state_params = {
kwarg: kwargs["random_state"]
for kwarg, _ in pipe.get_params().items()
if "random_state" in kwarg
}
pipe.set_params(**random_state_params)
return pipe
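    # Sketch of how step-specific kwargs are routed (parameter names hypothetical):
    # with the default single step named "classifier",
    #
    #     params = {"classifier__max_iter": 500, "classifier__alpha": 1e-4}
    #
    # passes the '"%s__" % name' filter above and is applied via pipe.set_params(),
    # while unrelated kwargs such as "window" or "time" are dropped. A "random_state"
    # kwarg is broadcast to every pipeline parameter containing "random_state".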
def classifier(self):
return [("classifier", sklearn_linear_model.Perceptron())]
def preprocessor(self):
preprocessor = []
whitener = self.kwargs.get("whitener", DEFAULT_WHITENER)
if whitener == "robust":
preprocessor.append(("whitener", sklearn_preprocessing.RobustScaler()))
elif whitener == "standard":
preprocessor.append(("whitener", sklearn_preprocessing.StandardScaler()))
else:
            raise ValueError(
                "%s is not an available whitener option" % whitener
            )
return preprocessor
def _generate_distribution(self, param_min, param_max, distribution, dist_type):
"""
Used internally to generate a distribution based on hyperparameter
bounds and a distribution suitable for use in randomized search
cross-validation schemes.
"""
if dist_type == "continuous":
if distribution == "log_uniform":
return reciprocal(a=param_min, b=param_max)
elif distribution == "uniform":
return uniform(loc=param_min, scale=(param_max - param_min))
else:
raise NotImplementedError
elif dist_type == "discrete":
if distribution == "uniform":
return randint(param_min, param_max)
else:
raise NotImplementedError
else:
raise ValueError("valid distribution types are 'continuous' or 'discrete'")
def _generate_grid(
self, param_min, param_max, num_samples, distribution, dist_type
):
"""
Used internally to generate a grid based on hyperparameter bounds and a
number of grid points suitable for use in grid search cross-validation
schemes.
"""
if dist_type == "continuous":
if distribution == "log_uniform":
return np.logspace(
np.log10(param_min), np.log10(param_max), num_samples
)
elif distribution == "uniform":
return np.linspace(param_min, param_max, num_samples)
else:
raise NotImplementedError
elif dist_type == "discrete":
if distribution == "uniform":
start = param_min
end = param_max + 1 # endpoint is inclusive in arange
# calculate approximate step size based on num_samples passed in,
# rounds to the nearest integer step size but with a min step size of 1
max_num_samples = min(num_samples, end - start)
step = int(np.round(float(end - start) / max_num_samples))
return np.arange(start, end, step=step, dtype=int)
else:
raise NotImplementedError
else:
raise ValueError("valid distribution types are 'continuous' or 'discrete'")
class IncrementalSupervisedSklearnClassifier(
IncrementalSupervisedClassifier, SupervisedSklearnClassifier
):
"""
a base class for incremental supervised sk-learn classifiers, which contains
simple implementations for incremental supervised classifier methods.
Note: not to be used as a standalone classifier.
"""
_flavor = "sklearn_incremental_supervised_classifier"
_required_kwargs = ["safe_channels_path", "window", "time", "significance"]
    def train(self, dataset):
"""
Trains a supervised scikit-learn model to feature data using a labeled dataset.
"""
# create and train model
firsttime = (
not self.is_trained
) # whether this is the first time this has been called
if firsttime:
self.model = self._set_up_model(dataset)
else:
self.model.start = min(self.model.start, dataset.start)
self.model.end = max(self.model.end, dataset.end)
self.model.segs |= dataset.segs
        data = dataset.features(
            self.model.channels,
            self.model.selector,
            time=self.model.time,
        )
        self.model.sklearn.partial_fit(
            data.as_unstructured(),
            dataset.labels,
            classes=[0.0, 1.0],
        )
return self.model
    def evaluate(self, dataset):
        """
        Applies a supervised sk-learn model to feature data.
        """
        return SupervisedSklearnClassifier.evaluate(self, dataset)
    def timeseries(self, *args, **kwargs):
        """
        Generate a time series of predictions based on predicted model
        probabilities
        """
        return SupervisedSklearnClassifier.timeseries(self, *args, **kwargs)
    def feature_importance(self):
        """
        Retrieve feature importances from a classifier if it has such a method.
        """
        return SupervisedSklearnClassifier.feature_importance(self)
    def _create_pipeline(self, **kwargs):
        return SupervisedSklearnClassifier._create_pipeline(self, **kwargs)
class NaiveBayes(SupervisedSklearnClassifier):
"""
A Naive Bayes classifier based on scikit-learn.
This is a supervised learning algorithm which assumes independence between
all features, and uses Bayes' theorem to determine the posterior probability
    that a set of features is in a given class. In this particular
    implementation, the likelihood of each feature is Gaussian in form.
* `Gaussian Naive Bayes User's Guide
<http://scikit-learn.org/stable/modules/naive_bayes.html#gaussian-naive-bayes>`_.
* `Gaussian Naive Bayes API
<http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html#sklearn.naive_bayes.GaussianNB>`_.
"""
_flavor = "naive_bayes"
    def classifier(self):
        """
        Create a Gaussian Naive Bayes classifier
        """
        return [("classifier", sklearn_naive_bayes.GaussianNB())]
class RandomForest(SupervisedSklearnClassifier):
"""
A Random Forest of Decision Trees based on scikit-learn.
This is a supervised learning algorithm which uses a group of randomized
decision trees (a forest) to perform classification.
* `Random Forest User's Guide
<http://scikit-learn.org/stable/modules/ensemble.html#forest>`_.
* `Random Forest API
<http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier>`_.
"""
_flavor = "random_forest"
    def classifier(self):
"""
Create a random forest classifier
"""
return [("classifier", sklearn_ensemble.RandomForestClassifier())]
class SupportVectorMachine(SupervisedSklearnClassifier):
"""
A support vector machine based on scikit-learn.
This is a supervised learning algorithm which uses a hyperplane to separate
    data points into two distinct classes. It also allows for kernel-based
    learning, so that if samples cannot be separated by a hyperplane in the
    original feature space, they are transformed via a kernel into a
    higher-dimensional space where a linear separation is possible.
    Various kernels are supported and can be selected by passing the kernel
    kwarg in the classifier configuration section.
    NOTE: The scikit-learn classifier SVC is used to perform classification.
    The probability option is set to True so that the mapping from rank to a
    calibrated probability can be performed more easily.
    * `SVM User's Guide <http://scikit-learn.org/stable/modules/svm.html>`_.
* `SVM API
<http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC>`_.
"""
_flavor = "support_vector_machine"
    def classifier(self):
"""
Creates a support vector machine classifier
"""
return [("classifier", sklearn_svm.SVC(probability=True))]
class GradientBoostedTree(SupervisedSklearnClassifier):
"""
A Gradient Tree Boosting algorithm based on scikit-learn.
This is a supervised learning algorithm which produces an ensemble of
decision trees, builds them up in a stage-wise fashion, and allows use of
arbitrary differentiable loss functions.
* `GBT User's Guide
<http://scikit-learn.org/stable/modules/ensemble.html#gradient-boosting>`_.
* `GBT API
<http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html#sklearn.ensemble.GradientBoostingClassifier>`_.
"""
_flavor = "gradient_boosted_tree"
    def classifier(self):
"""
Creates a gradient boosted tree classifier.
"""
return [("classifier", sklearn_ensemble.GradientBoostingClassifier())]
class NeuralNetwork(SupervisedSklearnClassifier):
"""
    A neural network (multi-layer perceptron) algorithm based on scikit-learn.
This is a supervised learning algorithm which produces a shallow neural
network of multiple layers with a choice of activation function for the
hidden layers. It trains itself using backpropagation.
* `MultiLayer Perceptron User's Guide
<http://scikit-learn.org/stable/modules/neural_networks_supervised.html#multi-layer-perceptron>`_.
* `MultiLayer Perceptron API
<http://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier>`_.
"""
_flavor = "neural_network"
    def classifier(self):
"""
Creates a multilayer perceptron classifier.
"""
return [("classifier", sklearn_neural_network.MLPClassifier())]
class ApproximateKernelSGD(SupervisedSklearnClassifier):
"""
A Stochastic Gradient Descent classifier based on scikit-learn, with a
choice of an approximate kernel to transform nonlinear features into linear
    features suitable for the SGD classifier.
Guide for using the Stochastic Gradient Descent classifier:
* `SGD User's Guide
<http://scikit-learn.org/stable/modules/sgd.html#stochastic-gradient-descent>`_.
* `SGD API
<http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html#sklearn-linear-model-sgdclassifier>`_.
Guide for the approximate kernel algorithm (using the Nystroem method),
types of kernels and appropriate parameters:
* `Kernel Approximation User's Guide
<http://scikit-learn.org/stable/modules/kernel_approximation.html#kernel-approximation>`_.
* `Kernel Approximation API
<http://scikit-learn.org/stable/modules/generated/sklearn.kernel_approximation.Nystroem.html#sklearn.kernel_approximation.Nystroem>`_.
"""
_flavor = "approximate_kernel_SGD"
    def classifier(self):
"""
Creates an approximate kernel SGD classifier.
"""
kernel = ("kernel", sklearn_kernel_approximation.Nystroem())
classifier = ("classifier", sklearn_linear_model.SGDClassifier())
return [kernel, classifier]
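    # Since this pipeline has two named steps, both "kernel__*" and "classifier__*"
    # kwargs survive the filter in _create_pipeline, e.g. (hypothetical values):
    #
    #     params = {
    #         "kernel__kernel": "rbf",
    #         "kernel__n_components": 100,
    #         "classifier__alpha": 1e-4,
    #     }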
class ApproximateKernelSVM(SupervisedSklearnClassifier):
"""
A linear SVM based on scikit-learn, with a choice of an approximate kernel
to transform nonlinear features into linear features suitable for the SVM
classifier.
Guide for using the linear SVM classifier:
* `SVM User's Guide
<http://scikit-learn.org/stable/modules/svm.html#support-vector-machines>`_.
* `Linear SVM API
<http://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC>`_.
Guide for the approximate kernel algorithm (using the Nystroem method),
types of kernels and appropriate parameters:
* `Kernel Approximation User's Guide
<http://scikit-learn.org/stable/modules/kernel_approximation.html#kernel-approximation>`_.
* `Kernel Approximation API
<http://scikit-learn.org/stable/modules/generated/sklearn.kernel_approximation.Nystroem.html#sklearn.kernel_approximation.Nystroem>`_.
"""
_flavor = "approximate_kernel_SVM"
    def classifier(self):
        """
        Creates an approximate kernel SVM classifier.
"""
kernel = ("kernel", sklearn_kernel_approximation.Nystroem())
classifier = ("classifier", sklearn_svm.LinearSVC())
return [kernel, classifier]
class IncrementalNaiveBayes(IncrementalSupervisedSklearnClassifier):
"""
A Naive Bayes classifier based on scikit-learn. Trains in an incremental
fashion.
This is a supervised learning algorithm which assumes independence between
all features, and uses Bayes' theorem to determine the posterior probability
    that a set of features is in a given class. In this particular
    implementation, the likelihood of each feature is Gaussian in form.
* `Gaussian Naive Bayes User's Guide
<http://scikit-learn.org/stable/modules/naive_bayes.html#gaussian-naive-bayes>`_.
* `Gaussian Naive Bayes API
<http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html#sklearn.naive_bayes.GaussianNB>`_.
"""
_flavor = "incremental_naive_bayes"
    def classifier(self):
"""
Creates a Gaussian Naive Bayes classifier.
"""
return [("classifier", sklearn_naive_bayes.GaussianNB())]
class PassiveAggressive(IncrementalSupervisedSklearnClassifier):
"""
A Passive-Aggressive classifier based on scikit-learn. Trains in an
incremental fashion.
    Based on `Online Passive-Aggressive Algorithms
    <http://jmlr.csail.mit.edu/papers/volume7/crammer06a/crammer06a.pdf>`_.
* `Passive-Aggressive User's Guide
<http://scikit-learn.org/stable/modules/linear_model.html#passive-aggressive>`_.
* `Passive-Aggressive API
<http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.PassiveAggressiveClassifier.html#sklearn-linear-model-passiveaggressiveclassifier>`_.
"""
_flavor = "passive_aggressive"
    def classifier(self):
"""
Creates a Passive-Aggressive classifier.
"""
return [("classifier", sklearn_linear_model.PassiveAggressive())]
class IncrementalNeuralNetwork(IncrementalSupervisedSklearnClassifier):
"""
    A Multilayer Perceptron (neural network) algorithm based on scikit-learn.
Trains in an incremental fashion.
This is a supervised learning algorithm which produces a shallow neural
network of multiple layers with a choice of activation function for the
hidden layers. It trains itself using backpropagation.
* `MultiLayer Perceptron User's Guide
<http://scikit-learn.org/stable/modules/neural_networks_supervised.html#multi-layer-perceptron>`_.
* `MultiLayer Perceptron API
<http://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier>`_.
"""
_flavor = "incremental_multilayer_perceptron"
    def classifier(self):
"""
Creates a multilayer perceptron classifier.
"""
return [("classifier", sklearn_neural_network.MLPClassifier())]
class IncrementalApproximateKernelSGD(IncrementalSupervisedSklearnClassifier):
"""
A Stochastic Gradient Descent classifier based on scikit-learn, with a
choice of an approximate kernel to transform nonlinear features into linear
    features suitable for the SGD classifier. Trains in an incremental fashion.
Guide for using the Stochastic Gradient Descent classifier:
* `SGD User's Guide
<http://scikit-learn.org/stable/modules/sgd.html#stochastic-gradient-descent>`_.
* `SGD API
<http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html#sklearn-linear-model-sgdclassifier>`_.
Guide for the approximate kernel algorithm (using the Nystroem method),
types of kernels and appropriate parameters:
* `Kernel Approximation User's Guide
<http://scikit-learn.org/stable/modules/kernel_approximation.html#kernel-approximation>`_.
* `Kernel Approximation API
<http://scikit-learn.org/stable/modules/generated/sklearn.kernel_approximation.Nystroem.html#sklearn.kernel_approximation.Nystroem>`_.
"""
_flavor = "incremental_approximate_kernel_SGD"
    def classifier(self):
        """
        Creates an approximate kernel SGD classifier.
"""
kernel = ("kernel", sklearn_kernel_approximation.Nystroem())
classifier = ("classifier", sklearn_linear_model.SGDClassifier())
return [kernel, classifier]
# -------------------------------------------
# utilities for rank scaling
def _rank_scaler(ranks, scale=1):
return 0.5 * (np.tanh(ranks / scale) + 1)
def _rank_scaler_scale(ranks, max_train_rank_scale=DEFAULT_MAX_TRAIN_RANK_SCALE):
return np.max(np.abs(ranks)) / np.arctanh(2 * max_train_rank_scale - 1)
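# A quick illustration of the scaling above: with
# scale = max|rank| / arctanh(2 * 0.99 - 1), the largest-magnitude training rank maps
# to 0.99, zero maps to 0.5, and all outputs lie in (0, 1), e.g.
#
#     unscaled = np.array([-3.0, 0.0, 3.0])
#     scale = _rank_scaler_scale(unscaled)      # 3.0 / arctanh(0.98)
#     _rank_scaler(unscaled, scale=scale)       # approximately [0.01, 0.5, 0.99]
#
# so decision_function outputs become comparable to predict_proba class-1 ranks.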
@hookimpl
def get_classifiers():
return {
"sklearn:naive_bayes": NaiveBayes,
"sklearn:random_forest": RandomForest,
"sklearn:svm": SupportVectorMachine,
"sklearn:gradient_boosted_tree": GradientBoostedTree,
"sklearn:neural_network": NeuralNetwork,
"sklearn:approx_kernel_sgd": ApproximateKernelSGD,
}
@hookimpl
def get_incremental_classifiers():
return {
"sklearn:inc_naive_bayes": IncrementalNaiveBayes,
"sklearn:passive_aggressive": PassiveAggressive,
"sklearn:inc_neural_network": IncrementalNeuralNetwork,
"sklearn:inc_approx_kernel_sgd": IncrementalApproximateKernelSGD,
}
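# Illustrative lookup against the registries above (the plugin-manager call that
# collects these hooks lives elsewhere and is assumed here):
#
#     classifiers = get_classifiers()
#     forest = classifiers["sklearn:random_forest"](...)   # -> RandomForest instance
#
# incremental classifiers are looked up the same way via get_incremental_classifiers().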