import logging
import timeit
import warnings
import numpy as np
from scipy.stats import randint, reciprocal, uniform
from ligo.segments import segmentlist
from sklearn import linear_model as sklearn_linear_model
from sklearn import naive_bayes as sklearn_naive_bayes
from sklearn import svm as sklearn_svm
from sklearn import neural_network as sklearn_neural_network
from sklearn import pipeline as sklearn_pipeline
from sklearn import model_selection as sklearn_model_selection
from sklearn import preprocessing as sklearn_preprocessing
from sklearn import kernel_approximation as sklearn_kernel_approximation
from sklearn.utils import class_weight
# catch weight boosting deprecation warning
with warnings.catch_warnings():
warnings.simplefilter("ignore", DeprecationWarning)
from sklearn import ensemble as sklearn_ensemble
from .. import exceptions
from .. import hookimpl
from .. import features
from .. import io
from ..series import SeriesInfo
from . import ClassifierModel, SupervisedClassifier, IncrementalSupervisedClassifier
from . import DEFAULT_DT
logger = logging.getLogger("idq")
DEFAULT_NUM_CV_PROC = 1 # sets number of parallel jobs for CV
DEFAULT_CV_SCORING = "roc_auc" # sets scoring metric for CV
DEFAULT_NUM_CV_FOLDS = 5 # sets number of folds used for CV
DEFAULT_NUM_SAMPLES = 10 # sets number of parameters to sample for randomized CV
DEFAULT_MAX_TRAIN_RANK_SCALE = (
0.99 # sets max value of ranks for scaling found in training
)
DEFAULT_COLUMN_VALUE = 0
DEFAULT_WHITENER = "standard"
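# The DEFAULT_* values above are fallbacks for per-classifier keyword arguments.
# An illustrative (hypothetical) configuration snippet, not defined in this module,
# might override them like:
#
#     kwargs = {
#         "num_cv_proc": 4,         # parallel jobs for cross validation
#         "num_cv_folds": 3,        # folds used by GridSearchCV/RandomizedSearchCV
#         "cv_scoring": "roc_auc",  # any sklearn scoring string
#         "whitener": "robust",     # "standard" (default) or "robust"
#     }
#
# these keys are read via self.kwargs.get(...) in SupervisedSklearnClassifier below.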
# -------------------------------------------------
# classifier implementations
class SklearnModel(ClassifierModel):
"""a base model class for all sklearn classifiers"""
def __init__(
self,
start,
end,
pipeline,
channels,
downselector,
transformer,
time=features.DEFAULT_TIME_NAME,
bounds=None,
segs=None,
model_id=None,
generate_id=False,
):
ClassifierModel.__init__(
self, start, end, segs=segs, model_id=model_id, generate_id=generate_id
)
# ensure the classifier returns probabilistic estimates or
# some mechanism for returning a continuous range
if not (
hasattr(pipeline.named_steps["classifier"], "predict_proba")
or hasattr(pipeline.named_steps["classifier"], "decision_function")
):
            raise AttributeError(
                "%s has no predict_proba or decision_function attribute"
                % self.flavor
            )
self._scale_ranks = not hasattr(
pipeline.named_steps["classifier"], "predict_proba"
)
self._sklearn = pipeline
# set up how features are extracted and transformed
if not bounds:
bounds = {}
self._selector = features.Selector(
channels=channels,
time=time,
bounds=bounds,
downselector=downselector,
transformer=transformer,
)
@property
def sklearn(self):
return self._sklearn
@property
def channels(self):
return self._selector.channels
@property
def time(self):
return self._selector.time
@property
def selector(self):
return self._selector
@property
def hyperparameters(self):
return self._sklearn.get_params()
def fit(self, data, labels, **kwargs):
self.sklearn.fit(data, labels)
# if using CV, only grab estimator with best hyperparameters
if "search" in kwargs:
self._sklearn = self.sklearn.best_estimator_
# if rank scaling needs to be done, also train with output from
# classifier as well
if self._scale_ranks:
unscaled_ranks = self.sklearn.decision_function(data)
self._scaler = sklearn_preprocessing.FunctionTransformer(
func=_rank_scaler,
kw_args={"scale": _rank_scaler_scale(unscaled_ranks)},
)
self._scaler.fit(unscaled_ranks.reshape(-1, 1)) # reshape to 2d array
def ranks(self, data):
# decide how to evaluate model based on defined attributes
if self._scale_ranks:
unscaled_ranks = self.sklearn.decision_function(data)
ranks = self._scaler.transform(
unscaled_ranks.reshape(-1, 1)
) # reshape to 2d array
ranks = ranks[:, 0] # pull out the scaled ranks
else:
ranks = self.sklearn.predict_proba(data)
ranks = ranks[:, 1] # only pull out the "class-1" rank
return ranks
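    # A minimal usage sketch (toy, assumed inputs; not executed here): any pipeline
    # whose final step is named "classifier" can back a SklearnModel, e.g.
    #
    #     pipe = sklearn_pipeline.Pipeline([
    #         ("whitener", sklearn_preprocessing.StandardScaler()),
    #         ("classifier", sklearn_linear_model.Perceptron()),
    #     ])
    #     model = SklearnModel(start, end, pipe, channels, downselector, transformer)
    #     model.fit(X, y)          # X: (n_samples, n_features) array, y: 0/1 labels
    #     ranks = model.ranks(X)   # values in (0, 1)
    #
    # Perceptron exposes decision_function but not predict_proba, so _scale_ranks is
    # True and ranks() is scaled through the tanh-based _rank_scaler defined at the
    # bottom of this module.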
class SupervisedSklearnClassifier(SupervisedClassifier):
"""
a base class for supervised sk-learn classifiers, which
contains simple implementations for supervised classifier methods.
Note: not to be used as a standalone classifier.
"""
_flavor = "sklearn_supervised_classifier"
_required_kwargs = ["safe_channels_path", "window", "time", "significance"]
def __init__(self, *args, **kwargs):
SupervisedClassifier.__init__(self, *args, **kwargs)
    def train(self, dataset):
"""
Trains a supervised scikit-learn model to feature data using a labeled
dataset.
"""
verbose = self.kwargs.get("verbose", False)
# sanity check training data
num_glitch, num_clean = dataset.vectors2num()
if num_glitch == 0 or num_clean == 0:
            raise ValueError(
                "training scikit-based classifiers not allowed with zero target "
                "or clean (random) times. this may indicate an issue with data "
                "discovery or not requesting enough time for training."
            )
self.class_weights = class_weight.compute_class_weight(
"balanced", classes=np.array([0, 1]), y=dataset.labels
)
# create and train model
self.model = self._set_up_model(dataset, verbose=verbose)
start_time = timeit.default_timer()
logger.info("generating dataset")
# configure feature selection and load
if not dataset.is_configured():
dataset.configure(self.model.selector)
dataset.load_data(verbose=verbose)
elapsed = timeit.default_timer() - start_time
logger.debug("quiver generation took %.2f sec" % elapsed)
start_time = timeit.default_timer()
logger.info("training model")
self.model.fit(
dataset.features.as_unstructured(), dataset.labels, **self.kwargs
)
elapsed = timeit.default_timer() - start_time
logger.debug("model training took %.2f sec" % elapsed)
logger.debug("model hyperparameters:")
pipeline_steps = [
est.__class__.__name__ for _, est in self.model.hyperparameters["steps"]
]
pipeline_names = [name for name, _ in self.model.hyperparameters["steps"]]
for param, val in self.model.hyperparameters.items():
if param == "steps":
pipeline_str = "[" + ", ".join(pipeline_steps) + "]"
logger.debug(" pipeline: %s" % pipeline_str)
elif param not in (["memory"] + pipeline_names):
logger.debug(" {param}: {val}".format(param=param, val=val))
return self.model
    def evaluate(self, dataset):
"""
Applies a supervised scikit-learn model to feature data. Takes in an unlabeled
dataset and produces ranks for each feature vector within that dataset.
"""
verbose = self.kwargs.get("verbose", False)
if not self.is_trained:
raise exceptions.UntrainedError(
"%s does not have an internal model" % self.flavor
)
start_time = timeit.default_timer()
logger.info("generating dataset")
# configure feature selection and load
if not dataset.is_configured():
dataset.configure(self.model.selector)
dataset.load_data(verbose=verbose)
elapsed = timeit.default_timer() - start_time
logger.debug("quiver generation took %.2f sec" % elapsed)
start_time = timeit.default_timer()
logger.info("evaluating model")
# evaluate dataset
ranks = self.model.ranks(dataset.features.as_unstructured())
dataset.evaluate(ranks, hashes=self.model.hash)
elapsed = timeit.default_timer() - start_time
logger.debug("model evaluation took %.2f sec" % elapsed)
return dataset
    def timeseries(self, info, dataset_factory, dt=DEFAULT_DT, segs=None, set_ok=None):
"""
Generate a time series of predictions based on predicted model probabilities.
"""
# check if model has been trained
if not self.is_trained:
raise exceptions.UntrainedError(
"%s does not have an internal model" % self.flavor
)
if segs is None:
segs = dataset_factory.classifier_data.segs
ranks = []
for seg in segs:
dataset = dataset_factory.unlabeled(dt=dt, segs=segmentlist([seg]))
if len(dataset):
# append ranks if dataset isn't empty
ranks.append(
SeriesInfo.from_ranks(
info,
dataset.times[0],
dt,
self.evaluate(dataset).ranks,
self.model,
self.calibration_map,
set_ok=set_ok,
)
)
return ranks
    def feature_importance(self):
"""
Retrieve feature importances from a classifier if it has such a method.
"""
# check if model has been trained
if not self.is_trained:
raise exceptions.UntrainedError(
"%s does not have an internal model" % self.flavor
)
# check if sklearn classifier either has feature_importances_ or coef_
# properties
        importances = getattr(self.model.sklearn, "feature_importances_", None)
        if importances is None and hasattr(self.model.sklearn, "coef_"):
            if self.model.sklearn.coef_.ndim == 1:
                importances = np.abs(self.model.sklearn.coef_)
            else:
                importances = np.sum(np.abs(self.model.sklearn.coef_), axis=0)
        elif importances is None:
            raise ValueError(
                "feature_importance is not implemented for %s" % self.flavor
            )
return importances
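    # For reference (standard sklearn attribute names, not defined in this module):
    # tree ensembles such as RandomForestClassifier expose feature_importances_,
    # linear models such as LinearSVC expose coef_ with shape (1, n_features) or
    # (n_classes, n_features), which is collapsed above by summing absolute values
    # over rows, and estimators exposing neither (e.g. GaussianNB, MLPClassifier)
    # fall through to the ValueError branch.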
def _create_model(self, dataset, channels):
return SklearnModel(
dataset.start,
dataset.end,
pipeline=self._create_pipeline(**self.kwargs.get("params", {})),
channels=channels,
downselector=features.DownselectLoudest(**self.kwargs),
transformer=features.DeltaTimeTransformer(**self.kwargs),
bounds=self.kwargs["bounds"],
time=self.kwargs.get("time", features.DEFAULT_TIME_NAME),
segs=dataset.segs,
model_id=self._model_id,
)
def _set_up_model(self, dataset, verbose=False):
"""
Sets up a scikit-learn classifier model, using keyword arguments as necessary.
Used internally
"""
# set defaults for missing values if not set
self.kwargs["default"] = self.kwargs.get("default", DEFAULT_COLUMN_VALUE)
self.kwargs["default_delta_time"] = self.kwargs.get(
"default_delta_time", -self.kwargs["window"]
)
# set quantities used for calculating layer sizes
channels = io.path2channels(self.kwargs["safe_channels_path"])
self.num_channels = len(channels)
self.num_columns = len(dataset._dataloader.columns)
self.num_features = self.num_channels * self.num_columns
# create the model
model = self._create_model(dataset, channels)
logger.debug("model parameters:")
pipeline_steps = [
est.__class__.__name__ for _, est in model.sklearn.get_params()["steps"]
]
pipeline_names = [name for name, _ in model.sklearn.get_params()["steps"]]
for param, val in model.sklearn.get_params().items():
if param == "steps":
pipeline_str = "[" + ", ".join(pipeline_steps) + "]"
logger.info("model pipeline: %s" % pipeline_str)
elif param not in (["memory"] + pipeline_names):
logger.debug(" {param}: {val}".format(param=param, val=val))
# use cross-validation if specified
if "search" in self.kwargs and self.kwargs["search"]["type"] == "grid":
# create hyperparameter grid given ranges specified
param_grid = {}
for param, conf in self.kwargs["search"]["params"].items():
lo, hi = conf["range"]
num_samp = conf["num_samples"]
dist = conf["type"]
type_ = "discrete" if conf["discrete"] else "continuous"
param_grid[param] = self._generate_grid(lo, hi, num_samp, dist, type_)
logger.info("grid-based hyperparameter search over:")
for hyperparam, val in param_grid.items():
logger.info(
" {param}: {val}".format(param=hyperparam, val=repr(val))
)
logger.info(
" for %d total searches"
% np.prod([len(val) for val in param_grid.values()])
)
model._sklearn = sklearn_model_selection.GridSearchCV(
model._sklearn,
n_jobs=self.kwargs.get("num_cv_proc", DEFAULT_NUM_CV_PROC),
cv=self.kwargs.get("num_cv_folds", DEFAULT_NUM_CV_FOLDS),
scoring=self.kwargs.get("cv_scoring", DEFAULT_CV_SCORING),
param_grid=param_grid,
refit=True,
verbose=int(verbose),
)
if "search" in self.kwargs and self.kwargs["search"]["type"] == "random":
# create distributions for random sampling
num_samples = self.kwargs["search"].get("num_samples", DEFAULT_NUM_SAMPLES)
param_dist = {}
for param, conf in self.kwargs["search"]["params"].items():
lo, hi = conf["range"]
dist = conf["type"]
type_ = "discrete" if conf["discrete"] else "continuous"
param_dist[param] = self._generate_distribution(lo, hi, dist, type_)
logger.info("random sampling-based hyperparameter search over:")
for hyperparam, val in param_dist.items():
logger.info(
" {param}: {val}".format(param=hyperparam, val=val.dist.name)
)
logger.info(" for %d total searches" % num_samples)
model._sklearn = sklearn_model_selection.RandomizedSearchCV(
model._sklearn,
n_iter=num_samples,
n_jobs=self.kwargs.get("num_cv_proc", DEFAULT_NUM_CV_PROC),
cv=self.kwargs.get("num_cv_folds", DEFAULT_NUM_CV_FOLDS),
scoring=self.kwargs.get("cv_scoring", DEFAULT_CV_SCORING),
random_state=self.kwargs.get("random_state", None),
param_distributions=param_dist,
refit=True,
verbose=int(verbose),
)
return model
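    # An illustrative "search" block as consumed above (keys grounded in this method,
    # parameter names hypothetical):
    #
    #     kwargs["search"] = {
    #         "type": "random",               # or "grid"
    #         "num_samples": 20,              # random search only
    #         "params": {
    #             "classifier__alpha": {
    #                 "range": [1e-5, 1e-1],
    #                 "type": "log_uniform",  # or "uniform"
    #                 "discrete": False,
    #                 "num_samples": 5,       # used by grid search only
    #             },
    #         },
    #     }
    #
    # num_cv_proc, num_cv_folds, cv_scoring and random_state are read from kwargs the
    # same way, falling back to the DEFAULT_* values at the top of the module.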
def _create_pipeline(self, **kwargs):
# build all parts of pipeline needed in the correct order
pipeparts = self.preprocessor()
classifier = self.classifier()
pipeparts.extend(classifier)
# filter kwargs to pass in specific kwargs only
classifier_kwargs = {}
for name, _ in classifier:
estimator_kwargs = {
kwarg: value
for kwarg, value in kwargs.items()
if "%s__" % name in kwarg
}
classifier_kwargs.update(estimator_kwargs)
# build pipeline, set parameters
pipe = sklearn_pipeline.Pipeline(pipeparts)
pipe.set_params(**classifier_kwargs)
# set global random state if one is provided
if "random_state" in kwargs:
random_state_params = {
kwarg: kwargs["random_state"]
for kwarg, _ in pipe.get_params().items()
if "random_state" in kwarg
}
pipe.set_params(**random_state_params)
return pipe
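    # Sketch of how step-specific kwargs are routed (parameter names hypothetical):
    # with the default single step named "classifier",
    #
    #     params = {"classifier__max_iter": 500, "classifier__alpha": 1e-4}
    #
    # passes the '"%s__" % name' filter above and is applied via pipe.set_params(),
    # while unrelated kwargs such as "window" or "time" are dropped. A "random_state"
    # kwarg is broadcast to every pipeline parameter containing "random_state".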
def classifier(self):
return [("classifier", sklearn_linear_model.Perceptron())]
def preprocessor(self):
preprocessor = []
whitener = self.kwargs.get("whitener", DEFAULT_WHITENER)
if whitener == "robust":
preprocessor.append(("whitener", sklearn_preprocessing.RobustScaler()))
elif whitener == "standard":
preprocessor.append(("whitener", sklearn_preprocessing.StandardScaler()))
else:
            raise ValueError(
                "%s is not an available whitener option" % whitener
            )
return preprocessor
def _generate_distribution(self, param_min, param_max, distribution, dist_type):
"""
Used internally to generate a distribution based on hyperparameter
bounds and a distribution suitable for use in randomized search
cross-validation schemes.
"""
if dist_type == "continuous":
if distribution == "log_uniform":
return reciprocal(a=param_min, b=param_max)
elif distribution == "uniform":
return uniform(loc=param_min, scale=(param_max - param_min))
else:
raise NotImplementedError
elif dist_type == "discrete":
if distribution == "uniform":
return randint(param_min, param_max)
else:
raise NotImplementedError
else:
raise ValueError("valid distribution types are 'continuous' or 'discrete'")
def _generate_grid(
self, param_min, param_max, num_samples, distribution, dist_type
):
"""
Used internally to generate a grid based on hyperparameter bounds and a
number of grid points suitable for use in grid search cross-validation
schemes.
"""
if dist_type == "continuous":
if distribution == "log_uniform":
return np.logspace(
np.log10(param_min), np.log10(param_max), num_samples
)
elif distribution == "uniform":
return np.linspace(param_min, param_max, num_samples)
else:
raise NotImplementedError
elif dist_type == "discrete":
if distribution == "uniform":
start = param_min
end = param_max + 1 # endpoint is inclusive in arange
# calculate approximate step size based on num_samples passed in,
# rounds to the nearest integer step size but with a min step size of 1
max_num_samples = min(num_samples, end - start)
step = int(np.round(float(end - start) / max_num_samples))
return np.arange(start, end, step=step, dtype=int)
else:
raise NotImplementedError
else:
raise ValueError("valid distribution types are 'continuous' or 'discrete'")
class IncrementalSupervisedSklearnClassifier(
IncrementalSupervisedClassifier, SupervisedSklearnClassifier
):
"""
a base class for incremental supervised sk-learn classifiers, which contains
simple implementations for incremental supervised classifier methods.
Note: not to be used as a standalone classifier.
"""
_flavor = "sklearn_incremental_supervised_classifier"
_required_kwargs = ["safe_channels_path", "window", "time", "significance"]
    def train(self, dataset):
"""
Trains a supervised scikit-learn model to feature data using a labeled dataset.
"""
# create and train model
firsttime = (
not self.is_trained
) # whether this is the first time this has been called
if firsttime:
self.model = self._set_up_model(dataset)
else:
self.model.start = min(self.model.start, dataset.start)
self.model.end = max(self.model.end, dataset.end)
self.model.segs |= dataset.segs
        data = dataset.features(
            self.model.channels,
            self.model.selector,
            time=self.model.time,
        )
        self.model.sklearn.partial_fit(
            data.as_unstructured(),
            dataset.labels,
            classes=[0.0, 1.0],
        )
return self.model
    def evaluate(self, dataset):
        """
        Applies a supervised sk-learn model to feature data.
        """
        return SupervisedSklearnClassifier.evaluate(self, dataset)
    def timeseries(self, *args, **kwargs):
        """
        Generate a time series of predictions based on predicted model
        probabilities
        """
        return SupervisedSklearnClassifier.timeseries(self, *args, **kwargs)
    def feature_importance(self):
        """
        Retrieve feature importances from a classifier if it has such a method.
        """
        return SupervisedSklearnClassifier.feature_importance(self)
    def _create_pipeline(self, **kwargs):
        return SupervisedSklearnClassifier._create_pipeline(self, **kwargs)
class NaiveBayes(SupervisedSklearnClassifier):
"""
A Naive Bayes classifier based on scikit-learn.
This is a supervised learning algorithm which assumes independence between
all features, and uses Bayes' theorem to determine the posterior probability
    that a set of features is in a given class. In this particular
    implementation, the likelihood of each feature is Gaussian in form.
* `Gaussian Naive Bayes User's Guide
<http://scikit-learn.org/stable/modules/naive_bayes.html#gaussian-naive-bayes>`_.
* `Gaussian Naive Bayes API
<http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html#sklearn.naive_bayes.GaussianNB>`_.
"""
_flavor = "naive_bayes"
    def classifier(self):
        """
        Create a Gaussian Naive Bayes classifier
        """
        return [("classifier", sklearn_naive_bayes.GaussianNB())]
class RandomForest(SupervisedSklearnClassifier):
"""
A Random Forest of Decision Trees based on scikit-learn.
This is a supervised learning algorithm which uses a group of randomized
decision trees (a forest) to perform classification.
* `Random Forest User's Guide
<http://scikit-learn.org/stable/modules/ensemble.html#forest>`_.
* `Random Forest API
<http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier>`_.
"""
_flavor = "random_forest"
    def classifier(self):
"""
Create a random forest classifier
"""
return [("classifier", sklearn_ensemble.RandomForestClassifier())]
class SupportVectorMachine(SupervisedSklearnClassifier):
"""
A support vector machine based on scikit-learn.
This is a supervised learning algorithm which uses a hyperplane to separate
    data points into two distinct classes. It also allows for kernel-based
    learning, so that if samples cannot be separated by a hyperplane in the
    original feature space, they are transformed via a kernel into a
    higher-dimensional space where a linear separation is possible.
    Various kernels are supported and can be selected by passing the kernel
    kwarg in the classifier configuration section.
    NOTE: The scikit-learn classifier SVC is used to perform classification.
    The probability option is set to True so that the mapping from rank to a
    calibrated probability can be performed more easily.
    * `SVM User's Guide <http://scikit-learn.org/stable/modules/svm.html>`_.
* `SVM API
<http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC>`_.
"""
_flavor = "support_vector_machine"
    def classifier(self):
"""
Creates a support vector machine classifier
"""
return [("classifier", sklearn_svm.SVC(probability=True))]
class GradientBoostedTree(SupervisedSklearnClassifier):
"""
A Gradient Tree Boosting algorithm based on scikit-learn.
This is a supervised learning algorithm which produces an ensemble of
decision trees, builds them up in a stage-wise fashion, and allows use of
arbitrary differentiable loss functions.
* `GBT User's Guide
<http://scikit-learn.org/stable/modules/ensemble.html#gradient-boosting>`_.
* `GBT API
<http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html#sklearn.ensemble.GradientBoostingClassifier>`_.
"""
_flavor = "gradient_boosted_tree"
    def classifier(self):
"""
Creates a gradient boosted tree classifier.
"""
return [("classifier", sklearn_ensemble.GradientBoostingClassifier())]
class NeuralNetwork(SupervisedSklearnClassifier):
"""
    A neural network (multi-layer perceptron) algorithm based on scikit-learn.
This is a supervised learning algorithm which produces a shallow neural
network of multiple layers with a choice of activation function for the
hidden layers. It trains itself using backpropagation.
* `MultiLayer Perceptron User's Guide
<http://scikit-learn.org/stable/modules/neural_networks_supervised.html#multi-layer-perceptron>`_.
* `MultiLayer Perceptron API
<http://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier>`_.
"""
_flavor = "neural_network"
    def classifier(self):
"""
Creates a multilayer perceptron classifier.
"""
return [("classifier", sklearn_neural_network.MLPClassifier())]
class ApproximateKernelSGD(SupervisedSklearnClassifier):
"""
A Stochastic Gradient Descent classifier based on scikit-learn, with a
choice of an approximate kernel to transform nonlinear features into linear
    features suitable for the SGD classifier.
Guide for using the Stochastic Gradient Descent classifier:
* `SGD User's Guide
<http://scikit-learn.org/stable/modules/sgd.html#stochastic-gradient-descent>`_.
* `SGD API
<http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html#sklearn-linear-model-sgdclassifier>`_.
Guide for the approximate kernel algorithm (using the Nystroem method),
types of kernels and appropriate parameters:
* `Kernel Approximation User's Guide
<http://scikit-learn.org/stable/modules/kernel_approximation.html#kernel-approximation>`_.
* `Kernel Approximation API
<http://scikit-learn.org/stable/modules/generated/sklearn.kernel_approximation.Nystroem.html#sklearn.kernel_approximation.Nystroem>`_.
"""
_flavor = "approximate_kernel_SGD"
    def classifier(self):
"""
Creates an approximate kernel SGD classifier.
"""
kernel = ("kernel", sklearn_kernel_approximation.Nystroem())
classifier = ("classifier", sklearn_linear_model.SGDClassifier())
return [kernel, classifier]
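    # Since this pipeline has two named steps, both "kernel__*" and "classifier__*"
    # kwargs survive the filter in _create_pipeline, e.g. (hypothetical values):
    #
    #     params = {
    #         "kernel__kernel": "rbf",
    #         "kernel__n_components": 100,
    #         "classifier__alpha": 1e-4,
    #     }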
class ApproximateKernelSVM(SupervisedSklearnClassifier):
"""
A linear SVM based on scikit-learn, with a choice of an approximate kernel
to transform nonlinear features into linear features suitable for the SVM
classifier.
Guide for using the linear SVM classifier:
* `SVM User's Guide
<http://scikit-learn.org/stable/modules/svm.html#support-vector-machines>`_.
* `Linear SVM API
<http://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC>`_.
Guide for the approximate kernel algorithm (using the Nystroem method),
types of kernels and appropriate parameters:
* `Kernel Approximation User's Guide
<http://scikit-learn.org/stable/modules/kernel_approximation.html#kernel-approximation>`_.
* `Kernel Approximation API
<http://scikit-learn.org/stable/modules/generated/sklearn.kernel_approximation.Nystroem.html#sklearn.kernel_approximation.Nystroem>`_.
"""
_flavor = "approximate_kernel_SVM"
    def classifier(self):
        """
        Creates an approximate kernel SVM classifier.
"""
kernel = ("kernel", sklearn_kernel_approximation.Nystroem())
classifier = ("classifier", sklearn_svm.LinearSVC())
return [kernel, classifier]
class IncrementalNaiveBayes(IncrementalSupervisedSklearnClassifier):
"""
A Naive Bayes classifier based on scikit-learn. Trains in an incremental
fashion.
This is a supervised learning algorithm which assumes independence between
all features, and uses Bayes' theorem to determine the posterior probability
    that a set of features is in a given class. In this particular
    implementation, the likelihood of each feature is Gaussian in form.
* `Gaussian Naive Bayes User's Guide
<http://scikit-learn.org/stable/modules/naive_bayes.html#gaussian-naive-bayes>`_.
* `Gaussian Naive Bayes API
<http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html#sklearn.naive_bayes.GaussianNB>`_.
"""
_flavor = "incremental_naive_bayes"
    def classifier(self):
"""
Creates a Gaussian Naive Bayes classifier.
"""
return [("classifier", sklearn_naive_bayes.GaussianNB())]
class PassiveAggressive(IncrementalSupervisedSklearnClassifier):
"""
A Passive-Aggressive classifier based on scikit-learn. Trains in an
incremental fashion.
    Based on `Online Passive-Aggressive Algorithms
    <http://jmlr.csail.mit.edu/papers/volume7/crammer06a/crammer06a.pdf>`_.
* `Passive-Aggressive User's Guide
<http://scikit-learn.org/stable/modules/linear_model.html#passive-aggressive>`_.
* `Passive-Aggressive API
<http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.PassiveAggressiveClassifier.html#sklearn-linear-model-passiveaggressiveclassifier>`_.
"""
_flavor = "passive_aggressive"
    def classifier(self):
"""
Creates a Passive-Aggressive classifier.
"""
return [("classifier", sklearn_linear_model.PassiveAggressive())]
class IncrementalNeuralNetwork(IncrementalSupervisedSklearnClassifier):
"""
    A Multilayer Perceptron (neural network) algorithm based on scikit-learn.
Trains in an incremental fashion.
This is a supervised learning algorithm which produces a shallow neural
network of multiple layers with a choice of activation function for the
hidden layers. It trains itself using backpropagation.
* `MultiLayer Perceptron User's Guide
<http://scikit-learn.org/stable/modules/neural_networks_supervised.html#multi-layer-perceptron>`_.
* `MultiLayer Perceptron API
<http://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier>`_.
"""
_flavor = "incremental_multilayer_perceptron"
    def classifier(self):
"""
Creates a multilayer perceptron classifier.
"""
return [("classifier", sklearn_neural_network.MLPClassifier())]
class IncrementalApproximateKernelSGD(IncrementalSupervisedSklearnClassifier):
"""
A Stochastic Gradient Descent classifier based on scikit-learn, with a
choice of an approximate kernel to transform nonlinear features into linear
    features suitable for the SGD classifier. Trains in an incremental fashion.
Guide for using the Stochastic Gradient Descent classifier:
* `SGD User's Guide
<http://scikit-learn.org/stable/modules/sgd.html#stochastic-gradient-descent>`_.
* `SGD API
<http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html#sklearn-linear-model-sgdclassifier>`_.
Guide for the approximate kernel algorithm (using the Nystroem method),
types of kernels and appropriate parameters:
* `Kernel Approximation User's Guide
<http://scikit-learn.org/stable/modules/kernel_approximation.html#kernel-approximation>`_.
* `Kernel Approximation API
<http://scikit-learn.org/stable/modules/generated/sklearn.kernel_approximation.Nystroem.html#sklearn.kernel_approximation.Nystroem>`_.
"""
_flavor = "incremental_approximate_kernel_SGD"
    def classifier(self):
        """
        Creates an approximate kernel SGD classifier.
"""
kernel = ("kernel", sklearn_kernel_approximation.Nystroem())
classifier = ("classifier", sklearn_linear_model.SGDClassifier())
return [kernel, classifier]
# -------------------------------------------
# utilities for rank scaling
def _rank_scaler(ranks, scale=1):
return 0.5 * (np.tanh(ranks / scale) + 1)
def _rank_scaler_scale(ranks, max_train_rank_scale=DEFAULT_MAX_TRAIN_RANK_SCALE):
return np.max(np.abs(ranks)) / np.arctanh(2 * max_train_rank_scale - 1)
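# A quick illustration of the scaling above: with
# scale = max|rank| / arctanh(2 * 0.99 - 1), the largest-magnitude training rank maps
# to 0.99, zero maps to 0.5, and all outputs lie in (0, 1), e.g.
#
#     unscaled = np.array([-3.0, 0.0, 3.0])
#     scale = _rank_scaler_scale(unscaled)      # 3.0 / arctanh(0.98)
#     _rank_scaler(unscaled, scale=scale)       # approximately [0.01, 0.5, 0.99]
#
# so decision_function outputs become comparable to predict_proba class-1 ranks.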
@hookimpl
def get_classifiers():
return {
"sklearn:naive_bayes": NaiveBayes,
"sklearn:random_forest": RandomForest,
"sklearn:svm": SupportVectorMachine,
"sklearn:gradient_boosted_tree": GradientBoostedTree,
"sklearn:neural_network": NeuralNetwork,
"sklearn:approx_kernel_sgd": ApproximateKernelSGD,
}
@hookimpl
def get_incremental_classifiers():
return {
"sklearn:inc_naive_bayes": IncrementalNaiveBayes,
"sklearn:passive_aggressive": PassiveAggressive,
"sklearn:inc_neural_network": IncrementalNeuralNetwork,
"sklearn:inc_approx_kernel_sgd": IncrementalApproximateKernelSGD,
}
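# Illustrative lookup against the registries above (the plugin-manager call that
# collects these hooks lives elsewhere and is assumed here):
#
#     classifiers = get_classifiers()
#     forest = classifiers["sklearn:random_forest"](...)   # -> RandomForest instance
#
# incremental classifiers are looked up the same way via get_incremental_classifiers().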