Source code for idq.classifiers.sklearn

import logging
import timeit
import warnings

import numpy as np
from scipy.stats import randint, reciprocal, uniform

from ligo.segments import segmentlist

from sklearn import linear_model as sklearn_linear_model
from sklearn import naive_bayes as sklearn_naive_bayes
from sklearn import svm as sklearn_svm
from sklearn import neural_network as sklearn_neural_network
from sklearn import pipeline as sklearn_pipeline
from sklearn import model_selection as sklearn_model_selection
from sklearn import preprocessing as sklearn_preprocessing
from sklearn import kernel_approximation as sklearn_kernel_approximation
from sklearn.utils import class_weight

# catch weight boosting deprecation warning
with warnings.catch_warnings():
    warnings.simplefilter("ignore", DeprecationWarning)
    from sklearn import ensemble as sklearn_ensemble

from .. import exceptions
from .. import hookimpl
from .. import features
from .. import io
from ..series import SeriesInfo

from . import ClassifierModel, SupervisedClassifier, IncrementalSupervisedClassifier
from . import DEFAULT_DT


logger = logging.getLogger("idq")


DEFAULT_NUM_CV_PROC = 1  # sets number of parallel jobs for CV
DEFAULT_CV_SCORING = "roc_auc"  # sets scoring metric for CV
DEFAULT_NUM_CV_FOLDS = 5  # sets number of folds used for CV
DEFAULT_NUM_SAMPLES = 10  # sets number of parameters to sample for randomized CV
DEFAULT_MAX_TRAIN_RANK_SCALE = (
    0.99  # sets max value of ranks for scaling found in training
)
DEFAULT_COLUMN_VALUE = 0
DEFAULT_WHITENER = "standard"

# -------------------------------------------------
# classifier implementations


class SklearnModel(ClassifierModel):
    """a base model class for all sklearn classifiers"""

    def __init__(
        self,
        start,
        end,
        pipeline,
        channels,
        downselector,
        transformer,
        time=features.DEFAULT_TIME_NAME,
        bounds=None,
        segs=None,
        model_id=None,
        generate_id=False,
    ):
        ClassifierModel.__init__(
            self, start, end, segs=segs, model_id=model_id, generate_id=generate_id
        )

        # ensure the classifier returns probabilistic estimates or
        # some mechanism for returning a continuous range
        if not (
            hasattr(pipeline.named_steps["classifier"], "predict_proba")
            or hasattr(pipeline.named_steps["classifier"], "decision_function")
        ):
            raise AttributeError(
                "%s has no predict_proba or decision_function attribute" % self.flavor
            )
        self._scale_ranks = not hasattr(
            pipeline.named_steps["classifier"], "predict_proba"
        )
        self._sklearn = pipeline

        # set up how features are extracted and transformed
        if not bounds:
            bounds = {}
        self._selector = features.Selector(
            channels=channels,
            time=time,
            bounds=bounds,
            downselector=downselector,
            transformer=transformer,
        )

    @property
    def sklearn(self):
        return self._sklearn

    @property
    def channels(self):
        return self._selector.channels

    @property
    def time(self):
        return self._selector.time

    @property
    def selector(self):
        return self._selector

    @property
    def hyperparameters(self):
        return self._sklearn.get_params()

    def fit(self, data, labels, **kwargs):
        self.sklearn.fit(data, labels)

        # if using CV, only grab estimator with best hyperparameters
        if "search" in kwargs:
            self._sklearn = self.sklearn.best_estimator_

        # if rank scaling needs to be done, also train with output from
        # classifier as well
        if self._scale_ranks:
            unscaled_ranks = self.sklearn.decision_function(data)
            self._scaler = sklearn_preprocessing.FunctionTransformer(
                func=_rank_scaler,
                kw_args={"scale": _rank_scaler_scale(unscaled_ranks)},
            )
            self._scaler.fit(unscaled_ranks.reshape(-1, 1))  # reshape to 2d array

    def ranks(self, data):
        # decide how to evaluate model based on defined attributes
        if self._scale_ranks:
            unscaled_ranks = self.sklearn.decision_function(data)
            ranks = self._scaler.transform(
                unscaled_ranks.reshape(-1, 1)
            )  # reshape to 2d array
            ranks = ranks[:, 0]  # pull out the scaled ranks
        else:
            ranks = self.sklearn.predict_proba(data)
            ranks = ranks[:, 1]  # only pull out the "class-1" rank
        return ranks
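

# Illustrative sketch (not part of the original module): SklearnModel expects a
# sklearn Pipeline whose final step is named "classifier" and exposes either
# predict_proba or decision_function. A pipeline ending in LinearSVC (which only
# provides decision_function) passes the check above, and _scale_ranks is then
# True, so decision_function outputs get mapped into (0, 1) via _rank_scaler
# (defined at the bottom of this module). Commented-out example using the
# module-level imports:
#
#   pipe = sklearn_pipeline.Pipeline([
#       ("whitener", sklearn_preprocessing.StandardScaler()),
#       ("classifier", sklearn_svm.LinearSVC()),  # decision_function only
#   ])
#   hasattr(pipe.named_steps["classifier"], "predict_proba")      # False
#   hasattr(pipe.named_steps["classifier"], "decision_function")  # True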


class SupervisedSklearnClassifier(SupervisedClassifier):
    """
    a base class for supervised sk-learn classifiers, which contains simple
    implementations for supervised classifier methods.
    Note: not to be used as a standalone classifier.
    """

    _flavor = "sklearn_supervised_classifier"
    _required_kwargs = ["safe_channels_path", "window", "time", "significance"]

    def __init__(self, *args, **kwargs):
        SupervisedClassifier.__init__(self, *args, **kwargs)

    def train(self, dataset):
        """
        Trains a supervised scikit-learn model to feature data using a
        labeled dataset.
        """
        verbose = self.kwargs.get("verbose", False)

        # sanity check training data
        num_glitch, num_clean = dataset.vectors2num()
        if num_glitch == 0 or num_clean == 0:
            raise ValueError(
                "training scikit-based classifiers not allowed with zero target "
                "or clean (random) times. this may indicate an issue with data "
                "discovery or not requesting enough time for training."
            )

        self.class_weights = class_weight.compute_class_weight(
            "balanced", classes=np.array([0, 1]), y=dataset.labels
        )

        # create and train model
        self.model = self._set_up_model(dataset, verbose=verbose)

        start_time = timeit.default_timer()
        logger.info("generating dataset")

        # configure feature selection and load
        if not dataset.is_configured():
            dataset.configure(self.model.selector)
        dataset.load_data(verbose=verbose)

        elapsed = timeit.default_timer() - start_time
        logger.debug("quiver generation took %.2f sec" % elapsed)

        start_time = timeit.default_timer()
        logger.info("training model")
        self.model.fit(
            dataset.features.as_unstructured(), dataset.labels, **self.kwargs
        )

        elapsed = timeit.default_timer() - start_time
        logger.debug("model training took %.2f sec" % elapsed)

        logger.debug("model hyperparameters:")
        pipeline_steps = [
            est.__class__.__name__ for _, est in self.model.hyperparameters["steps"]
        ]
        pipeline_names = [name for name, _ in self.model.hyperparameters["steps"]]
        for param, val in self.model.hyperparameters.items():
            if param == "steps":
                pipeline_str = "[" + ", ".join(pipeline_steps) + "]"
                logger.debug(" pipeline: %s" % pipeline_str)
            elif param not in (["memory"] + pipeline_names):
                logger.debug(" {param}: {val}".format(param=param, val=val))

        return self.model

    def evaluate(self, dataset):
        """
        Applies a supervised scikit-learn model to feature data. Takes in an
        unlabeled dataset and produces ranks for each feature vector within
        that dataset.
        """
        verbose = self.kwargs.get("verbose", False)

        if not self.is_trained:
            raise exceptions.UntrainedError(
                "%s does not have an internal model" % self.flavor
            )

        start_time = timeit.default_timer()
        logger.info("generating dataset")

        # configure feature selection and load
        if not dataset.is_configured():
            dataset.configure(self.model.selector)
        dataset.load_data(verbose=verbose)

        elapsed = timeit.default_timer() - start_time
        logger.debug("quiver generation took %.2f sec" % elapsed)

        start_time = timeit.default_timer()
        logger.info("evaluating model")

        # evaluate dataset
        ranks = self.model.ranks(dataset.features.as_unstructured())
        dataset.evaluate(ranks, hashes=self.model.hash)

        elapsed = timeit.default_timer() - start_time
        logger.debug("model evaluation took %.2f sec" % elapsed)

        return dataset

    def timeseries(self, info, dataset_factory, dt=DEFAULT_DT, segs=None, set_ok=None):
        """
        Generate a time series of predictions based on predicted model
        probabilities.
        """
        # check if model has been trained
        if not self.is_trained:
            raise exceptions.UntrainedError(
                "%s does not have an internal model" % self.flavor
            )

        if segs is None:
            segs = dataset_factory.classifier_data.segs

        ranks = []
        for seg in segs:
            dataset = dataset_factory.unlabeled(dt=dt, segs=segmentlist([seg]))
            if len(dataset):  # append ranks if dataset isn't empty
                ranks.append(
                    SeriesInfo.from_ranks(
                        info,
                        dataset.times[0],
                        dt,
                        self.evaluate(dataset).ranks,
                        self.model,
                        self.calibration_map,
                        set_ok=set_ok,
                    )
                )

        return ranks

    def feature_importance(self):
        """
        Retrieve feature importances from a classifier if it has such a
        method.
        """
        # check if model has been trained
        if not self.is_trained:
            raise exceptions.UntrainedError(
                "%s does not have an internal model" % self.flavor
            )

        # check if sklearn classifier either has feature_importances_ or coef_
        # properties
        importances = getattr(self.model.sklearn, "feature_importances_", None)
        if importances is None and hasattr(self.model.sklearn, "coef_"):
            if self.model.sklearn.coef_.ndim == 1:
                importances = np.abs(self.model.sklearn.coef_)
            else:
                importances = np.sum(np.abs(self.model.sklearn.coef_), axis=0)
        elif importances is None:
            raise ValueError(
                "feature_importance is not implemented for %s" % self.flavor
            )

        return importances

    def _create_model(self, dataset, channels):
        return SklearnModel(
            dataset.start,
            dataset.end,
            pipeline=self._create_pipeline(**self.kwargs.get("params", {})),
            channels=channels,
            downselector=features.DownselectLoudest(**self.kwargs),
            transformer=features.DeltaTimeTransformer(**self.kwargs),
            bounds=self.kwargs["bounds"],
            time=self.kwargs.get("time", features.DEFAULT_TIME_NAME),
            segs=dataset.segs,
            model_id=self._model_id,
        )

    def _set_up_model(self, dataset, verbose=False):
        """
        Sets up a scikit-learn classifier model, using keyword arguments as
        necessary. Used internally.
        """
        # set defaults for missing values if not set
        self.kwargs["default"] = self.kwargs.get("default", DEFAULT_COLUMN_VALUE)
        self.kwargs["default_delta_time"] = self.kwargs.get(
            "default_delta_time", -self.kwargs["window"]
        )

        # set quantities used for calculating layer sizes
        channels = io.path2channels(self.kwargs["safe_channels_path"])
        self.num_channels = len(channels)
        self.num_columns = len(dataset._dataloader.columns)
        self.num_features = self.num_channels * self.num_columns

        # create the model
        model = self._create_model(dataset, channels)

        logger.debug("model parameters:")
        pipeline_steps = [
            est.__class__.__name__ for _, est in model.sklearn.get_params()["steps"]
        ]
        pipeline_names = [name for name, _ in model.sklearn.get_params()["steps"]]
        for param, val in model.sklearn.get_params().items():
            if param == "steps":
                pipeline_str = "[" + ", ".join(pipeline_steps) + "]"
                logger.info("model pipeline: %s" % pipeline_str)
            elif param not in (["memory"] + pipeline_names):
                logger.debug(" {param}: {val}".format(param=param, val=val))

        # use cross-validation if specified
        if "search" in self.kwargs and self.kwargs["search"]["type"] == "grid":
            # create hyperparameter grid given ranges specified
            param_grid = {}
            for param, conf in self.kwargs["search"]["params"].items():
                lo, hi = conf["range"]
                num_samp = conf["num_samples"]
                dist = conf["type"]
                type_ = "discrete" if conf["discrete"] else "continuous"
                param_grid[param] = self._generate_grid(lo, hi, num_samp, dist, type_)

            logger.info("grid-based hyperparameter search over:")
            for hyperparam, val in param_grid.items():
                logger.info(
                    " {param}: {val}".format(param=hyperparam, val=repr(val))
                )
            logger.info(
                " for %d total searches"
                % np.prod([len(val) for val in param_grid.values()])
            )

            model._sklearn = sklearn_model_selection.GridSearchCV(
                model._sklearn,
                n_jobs=self.kwargs.get("num_cv_proc", DEFAULT_NUM_CV_PROC),
                cv=self.kwargs.get("num_cv_folds", DEFAULT_NUM_CV_FOLDS),
                scoring=self.kwargs.get("cv_scoring", DEFAULT_CV_SCORING),
                param_grid=param_grid,
                refit=True,
                verbose=int(verbose),
            )

        if "search" in self.kwargs and self.kwargs["search"]["type"] == "random":
            # create distributions for random sampling
            num_samples = self.kwargs["search"].get("num_samples", DEFAULT_NUM_SAMPLES)
            param_dist = {}
            for param, conf in self.kwargs["search"]["params"].items():
                lo, hi = conf["range"]
                dist = conf["type"]
                type_ = "discrete" if conf["discrete"] else "continuous"
                param_dist[param] = self._generate_distribution(lo, hi, dist, type_)

            logger.info("random sampling-based hyperparameter search over:")
            for hyperparam, val in param_dist.items():
                logger.info(
                    " {param}: {val}".format(param=hyperparam, val=val.dist.name)
                )
            logger.info(" for %d total searches" % num_samples)

            model._sklearn = sklearn_model_selection.RandomizedSearchCV(
                model._sklearn,
                n_iter=num_samples,
                n_jobs=self.kwargs.get("num_cv_proc", DEFAULT_NUM_CV_PROC),
                cv=self.kwargs.get("num_cv_folds", DEFAULT_NUM_CV_FOLDS),
                scoring=self.kwargs.get("cv_scoring", DEFAULT_CV_SCORING),
                random_state=self.kwargs.get("random_state", None),
                param_distributions=param_dist,
                refit=True,
                verbose=int(verbose),
            )

        return model

    def _create_pipeline(self, **kwargs):
        # build all parts of pipeline needed in the correct order
        pipeparts = self.preprocessor()
        classifier = self.classifier()
        pipeparts.extend(classifier)

        # filter kwargs to pass in specific kwargs only
        classifier_kwargs = {}
        for name, _ in classifier:
            estimator_kwargs = {
                kwarg: value
                for kwarg, value in kwargs.items()
                if "%s__" % name in kwarg
            }
            classifier_kwargs.update(estimator_kwargs)

        # build pipeline, set parameters
        pipe = sklearn_pipeline.Pipeline(pipeparts)
        pipe.set_params(**classifier_kwargs)

        # set global random state if one is provided
        if "random_state" in kwargs:
            random_state_params = {
                kwarg: kwargs["random_state"]
                for kwarg, _ in pipe.get_params().items()
                if "random_state" in kwarg
            }
            pipe.set_params(**random_state_params)

        return pipe

    def classifier(self):
        return [("classifier", sklearn_linear_model.Perceptron())]

    def preprocessor(self):
        preprocessor = []
        whitener = self.kwargs.get("whitener", DEFAULT_WHITENER)
        if whitener == "robust":
            preprocessor.append(("whitener", sklearn_preprocessing.RobustScaler()))
        elif whitener == "standard":
            preprocessor.append(("whitener", sklearn_preprocessing.StandardScaler()))
        else:
            raise ValueError(
                "%s is not an available whitener option" % self.kwargs["whitener"]
            )
        return preprocessor

    def _generate_distribution(self, param_min, param_max, distribution, dist_type):
        """
        Used internally to generate a distribution based on hyperparameter
        bounds and a distribution suitable for use in randomized search
        cross-validation schemes.
        """
        if dist_type == "continuous":
            if distribution == "log_uniform":
                return reciprocal(a=param_min, b=param_max)
            elif distribution == "uniform":
                return uniform(loc=param_min, scale=(param_max - param_min))
            else:
                raise NotImplementedError
        elif dist_type == "discrete":
            if distribution == "uniform":
                return randint(param_min, param_max)
            else:
                raise NotImplementedError
        else:
            raise ValueError("valid distribution types are 'continuous' or 'discrete'")

    def _generate_grid(
        self, param_min, param_max, num_samples, distribution, dist_type
    ):
        """
        Used internally to generate a grid based on hyperparameter bounds and
        a number of grid points suitable for use in grid search
        cross-validation schemes.
        """
        if dist_type == "continuous":
            if distribution == "log_uniform":
                return np.logspace(
                    np.log10(param_min), np.log10(param_max), num_samples
                )
            elif distribution == "uniform":
                return np.linspace(param_min, param_max, num_samples)
            else:
                raise NotImplementedError
        elif dist_type == "discrete":
            if distribution == "uniform":
                start = param_min
                end = param_max + 1  # endpoint is inclusive in arange
                # calculate approximate step size based on num_samples passed in,
                # rounds to the nearest integer step size but with a min step size of 1
                max_num_samples = min(num_samples, end - start)
                step = int(np.round(float(end - start) / max_num_samples))
                return np.arange(start, end, step=step, dtype=int)
            else:
                raise NotImplementedError
        else:
            raise ValueError("valid distribution types are 'continuous' or 'discrete'")
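

# Illustrative sketch (not part of the original module): the "search" kwarg
# consumed by _set_up_model above is a nested mapping. The parameter names and
# values below are hypothetical, but the keys ("type", "params", "range",
# "num_samples", "discrete") follow what the code reads out. Grid search needs
# "num_samples" per parameter; random search uses a single top-level
# "num_samples" for the number of sampled configurations, with "num_cv_proc",
# "num_cv_folds", and "cv_scoring" read as sibling top-level kwargs. Parameter
# names are pipeline-style keys (e.g. "classifier__C") so they address steps
# of the underlying Pipeline.
#
#   search = {
#       "type": "random",              # or "grid"
#       "num_samples": 20,             # random search only
#       "params": {
#           "classifier__C": {
#               "range": [1e-3, 1e3],
#               "type": "log_uniform",  # or "uniform"
#               "discrete": False,
#               "num_samples": 7,       # grid search only
#           },
#       },
#   }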


class IncrementalSupervisedSklearnClassifier(
    IncrementalSupervisedClassifier, SupervisedSklearnClassifier
):
    """
    a base class for incremental supervised sk-learn classifiers, which
    contains simple implementations for incremental supervised classifier
    methods.
    Note: not to be used as a standalone classifier.
    """

    _flavor = "sklearn_incremental_supervised_classifier"
    _required_kwargs = ["safe_channels_path", "window", "time", "significance"]

    def train(self, dataset):
        """
        Trains a supervised scikit-learn model to feature data using a
        labeled dataset.
        """
        # create and train model
        firsttime = (
            not self.is_trained
        )  # whether this is the first time this has been called
        if firsttime:
            self.model = self._set_up_model(dataset)
        else:
            self.model.start = min(self.model.start, dataset.start)
            self.model.end = max(self.model.end, dataset.end)
            self.model.segs |= dataset.segs

        data = dataset.features(
            self.model.channels,
            self.model.select,
            time=self.model.time,
        )
        self.model.sklearn.partial_fit(
            data.to_unstructured(),
            dataset.labels,
            classes=[0.0, 1.0],
        )

        return self.model

    def evaluate(self, dataset):
        """
        Applies a supervised sk-learn model to feature data.
        """
        return SupervisedSklearnClassifier.evaluate(self, dataset)

    def timeseries(self, *args, **kwargs):
        """
        Generate a time series of predictions based on predicted model
        probabilities.
        """
        return SupervisedSklearnClassifier.timeseries(self, *args, **kwargs)

    def feature_importance(self):
        """
        Estimates the likelihood ratio of each feature over the training data
        set bootstrap/importance sample to estimate the average importance.
        """
        return SupervisedSklearnClassifier.feature_importance(self)

    def _create_pipeline(self, **kwargs):
        return SupervisedSklearnClassifier._create_pipeline(self, **kwargs)


class NaiveBayes(SupervisedSklearnClassifier):
    """
    A Naive Bayes classifier based on scikit-learn.

    This is a supervised learning algorithm which assumes independence
    between all features, and uses Bayes' theorem to determine the posterior
    probability that a set of features is in a given class. In this
    particular implementation, the likelihood of each feature is Gaussian in
    form.

    * `Gaussian Naive Bayes User's Guide
      <http://scikit-learn.org/stable/modules/naive_bayes.html#gaussian-naive-bayes>`_.
    * `Gaussian Naive Bayes API
      <http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html#sklearn.naive_bayes.GaussianNB>`_.
    """

    _flavor = "naive_bayes"

    def classifier(self):
        """
        Create a Gaussian Naive Bayes classifier.
        """
        return [("classifier", sklearn_naive_bayes.GaussianNB())]


class RandomForest(SupervisedSklearnClassifier):
    """
    A Random Forest of Decision Trees based on scikit-learn.

    This is a supervised learning algorithm which uses a group of randomized
    decision trees (a forest) to perform classification.

    * `Random Forest User's Guide
      <http://scikit-learn.org/stable/modules/ensemble.html#forest>`_.
    * `Random Forest API
      <http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier>`_.
    """

    _flavor = "random_forest"

    def classifier(self):
        """
        Create a random forest classifier.
        """
        return [("classifier", sklearn_ensemble.RandomForestClassifier())]


class SupportVectorMachine(SupervisedSklearnClassifier):
    """
    A support vector machine based on scikit-learn.

    This is a supervised learning algorithm which uses a hyperplane to
    separate data points into two distinct classes. It also allows for
    kernel-based learning, so that if samples aren't appropriate to be
    separated by a hyperplane, samples get transformed via a kernel to a
    higher-dimensional space where they can be separated in a linear fashion.
    Various kernels are supported and can be chosen by passing the kernel
    kwarg in the classifier configuration section.

    NOTE: The scikit-learn classifier, SVC, is used to perform
    classification. Probability is set to True so that the mapping from rank
    to a calibrated probability can be performed more easily.

    * `SVM User's Guide
      <http://scikit-learn.org/stable/modules/svm.html>`_.
    * `SVM API
      <http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC>`_.
    """

    _flavor = "support_vector_machine"

    def classifier(self):
        """
        Creates a support vector machine classifier.
        """
        return [("classifier", sklearn_svm.SVC(probability=True))]


class GradientBoostedTree(SupervisedSklearnClassifier):
    """
    A Gradient Tree Boosting algorithm based on scikit-learn.

    This is a supervised learning algorithm which produces an ensemble of
    decision trees, builds them up in a stage-wise fashion, and allows use of
    arbitrary differentiable loss functions.

    * `GBT User's Guide
      <http://scikit-learn.org/stable/modules/ensemble.html#gradient-boosting>`_.
    * `GBT API
      <http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html#sklearn.ensemble.GradientBoostingClassifier>`_.
    """

    _flavor = "gradient_boosted_tree"

    def classifier(self):
        """
        Creates a gradient boosted tree classifier.
        """
        return [("classifier", sklearn_ensemble.GradientBoostingClassifier())]


class NeuralNetwork(SupervisedSklearnClassifier):
    """
    A neural network (multi-layer perceptron) algorithm based on
    scikit-learn.

    This is a supervised learning algorithm which produces a shallow neural
    network of multiple layers with a choice of activation function for the
    hidden layers. It trains itself using backpropagation.

    * `MultiLayer Perceptron User's Guide
      <http://scikit-learn.org/stable/modules/neural_networks_supervised.html#multi-layer-perceptron>`_.
    * `MultiLayer Perceptron API
      <http://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier>`_.
    """

    _flavor = "neural_network"

    def classifier(self):
        """
        Creates a multilayer perceptron classifier.
        """
        return [("classifier", sklearn_neural_network.MLPClassifier())]


class ApproximateKernelSGD(SupervisedSklearnClassifier):
    """
    A Stochastic Gradient Descent classifier based on scikit-learn, with a
    choice of an approximate kernel to transform nonlinear features into
    linear features suitable for the SGD classifier.

    Guide for using the Stochastic Gradient Descent classifier:

    * `SGD User's Guide
      <http://scikit-learn.org/stable/modules/sgd.html#stochastic-gradient-descent>`_.
    * `SGD API
      <http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html#sklearn-linear-model-sgdclassifier>`_.

    Guide for the approximate kernel algorithm (using the Nystroem method),
    types of kernels and appropriate parameters:

    * `Kernel Approximation User's Guide
      <http://scikit-learn.org/stable/modules/kernel_approximation.html#kernel-approximation>`_.
    * `Kernel Approximation API
      <http://scikit-learn.org/stable/modules/generated/sklearn.kernel_approximation.Nystroem.html#sklearn.kernel_approximation.Nystroem>`_.
    """

    _flavor = "approximate_kernel_SGD"

    def classifier(self):
        """
        Creates an approximate kernel SGD classifier.
        """
        kernel = ("kernel", sklearn_kernel_approximation.Nystroem())
        classifier = ("classifier", sklearn_linear_model.SGDClassifier())
        return [kernel, classifier]


class ApproximateKernelSVM(SupervisedSklearnClassifier):
    """
    A linear SVM based on scikit-learn, with a choice of an approximate
    kernel to transform nonlinear features into linear features suitable for
    the SVM classifier.

    Guide for using the linear SVM classifier:

    * `SVM User's Guide
      <http://scikit-learn.org/stable/modules/svm.html#support-vector-machines>`_.
    * `Linear SVM API
      <http://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC>`_.

    Guide for the approximate kernel algorithm (using the Nystroem method),
    types of kernels and appropriate parameters:

    * `Kernel Approximation User's Guide
      <http://scikit-learn.org/stable/modules/kernel_approximation.html#kernel-approximation>`_.
    * `Kernel Approximation API
      <http://scikit-learn.org/stable/modules/generated/sklearn.kernel_approximation.Nystroem.html#sklearn.kernel_approximation.Nystroem>`_.
    """

    _flavor = "approximate_kernel_SVM"

    def classifier(self):
        """
        Creates an approximate kernel SVM classifier.
        """
        kernel = ("kernel", sklearn_kernel_approximation.Nystroem())
        classifier = ("classifier", sklearn_svm.LinearSVC())
        return [kernel, classifier]


class IncrementalNaiveBayes(IncrementalSupervisedSklearnClassifier):
    """
    A Naive Bayes classifier based on scikit-learn. Trains in an incremental
    fashion.

    This is a supervised learning algorithm which assumes independence
    between all features, and uses Bayes' theorem to determine the posterior
    probability that a set of features is in a given class. In this
    particular implementation, the likelihood of each feature is Gaussian in
    form.

    * `Gaussian Naive Bayes User's Guide
      <http://scikit-learn.org/stable/modules/naive_bayes.html#gaussian-naive-bayes>`_.
    * `Gaussian Naive Bayes API
      <http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html#sklearn.naive_bayes.GaussianNB>`_.
    """

    _flavor = "incremental_naive_bayes"

    def classifier(self):
        """
        Creates a Gaussian Naive Bayes classifier.
        """
        return [("classifier", sklearn_naive_bayes.GaussianNB())]


class PassiveAggressive(IncrementalSupervisedSklearnClassifier):
    """
    A Passive-Aggressive classifier based on scikit-learn. Trains in an
    incremental fashion.

    Based off of `Online Passive-Aggressive Algorithms
    <http://jmlr.csail.mit.edu/papers/volume7/crammer06a/crammer06a.pdf>`_.

    * `Passive-Aggressive User's Guide
      <http://scikit-learn.org/stable/modules/linear_model.html#passive-aggressive>`_.
    * `Passive-Aggressive API
      <http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.PassiveAggressiveClassifier.html#sklearn-linear-model-passiveaggressiveclassifier>`_.
    """

    _flavor = "passive_aggressive"

    def classifier(self):
        """
        Creates a Passive-Aggressive classifier.
        """
        return [("classifier", sklearn_linear_model.PassiveAggressiveClassifier())]


class IncrementalNeuralNetwork(IncrementalSupervisedSklearnClassifier):
    """
    A Multilayer Perceptron (neural network) algorithm based on scikit-learn.
    Trains in an incremental fashion.

    This is a supervised learning algorithm which produces a shallow neural
    network of multiple layers with a choice of activation function for the
    hidden layers. It trains itself using backpropagation.

    * `MultiLayer Perceptron User's Guide
      <http://scikit-learn.org/stable/modules/neural_networks_supervised.html#multi-layer-perceptron>`_.
    * `MultiLayer Perceptron API
      <http://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html#sklearn.neural_network.MLPClassifier>`_.
    """

    _flavor = "incremental_multilayer_perceptron"

    def classifier(self):
        """
        Creates a multilayer perceptron classifier.
        """
        return [("classifier", sklearn_neural_network.MLPClassifier())]


class IncrementalApproximateKernelSGD(IncrementalSupervisedSklearnClassifier):
    """
    A Stochastic Gradient Descent classifier based on scikit-learn, with a
    choice of an approximate kernel to transform nonlinear features into
    linear features suitable for the SGD classifier. Trains in an incremental
    fashion.

    Guide for using the Stochastic Gradient Descent classifier:

    * `SGD User's Guide
      <http://scikit-learn.org/stable/modules/sgd.html#stochastic-gradient-descent>`_.
    * `SGD API
      <http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html#sklearn-linear-model-sgdclassifier>`_.

    Guide for the approximate kernel algorithm (using the Nystroem method),
    types of kernels and appropriate parameters:

    * `Kernel Approximation User's Guide
      <http://scikit-learn.org/stable/modules/kernel_approximation.html#kernel-approximation>`_.
    * `Kernel Approximation API
      <http://scikit-learn.org/stable/modules/generated/sklearn.kernel_approximation.Nystroem.html#sklearn.kernel_approximation.Nystroem>`_.
    """

    _flavor = "incremental_approximate_kernel_SGD"

    def classifier(self):
        """
        Creates an approximate kernel SGD classifier.
        """
        kernel = ("kernel", sklearn_kernel_approximation.Nystroem())
        classifier = ("classifier", sklearn_linear_model.SGDClassifier())
        return [kernel, classifier]


# -------------------------------------------
# utilities for rank scaling


def _rank_scaler(ranks, scale=1):
    return 0.5 * (np.tanh(ranks / scale) + 1)


def _rank_scaler_scale(ranks, max_train_rank_scale=DEFAULT_MAX_TRAIN_RANK_SCALE):
    return np.max(np.abs(ranks)) / np.arctanh(2 * max_train_rank_scale - 1)


@hookimpl
def get_classifiers():
    return {
        "sklearn:naive_bayes": NaiveBayes,
        "sklearn:random_forest": RandomForest,
        "sklearn:svm": SupportVectorMachine,
        "sklearn:gradient_boosted_tree": GradientBoostedTree,
        "sklearn:neural_network": NeuralNetwork,
        "sklearn:approx_kernel_sgd": ApproximateKernelSGD,
    }


@hookimpl
def get_incremental_classifiers():
    return {
        "sklearn:inc_naive_bayes": IncrementalNaiveBayes,
        "sklearn:passive_aggressive": PassiveAggressive,
        "sklearn:inc_neural_network": IncrementalNeuralNetwork,
        "sklearn:inc_approx_kernel_sgd": IncrementalApproximateKernelSGD,
    }
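

# Worked example (illustrative, not part of the original module): the rank
# scaling above maps unscaled decision_function outputs into (0, 1) with a
# tanh. _rank_scaler_scale chooses the scale so that the loudest training rank
# lands at DEFAULT_MAX_TRAIN_RANK_SCALE (0.99). For instance, if the largest
# |decision_function| value seen in training is 7.0:
#
#   scale = 7.0 / np.arctanh(2 * 0.99 - 1)  # = 7.0 / arctanh(0.98) ~ 3.05
#   _rank_scaler(7.0, scale=scale)          # = 0.5 * (tanh(2.30) + 1) ~ 0.99
#   _rank_scaler(20.0, scale=scale)         # louder events still stay below 1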