bazarr/libs/ffsubsync/sklearn_shim.py

374 lines
13 KiB
Python

# -*- coding: utf-8 -*-
"""
This module borrows and adapts `Pipeline` from `sklearn.pipeline` and
`TransformerMixin` from `sklearn.base` in the scikit-learn framework
(commit hash d205638475ca542dc46862652e3bb0be663a8eac) to be precise).
Both are BSD licensed and allow for this sort of thing; attribution
is given as a comment above each class.
"""
from collections import defaultdict
from itertools import islice
# Author: Gael Varoquaux <gael.varoquaux@normalesup.org>
# License: BSD 3 clause
class TransformerMixin(object):
"""Mixin class for all transformers."""
def fit_transform(self, X, y=None, **fit_params):
"""
Fit to data, then transform it.
Fits transformer to X and y with optional parameters fit_params
and returns a transformed version of X.
Parameters
----------
X : ndarray of shape (n_samples, n_features)
Training set.
y : ndarray of shape (n_samples,), default=None
Target values.
**fit_params : dict
Additional fit parameters.
Returns
-------
X_new : ndarray array of shape (n_samples, n_features_new)
Transformed array.
"""
# non-optimized default implementation; override when a better
# method is possible for a given clustering algorithm
if y is None:
# fit method of arity 1 (unsupervised transformation)
return self.fit(X, **fit_params).transform(X)
else:
# fit method of arity 2 (supervised transformation)
return self.fit(X, y, **fit_params).transform(X)
# Author: Edouard Duchesnay
# Gael Varoquaux
# Virgile Fritsch
# Alexandre Gramfort
# Lars Buitinck
# License: BSD
class Pipeline(object):
def __init__(self, steps, verbose=False):
self.steps = steps
self.verbose = verbose
self._validate_steps()
def _validate_steps(self):
names, estimators = zip(*self.steps)
# validate estimators
transformers = estimators[:-1]
estimator = estimators[-1]
for t in transformers:
if t is None or t == 'passthrough':
continue
if (not (hasattr(t, "fit") or hasattr(t, "fit_transform")) or not
hasattr(t, "transform")):
raise TypeError("All intermediate steps should be "
"transformers and implement fit and transform "
"or be the string 'passthrough' "
"'%s' (type %s) doesn't" % (t, type(t)))
# We allow last estimator to be None as an identity transformation
if (estimator is not None and estimator != 'passthrough'
and not hasattr(estimator, "fit")):
raise TypeError(
"Last step of Pipeline should implement fit "
"or be the string 'passthrough'. "
"'%s' (type %s) doesn't" % (estimator, type(estimator)))
def _iter(self, with_final=True, filter_passthrough=True):
"""
Generate (idx, (name, trans)) tuples from self.steps
When filter_passthrough is True, 'passthrough' and None transformers
are filtered out.
"""
stop = len(self.steps)
if not with_final:
stop -= 1
for idx, (name, trans) in enumerate(islice(self.steps, 0, stop)):
if not filter_passthrough:
yield idx, name, trans
elif trans is not None and trans != 'passthrough':
yield idx, name, trans
def __len__(self):
"""
Returns the length of the Pipeline
"""
return len(self.steps)
def __getitem__(self, ind):
"""Returns a sub-pipeline or a single esimtator in the pipeline
Indexing with an integer will return an estimator; using a slice
returns another Pipeline instance which copies a slice of this
Pipeline. This copy is shallow: modifying (or fitting) estimators in
the sub-pipeline will affect the larger pipeline and vice-versa.
However, replacing a value in `step` will not affect a copy.
"""
if isinstance(ind, slice):
if ind.step not in (1, None):
raise ValueError('Pipeline slicing only supports a step of 1')
return self.__class__(self.steps[ind])
try:
name, est = self.steps[ind]
except TypeError:
# Not an int, try get step by name
return self.named_steps[ind]
return est
@property
def _estimator_type(self):
return self.steps[-1][1]._estimator_type
@property
def named_steps(self):
return dict(self.steps)
@property
def _final_estimator(self):
estimator = self.steps[-1][1]
return 'passthrough' if estimator is None else estimator
def _log_message(self, step_idx):
if not self.verbose:
return None
name, step = self.steps[step_idx]
return '(step %d of %d) Processing %s' % (step_idx + 1,
len(self.steps),
name)
# Estimator interface
def _fit(self, X, y=None, **fit_params):
# shallow copy of steps - this should really be steps_
self.steps = list(self.steps)
self._validate_steps()
fit_params_steps = {name: {} for name, step in self.steps
if step is not None}
for pname, pval in fit_params.items():
if '__' not in pname:
raise ValueError(
"Pipeline.fit does not accept the {} parameter. "
"You can pass parameters to specific steps of your "
"pipeline using the stepname__parameter format, e.g. "
"`Pipeline.fit(X, y, logisticregression__sample_weight"
"=sample_weight)`.".format(pname))
step, param = pname.split('__', 1)
fit_params_steps[step][param] = pval
for (step_idx,
name,
transformer) in self._iter(with_final=False,
filter_passthrough=False):
if transformer is None or transformer == 'passthrough':
continue
# Fit or load from cache the current transformer
X, fitted_transformer = _fit_transform_one(
transformer, X, y, None,
**fit_params_steps[name])
# Replace the transformer of the step with the fitted
# transformer. This is necessary when loading the transformer
# from the cache.
self.steps[step_idx] = (name, fitted_transformer)
if self._final_estimator == 'passthrough':
return X, {}
return X, fit_params_steps[self.steps[-1][0]]
def fit(self, X, y=None, **fit_params):
"""Fit the model
Fit all the transforms one after the other and transform the
data, then fit the transformed data using the final estimator.
Parameters
----------
X : iterable
Training data. Must fulfill input requirements of first step of the
pipeline.
y : iterable, default=None
Training targets. Must fulfill label requirements for all steps of
the pipeline.
**fit_params : dict of string -> object
Parameters passed to the ``fit`` method of each step, where
each parameter name is prefixed such that parameter ``p`` for step
``s`` has key ``s__p``.
Returns
-------
self : Pipeline
This estimator
"""
Xt, fit_params = self._fit(X, y, **fit_params)
if self._final_estimator != 'passthrough':
self._final_estimator.fit(Xt, y, **fit_params)
return self
def fit_transform(self, X, y=None, **fit_params):
"""Fit the model and transform with the final estimator
Fits all the transforms one after the other and transforms the
data, then uses fit_transform on transformed data with the final
estimator.
Parameters
----------
X : iterable
Training data. Must fulfill input requirements of first step of the
pipeline.
y : iterable, default=None
Training targets. Must fulfill label requirements for all steps of
the pipeline.
**fit_params : dict of string -> object
Parameters passed to the ``fit`` method of each step, where
each parameter name is prefixed such that parameter ``p`` for step
``s`` has key ``s__p``.
Returns
-------
Xt : array-like of shape (n_samples, n_transformed_features)
Transformed samples
"""
last_step = self._final_estimator
Xt, fit_params = self._fit(X, y, **fit_params)
if last_step == 'passthrough':
return Xt
if hasattr(last_step, 'fit_transform'):
return last_step.fit_transform(Xt, y, **fit_params)
else:
return last_step.fit(Xt, y, **fit_params).transform(Xt)
@property
def transform(self):
"""Apply transforms, and transform with the final estimator
This also works where final estimator is ``None``: all prior
transformations are applied.
Parameters
----------
X : iterable
Data to transform. Must fulfill input requirements of first step
of the pipeline.
Returns
-------
Xt : array-like of shape (n_samples, n_transformed_features)
"""
# _final_estimator is None or has transform, otherwise attribute error
# XXX: Handling the None case means we can't use if_delegate_has_method
if self._final_estimator != 'passthrough':
self._final_estimator.transform
return self._transform
def _transform(self, X):
Xt = X
for _, _, transform in self._iter():
Xt = transform.transform(Xt)
return Xt
@property
def classes_(self):
return self.steps[-1][-1].classes_
@property
def _pairwise(self):
# check if first estimator expects pairwise input
return getattr(self.steps[0][1], '_pairwise', False)
@property
def n_features_in_(self):
# delegate to first step (which will call _check_is_fitted)
return self.steps[0][1].n_features_in_
def _name_estimators(estimators):
"""Generate names for estimators."""
names = [
estimator
if isinstance(estimator, str) else type(estimator).__name__.lower()
for estimator in estimators
]
namecount = defaultdict(int)
for est, name in zip(estimators, names):
namecount[name] += 1
for k, v in list(namecount.items()):
if v == 1:
del namecount[k]
for i in reversed(range(len(estimators))):
name = names[i]
if name in namecount:
names[i] += "-%d" % namecount[name]
namecount[name] -= 1
return list(zip(names, estimators))
def make_pipeline(*steps, **kwargs):
"""Construct a Pipeline from the given estimators.
This is a shorthand for the Pipeline constructor; it does not require, and
does not permit, naming the estimators. Instead, their names will be set
to the lowercase of their types automatically.
Parameters
----------
*steps : list of estimators.
verbose : bool, default=False
If True, the time elapsed while fitting each step will be printed as it
is completed.
Returns
-------
p : Pipeline
"""
verbose = kwargs.pop('verbose', False)
if kwargs:
raise TypeError('Unknown keyword arguments: "{}"'
.format(list(kwargs.keys())[0]))
return Pipeline(_name_estimators(steps), verbose=verbose)
def _transform_one(transformer, X, y, weight, **fit_params):
res = transformer.transform(X)
# if we have a weight for this transformer, multiply output
if weight is None:
return res
return res * weight
def _fit_transform_one(transformer,
X,
y,
weight,
**fit_params):
"""
Fits ``transformer`` to ``X`` and ``y``. The transformed result is returned
with the fitted transformer. If ``weight`` is not ``None``, the result will
be multiplied by ``weight``.
"""
if hasattr(transformer, 'fit_transform'):
res = transformer.fit_transform(X, y, **fit_params)
else:
res = transformer.fit(X, y, **fit_params).transform(X)
if weight is None:
return res, transformer
return res * weight, transformer