Question

The bounty expires in 7 days. Answers to this question are eligible for a +50 reputation bounty. mmann1123 wants to draw more attention to this question:

Just need a working answer, thanks!

I am trying to remove rows that are labeled as outliers. This works when I call my transformer directly, but not when I use it as a step in a scikit-learn Pipeline, and I am not sure why.

from sklearn.datasets import make_classification

# Synthetic 3-class data set: 100 samples, 10 features, 5 of them informative.
X1, y1 = make_classification(
    n_samples=100,
    n_features=10,
    n_informative=5,
    n_classes=3,
)

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import IsolationForest
import numpy as np


class IsolationForestOutlierRemover(BaseEstimator, TransformerMixin):
    """Drop the rows that an ``IsolationForest`` predicts to be outliers.

    Parameters
    ----------
    contamination : float, default=0.05
        Passed to the wrapped ``IsolationForest`` as its ``contamination``
        argument.

    Attributes
    ----------
    isolation_forest : IsolationForest
        The wrapped detector. It is created in ``__init__`` and (re)fitted
        by ``fit``.
    mask : boolean array
        Set by ``fit``. True for every row of the fitted data that the
        detector predicted as an inlier (prediction ``== 1``).

    Notes
    -----
    When ``y`` is given, ``transform`` and ``fit_transform`` return an
    ``(X, y)`` tuple rather than a single array. A scikit-learn ``Pipeline``
    treats whatever a step returns as the transformed ``X`` and hands the
    original, unfiltered ``y`` to the final estimator, so this class cannot
    remove rows when used as a step of a standard ``Pipeline``; filter the
    data before calling the pipeline instead.
    """

    def __init__(self, contamination=0.05):
        self.contamination = contamination
        self.isolation_forest = IsolationForest(contamination=self.contamination)

    @staticmethod
    def _select(keep, X, y):
        """Return the rows of ``X`` (and of ``y``, if given) where ``keep`` is True."""
        if y is None:
            return X[keep]
        return X[keep], y[keep]

    def fit(self, X, y=None):
        """Fit the detector on ``X`` and record the inlier mask of ``X``.

        ``y`` is accepted for API compatibility and is not used.
        Returns ``self``.
        """
        # ``contamination`` may have been changed after construction (for
        # example through ``set_params``); push the current value into the
        # wrapped detector so the change is not silently ignored.
        self.isolation_forest.set_params(contamination=self.contamination)
        self.isolation_forest.fit(X)
        self.mask = self.isolation_forest.predict(X) == 1
        return self

    def transform(self, X, y=None):
        """Return the rows of ``X`` that the fitted detector predicts as inliers.

        The mask is computed from the ``X`` passed in, so data that differs
        from the training data (including data of a different length) is
        filtered by its own predictions instead of by the training mask.

        Returns ``X[mask]`` when ``y`` is None, else ``(X[mask], y[mask])``.
        """
        keep = self.isolation_forest.predict(X) == 1
        return self._select(keep, X, y)

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return ``X`` (and ``y``) without the outlier rows.

        Returns ``X[mask]`` when ``y`` is None, else ``(X[mask], y[mask])``.
        """
        self.fit(X, y)
        # ``fit`` has just computed the mask for this very ``X``; reuse it
        # instead of predicting a second time.
        return self._select(self.mask, X, y)


# Calling the transformer directly works: because y is passed, the result is
# an (X, y) tuple in which the flagged rows are removed from both arrays.
working = IsolationForestOutlierRemover().fit_transform(X1, y1)
working[0].shape
# 95 rows remain out of the 100 generated (reported output; with 10 features
# the full shape is presumably (95, 10))
working

# %%

# Pipeline and RandomForestClassifier are used below; import them here so the
# cell runs on its own.
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

pipelinet = Pipeline(
    [
        ("outlier_removal", IsolationForestOutlierRemover(contamination=0.05)),
        ("random_forest", RandomForestClassifier()),
    ]
)

# This call raises ValueError: the first step returns an (X, y) tuple, and the
# pipeline passes that tuple to the forest as if it were the feature matrix,
# together with the original, unfiltered y.
notworking = pipelinet.fit(X1, y1)
notworking

Fitting the pipeline raises the following error:

ValueError                                Traceback (most recent call last)
/home/mmann1123/Documents/github/YM_TZ_crop_classifier/4_model.py in line 10
      349 # %%
      351 pipelinet = Pipeline(
      352     [
      353         ("outlier_removal", IsolationForestOutlierRemover(contamination=0.05)),
      354         ("random_forest", RandomForestClassifier()),
      355     ]
      356 )
---> 358 notworking = pipelinet.fit(X1, y1)
     359 notworking

File ~/miniconda3/envs/crop_class/lib/python3.8/site-packages/sklearn/pipeline.py:406, in Pipeline.fit(self, X, y, **fit_params)
    404     if self._final_estimator != "passthrough":
    405         fit_params_last_step = fit_params_steps[self.steps[-1][0]]
--> 406         self._final_estimator.fit(Xt, y, **fit_params_last_step)
    408 return self

File ~/miniconda3/envs/crop_class/lib/python3.8/site-packages/sklearn/ensemble/_forest.py:346, in BaseForest.fit(self, X, y, sample_weight)
    344 if issparse(y):
    345     raise ValueError("sparse multilabel-indicator for y is not supported.")
--> 346 X, y = self._validate_data(
    347     X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE
    348 )
...
--> 185     array = numpy.asarray(array, order=order, dtype=dtype)
    186     return xp.asarray(array, copy=copy)
    187 else:

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (2, 95) + inhomogeneous part.

友情链接