How to use Dummy Regressor and Dummy Classifier

Dummy Regressor

There are 4 strategies we can use as a predictor for the Dummy Regressor.

  • mean (default) - Always predict the mean of y_train
  • median - Always predict the median of y_train
  • quantile - Always predict the specified quantile of y_train
  • constant - Always predict a constant value provided by the user

    import numpy as np
    import pandas as pd
    import altair as alt
    alt.renderers.enable('notebook')
    
    from sklearn.dummy import DummyClassifier, DummyRegressor
    from sklearn.datasets import make_classification, make_regression
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LinearRegression
    
    np.random.seed(42)
    
    # Create a simple regression problem
    X, y = make_regression(
        n_samples=500,
        n_features=5,
        n_informative=5,   # n_informative must be <= n_features
        n_targets=1,
        bias=10.0,
        noise=155.0
    )
    
    # Train test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    
    # Compute the 0th through 100th percentiles of y_train for the chart below
    perc = np.arange(101)
    y_val = np.percentile(y_train, perc)
    
    plot_y = pd.DataFrame({
        'y': y_val,
        'percentile': perc
    })
    

    Mean

    dummy_mean = DummyRegressor(strategy='mean')
    
    dummy_mean.fit(X_train, y_train)
    
    # The feature values are ignored; only the number of rows matters
    print(dummy_mean.predict(X_test[:3]))
    print(y_train.mean())
    
    [14.55126695 14.55126695 14.55126695]
    14.551266954042632
    
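    Since the mean strategy ignores the features, its R² on held-out data sits near zero. As a quick sketch (using the LinearRegression imported above; exact scores depend on the noise level), any real model should beat this baseline:

    linreg = LinearRegression()
    linreg.fit(X_train, y_train)
    
    # DummyRegressor.score() returns R^2, which is ~0 for the mean strategy
    print(f"Dummy (mean) R^2: {dummy_mean.score(X_test, y_test)}")
    print(f"LinearRegression R^2: {linreg.score(X_test, y_test)}")
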

    Constant

    dummy_constant = DummyRegressor(strategy='constant', constant=100)
    
    dummy_constant.fit(X_train, y_train)
    
    print(dummy_constant.predict(X_test[:3]))
    
    [100 100 100]
    

    Median

    dummy_median = DummyRegressor(strategy='median')
    
    dummy_median.fit(X_train, y_train)
    
    print(dummy_median.predict(X_test[:3]))
    print(np.median(y_train))
    
    [14.57561847 14.57561847 14.57561847]
    14.575618468552392
    

    Quantile

    dummy_quantile = DummyRegressor(strategy='quantile', quantile=0.75)
    
    dummy_quantile.fit(X_train, y_train)
    
    print(dummy_quantile.predict(X_test[:3]))
    print(np.percentile(y_train, 75))
    
    [135.43852133 135.43852133 135.43852133]
    135.43852133253466
    

    We can use the chart below to see the prediction value that corresponds to each quantile we specify.

    Note that strategy='median' is equivalent to strategy='quantile' with quantile=0.5, as the quick check below confirms.

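    A minimal check, reusing dummy_median from above:

    dummy_q50 = DummyRegressor(strategy='quantile', quantile=0.5)
    dummy_q50.fit(X_train, y_train)
    
    # Both models should emit the same constant prediction
    print(dummy_median.predict(X_test[:3]))
    print(dummy_q50.predict(X_test[:3]))
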
    # Create a selection that chooses the nearest point & selects based on x-axis value
    base = alt.Chart(plot_y)
    
    nearest = alt.selection(type='single', nearest=True, on='mouseover',
                            fields=['percentile'], empty='none')
    
    bar = base.mark_bar().encode(
        x='percentile',
        y='y'
    )
    
    selectors = base.mark_point().encode(
        x='percentile:Q',
        opacity=alt.value(0),
    ).add_selection(
        nearest
    )
    
    # Draw points on the line, and highlight based on selection
    points = bar.mark_point().encode(
        opacity=alt.condition(nearest, alt.value(1), alt.value(0))
    )
    
    # Draw text labels near the points, and highlight based on selection
    text = bar.mark_text(align='left', dx=5, dy=0).encode(
        text=alt.condition(nearest, 'percentile:Q', alt.value(' '))
    )
    
    text2 = bar.mark_text(align='left', dx=5, dy=-10).encode(
        text=alt.condition(nearest, 'y:Q', alt.value(' '))
    )
    
    # Draw a rule at the location of the selection
    rules = base.mark_rule(color='red').encode(
        x='percentile:Q',
    ).transform_filter(
        nearest
    )
    
    # Put the five layers into a chart and bind the data
    alt.layer(
        bar, selectors, points, rules, text, text2
    ).properties(
        width=600,
        height=600,
        title="Percentile of y_train values"
    )
    
    

    Dummy Classifier

    There are 5 strategies we can use as a predictor for the Dummy Classifier.

  • stratified (default) - Generates random predictions that respect the class distribution of y_train
  • most_frequent - Always predicts the most frequent class (mode) of y_train
  • prior - Always predicts the class that maximizes the class prior (like most_frequent), but predict_proba returns the class prior
  • uniform - Generates predictions uniformly at random
  • constant - Always predicts a constant label provided by the user; useful with metrics that evaluate a non-majority class

    Observe the probability of each prediction for each model below (a predicted probability of 0.5 or above for class 1 generally corresponds to a prediction of 1).

    np.random.seed(42)
    
    # Create a simple classifier problem
    X, y = make_classification(
        n_samples=200,
        n_features=5,
        n_informative=3,
        n_redundant=0,
        n_classes=2
    )
    
    # Train test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    

    Stratified

    dummy_stratified = DummyClassifier(strategy='stratified', random_state=42)
    
    dummy_stratified.fit(X_train, y_train)
    
    prob = dummy_stratified.predict_proba(X_test)
    
    df = pd.DataFrame(
        data=prob,
        columns=['0','1']
    )
    df.head(6)
    
       0    1
    0  1.0  0.0
    1  0.0  1.0
    2  0.0  1.0
    3  0.0  1.0
    4  1.0  0.0
    5  1.0  0.0
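
    The predicted labels should roughly follow the class distribution of y_train. A quick sketch to check (the match is only approximate, since the draws are random):

    # Compare training class frequencies with the frequencies of the
    # stratified dummy's predictions on the test set
    print(np.bincount(y_train) / len(y_train))
    print(np.bincount(dummy_stratified.predict(X_test)) / len(X_test))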

    Most_frequent

    dummy_mostfreq = DummyClassifier(strategy='most_frequent')
    
    dummy_mostfreq.fit(X_train, y_train)
    prob = dummy_mostfreq.predict_proba(X_test)
    
    df = pd.DataFrame(
        data=prob,
        columns=['0','1']
    )
    df.head(6)
    
       0    1
    0  1.0  0.0
    1  1.0  0.0
    2  1.0  0.0
    3  1.0  0.0
    4  1.0  0.0
    5  1.0  0.0
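
    Since most_frequent always predicts the mode of y_train, a one-line sketch confirms the behaviour:

    # The majority class of y_train is what the model always predicts
    print(np.bincount(y_train).argmax())
    print(dummy_mostfreq.predict(X_test[:3]))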

    Prior

    dummy_prior = DummyClassifier(strategy='prior', random_state=42)
    
    dummy_prior.fit(X_train, y_train)
    prob = dummy_prior.predict_proba(X_test)
    
    df = pd.DataFrame(
        data=prob,
        columns=['0','1']
    )
    df.head(6)
    
       0         1
    0  0.514925  0.485075
    1  0.514925  0.485075
    2  0.514925  0.485075
    3  0.514925  0.485075
    4  0.514925  0.485075
    5  0.514925  0.485075
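
    With the prior strategy, predict_proba should simply repeat the empirical class frequencies of y_train for every row, which a short sketch can confirm:

    # Empirical class frequencies of y_train; these should match the
    # probabilities shown in the table above
    print(np.bincount(y_train) / len(y_train))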

    Uniform

    dummy_uniform = DummyClassifier(strategy='uniform', random_state=42)
    
    dummy_uniform.fit(X_train, y_train)
    
    prob = dummy_uniform.predict_proba(X_test)
    
    df = pd.DataFrame(
        data=prob,
        columns=['0','1']
    )
    df.head(6)
    
       0    1
    0  0.5  0.5
    1  0.5  0.5
    2  0.5  0.5
    3  0.5  0.5
    4  0.5  0.5
    5  0.5  0.5

    Constant

    dummy_constant = DummyClassifier(strategy='constant', constant=1, random_state=42)
    
    dummy_constant.fit(X_train, y_train)
    
    prob = dummy_constant.predict_proba(X_test)
    
    df = pd.DataFrame(
        data=prob,
        columns=['0','1']
    )
    df.head(6)
    
       0    1
    0  0.0  1.0
    1  0.0  1.0
    2  0.0  1.0
    3  0.0  1.0
    4  0.0  1.0
    5  0.0  1.0

    A dummy classifier with a constant strategy is a good baseline when we care about a non-majority class: on an imbalanced dataset, always predicting the majority class already earns a high accuracy score. We have to beat the dummy model's baseline score to be sure our model has actually picked up a signal; otherwise we might as well use the dummy model 😄

    Example

    np.random.seed(42)
    
    # Create a simple classification problem
    X, y = make_classification(
        n_samples=200,
        n_features=5,
        n_classes=2
    )
    
    # Overwrite y with imbalanced labels that are unrelated to X
    y = [0] * 180              # 90% - class 0
    y.extend([1] * 20)         # 10% - class 1
    
    # Train test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    
    from sklearn.linear_model import LogisticRegression
    
    logreg = LogisticRegression(solver='lbfgs')
    logreg.fit(X_train, y_train)
    print(f"Logistic Regression has mean accuracy of {logreg.score(X_test, y_test)}.")
    
    dummy_constant = DummyClassifier(strategy='constant', constant=0)
    dummy_constant.fit(X_train, y_train)
    print(f"Dummy Classifier has mean accuracy of {dummy_constant.score(X_test, y_test)}.")
    
    Logistic Regression has mean accuracy of 0.9393939393939394.
    Dummy Classifier has mean accuracy of 0.9393939393939394.
    
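    Accuracy alone hides how meaningless this tie is. A metric focused on class 1, such as F1, makes the comparison honest (a sketch; since the labels are unrelated to the features, expect both scores to be near zero):

    from sklearn.metrics import f1_score
    
    # F1 on the minority class exposes what accuracy hides
    print(f"Logistic Regression F1 (class 1): {f1_score(y_test, logreg.predict(X_test))}")
    print(f"Dummy Classifier F1 (class 1): {f1_score(y_test, dummy_constant.predict(X_test))}")
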

    Conclusion

    A dummy regressor or classifier with a well-chosen strategy makes a good baseline for your model to compare against. If your model can't beat a naive model, you have to tune it further or try a different model for the problem.