World Happiness Ranking Predictor

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.dummy import DummyRegressor
from catboost import CatBoostRegressor
from sklearn.impute import SimpleImputer
from sklearn_pandas import DataFrameMapper
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import r2_score
# Import world-happiness-report-2019.csv data

df = pd.read_csv('data/world-happiness-report-2019.csv')

# Change column names to lowercase and replace spaces and newlines with _
df.columns = [c.lower().replace(" ","_").replace('\n','_') for c in df.columns]
df.head()
| | country_(region) | ladder | sd_of_ladder | positive_affect | negative_affect | social_support | freedom | corruption | generosity | log_of_gdp_per_capita | healthy_life_expectancy |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Finland | 1 | 4 | 41.0 | 10.0 | 2.0 | 5.0 | 4.0 | 47.0 | 22.0 | 27.0 |
| 1 | Denmark | 2 | 13 | 24.0 | 26.0 | 4.0 | 6.0 | 3.0 | 22.0 | 14.0 | 23.0 |
| 2 | Norway | 3 | 8 | 16.0 | 29.0 | 3.0 | 3.0 | 8.0 | 11.0 | 7.0 | 12.0 |
| 3 | Iceland | 4 | 9 | 3.0 | 3.0 | 1.0 | 7.0 | 45.0 | 3.0 | 15.0 | 13.0 |
| 4 | Netherlands | 5 | 1 | 12.0 | 25.0 | 15.0 | 19.0 | 12.0 | 7.0 | 12.0 | 18.0 |
print(df.isnull().sum())

# df[df['log_of_gdp_per_capita'].isnull()]
# df.info()
df.describe()
# There are 156 countries in this dataset. From the min/max of each column we can tell
# that the values are rankings (1..156) rather than raw scores.
country_(region)           0
ladder                     0
sd_of_ladder               0
positive_affect            1
negative_affect            1
social_support             1
freedom                    1
corruption                 8
generosity                 1
log_of_gdp_per_capita      4
healthy_life_expectancy    6
dtype: int64
| | ladder | sd_of_ladder | positive_affect | negative_affect | social_support | freedom | corruption | generosity | log_of_gdp_per_capita | healthy_life_expectancy |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 156.000000 | 156.000000 | 155.000000 | 155.000000 | 155.000000 | 155.000000 | 148.000000 | 155.000000 | 152.000000 | 150.000000 |
| mean | 78.500000 | 78.500000 | 78.000000 | 78.000000 | 78.000000 | 78.000000 | 74.500000 | 78.000000 | 76.500000 | 75.500000 |
| std | 45.177428 | 45.177428 | 44.888751 | 44.888751 | 44.888751 | 44.888751 | 42.868014 | 44.888751 | 44.022721 | 43.445368 |
| min | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| 25% | 39.750000 | 39.750000 | 39.500000 | 39.500000 | 39.500000 | 39.500000 | 37.750000 | 39.500000 | 38.750000 | 38.250000 |
| 50% | 78.500000 | 78.500000 | 78.000000 | 78.000000 | 78.000000 | 78.000000 | 74.500000 | 78.000000 | 76.500000 | 75.500000 |
| 75% | 117.250000 | 117.250000 | 116.500000 | 116.500000 | 116.500000 | 116.500000 | 111.250000 | 116.500000 | 114.250000 | 112.750000 |
| max | 156.000000 | 156.000000 | 155.000000 | 155.000000 | 155.000000 | 155.000000 | 148.000000 | 155.000000 | 152.000000 | 150.000000 |
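Since every column is rank-valued, a quick sanity check (a small sketch added here, not part of the original run) confirms that the target really is a clean, tie-free 1..156 ranking:

# Sanity check: 'ladder' should be a tie-free ranking from 1 to 156
assert df['ladder'].min() == 1 and df['ladder'].max() == 156
assert df['ladder'].is_unique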
plt.figure(figsize=(15,8))
sns.heatmap(df.corr(), annot=True);

[Figure: correlation heatmap of all numeric columns]
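The full heatmap is dense, so a quicker way to read off the strongest predictors of the target (a convenience sketch, not from the original notebook) is to sort the correlation column for ladder:

# Correlations with the target, strongest first
print(df.corr()['ladder'].drop('ladder').abs().sort_values(ascending=False))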

# Train test split
features = ['positive_affect', 'negative_affect', 'social_support','freedom','corruption','generosity',
           'log_of_gdp_per_capita', 'healthy_life_expectancy']

target = 'ladder'

X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Mean-impute every feature column; df_out=True returns a DataFrame with the same columns
mapper = DataFrameMapper(
    [([f], SimpleImputer(strategy='mean')) for f in features],
    df_out=True
)

Z_train = mapper.fit_transform(X_train)
Z_test = mapper.transform(X_test)
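If you would rather avoid the sklearn_pandas dependency, a rough scikit-learn-native equivalent (not used in the rest of this notebook, and it returns a NumPy array instead of a DataFrame) is ColumnTransformer:

from sklearn.compose import ColumnTransformer

# One mean-imputer applied to every feature column at once
ct = ColumnTransformer([('impute', SimpleImputer(strategy='mean'), features)])
Z_train_alt = ct.fit_transform(X_train)
Z_test_alt = ct.transform(X_test)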
# Dummy Regressor as a naive model
dummy = DummyRegressor(strategy="median")
dummy.fit(X_train, y_train)
print(f'Score to beat on train set -> {dummy.score(X_train, y_train)}')
print(f'Score to beat on test set -> {dummy.score(X_test, y_test)}')
Score to beat on train set -> -0.0034563367373134923
Score to beat on test set -> -0.01558583079517084
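Scores near zero are exactly what we should expect: DummyRegressor.score returns R², which measures improvement over predicting the mean, so a constant median prediction can only land at or slightly below zero. The test-set number can be reproduced by hand:

# Reproduce the dummy test score: R^2 of a constant median prediction
median_pred = np.full_like(y_test, y_train.median(), dtype=float)
print(r2_score(y_test, median_pred))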
# Ignore DeprecationWarning & ConvergenceWarning

import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)
# Using linear, lasso and catboost models

# Create pipeline for linear regression

linear_params = {
    'fit_intercept' : [True, False],
    'normalize' : [False, True]  # note: 'normalize' was removed in scikit-learn 1.2
}

pipe_linear = make_pipeline(
    mapper,
    SelectFromModel(LinearRegression(), max_features=5),
    GridSearchCV(LinearRegression(), linear_params, cv=3)
)

pipe_linear.fit(X_train, y_train)
print(f"The score on train set -> {pipe_linear.score(X_train, y_train)}.")
print(f"The score on test set -> {pipe_linear.score(X_test, y_test)}.")
pipe_linear['gridsearchcv'].best_estimator_
The score on train set -> 0.8290497917570301.
The score on test set -> 0.6153386199555004.

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None, normalize=False)
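SelectFromModel keeps at most five features here, ranked by the magnitude of the linear model's coefficients. A small inspection sketch (relying on make_pipeline's auto-generated step names) shows which ones survived:

# Which features did SelectFromModel keep?
mask = pipe_linear['selectfrommodel'].get_support()
print([f for f, keep in zip(features, mask) if keep])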
df.tail(10)
| | country_(region) | ladder | sd_of_ladder | positive_affect | negative_affect | social_support | freedom | corruption | generosity | log_of_gdp_per_capita | healthy_life_expectancy |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 146 | Haiti | 147 | 111 | 142.0 | 119.0 | 146.0 | 152.0 | 48.0 | 20.0 | 138.0 | 125.0 |
| 147 | Botswana | 148 | 125 | 87.0 | 65.0 | 105.0 | 60.0 | 54.0 | 150.0 | 66.0 | 113.0 |
| 148 | Syria | 149 | 137 | 155.0 | 155.0 | 154.0 | 153.0 | 38.0 | 69.0 | NaN | 128.0 |
| 149 | Malawi | 150 | 132 | 129.0 | 110.0 | 150.0 | 65.0 | 64.0 | 109.0 | 147.0 | 119.0 |
| 150 | Yemen | 151 | 85 | 153.0 | 75.0 | 100.0 | 147.0 | 83.0 | 155.0 | 141.0 | 124.0 |
| 151 | Rwanda | 152 | 63 | 54.0 | 102.0 | 144.0 | 21.0 | 2.0 | 90.0 | 132.0 | 103.0 |
| 152 | Tanzania | 153 | 122 | 78.0 | 50.0 | 131.0 | 78.0 | 34.0 | 49.0 | 125.0 | 118.0 |
| 153 | Afghanistan | 154 | 25 | 152.0 | 133.0 | 151.0 | 155.0 | 136.0 | 137.0 | 134.0 | 139.0 |
| 154 | Central African Republic | 155 | 117 | 132.0 | 153.0 | 155.0 | 133.0 | 122.0 | 113.0 | 152.0 | 150.0 |
| 155 | South Sudan | 156 | 140 | 127.0 | 152.0 | 148.0 | 154.0 | 61.0 | 85.0 | 140.0 | 143.0 |
# Using South Sudan's variables to predict --> how the backend creates a prediction
test = np.array([127, 152, 148, 154, 61, 85, 140, 143]).reshape(1, -1)
tester = pd.DataFrame(data=test, columns=features)
result = pipe_linear.predict(tester)
result[0] = 148.6  # hard-coded example value, so the rank -> country lookup below is reproducible
print(result)

index = np.round(result[0])
print(index)
df['country_(region)'][index - 1]  # --> row index is ladder rank minus 1
[148.6]
149.0

'Syria'
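Bundling these steps into a helper makes the backend logic explicit. The function below is a hypothetical sketch (predict_country and its argument names are mine, not part of the deployed app):

def predict_country(values, model=pipe_linear):
    """Predict a ladder rank from one row of feature ranks and
    return the country currently holding the nearest rank."""
    row = pd.DataFrame([values], columns=features)
    rank = int(np.round(model.predict(row)[0]))
    rank = min(max(rank, 1), len(df))  # clamp to the valid 1..156 range
    return rank, df['country_(region)'].iloc[rank - 1]

print(predict_country([127, 152, 148, 154, 61, 85, 140, 143]))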
# Create pipeline for lasso regressor

lasso_params = {
    'alpha' : [0.1, 1, 5, 10, 20],
    'fit_intercept':[True,False],
    'random_state': [42]
}

pipe_lasso = make_pipeline(
    mapper,
#     SelectFromModel(LinearRegression(), max_features=7),
    GridSearchCV(Lasso(), lasso_params, cv=3)
)

pipe_lasso.fit(X_train, y_train)
print(f"The score on train set -> {pipe_lasso.score(X_train, y_train)}.")
print(f"The score on test set -> {pipe_lasso.score(X_test, y_test)}.")
pipe_lasso['gridsearchcv'].best_estimator_
The score on train set -> 0.8536292061235335.
The score on test set -> 0.6710755691634984.

Lasso(alpha=20, copy_X=True, fit_intercept=False, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=42,
      selection='cyclic', tol=0.0001, warm_start=False)
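With alpha=20 the L1 penalty zeroes out weak predictors, so inspecting the fitted coefficients shows which features the Lasso actually uses (a small sketch against the fitted pipeline; the mapper preserves the feature order):

# Non-zero coefficients of the best Lasso fit
best_lasso = pipe_lasso['gridsearchcv'].best_estimator_
for name, coef in zip(features, best_lasso.coef_):
    if coef != 0:
        print(f'{name}: {coef:.3f}')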
# Catboost (all features here are numeric ranks, so no categorical features are passed to the model)
print(features)

cb = CatBoostRegressor(
    iterations=500,
    random_seed=42,
    learning_rate=0.1,
    early_stopping_rounds=20
)

cb.fit(
    Z_train, y_train,
    eval_set=(Z_test, y_test),
    plot=True
)
['positive_affect', 'negative_affect', 'social_support', 'freedom', 'corruption', 'generosity', 'log_of_gdp_per_capita', 'healthy_life_expectancy']
0:  learn: 45.3598602   test: 42.7754550    best: 42.7754550 (0)    total: 85.7ms   remaining: 42.8s
1:  learn: 44.9122021   test: 42.7634073    best: 42.7634073 (1)    total: 97.2ms   remaining: 24.2s
2:  learn: 44.6901016   test: 42.9243994    best: 42.7634073 (1)    total: 101ms    remaining: 16.7s
3:  learn: 44.4116697   test: 42.9927616    best: 42.7634073 (1)    total: 104ms    remaining: 13s
4:  learn: 44.1884245   test: 43.0449422    best: 42.7634073 (1)    total: 108ms    remaining: 10.7s
5:  learn: 43.9271426   test: 43.0432864    best: 42.7634073 (1)    total: 111ms    remaining: 9.13s
6:  learn: 43.3937721   test: 43.2417683    best: 42.7634073 (1)    total: 116ms    remaining: 8.19s
7:  learn: 42.9352427   test: 43.6081093    best: 42.7634073 (1)    total: 119ms    remaining: 7.35s
8:  learn: 42.6914296   test: 43.8173266    best: 42.7634073 (1)    total: 122ms    remaining: 6.68s
9:  learn: 41.9921409   test: 43.7553171    best: 42.7634073 (1)    total: 127ms    remaining: 6.22s
10: learn: 41.3418487   test: 43.6024675    best: 42.7634073 (1)    total: 131ms    remaining: 5.82s
11: learn: 41.0142511   test: 43.5048650    best: 42.7634073 (1)    total: 134ms    remaining: 5.44s
12: learn: 40.5056671   test: 43.6886770    best: 42.7634073 (1)    total: 137ms    remaining: 5.12s
13: learn: 40.2157179   test: 43.6707096    best: 42.7634073 (1)    total: 139ms    remaining: 4.83s
14: learn: 39.4477070   test: 43.9339406    best: 42.7634073 (1)    total: 142ms    remaining: 4.58s
15: learn: 38.6712261   test: 44.4431514    best: 42.7634073 (1)    total: 145ms    remaining: 4.39s
16: learn: 38.1985018   test: 44.3301780    best: 42.7634073 (1)    total: 151ms    remaining: 4.3s
17: learn: 37.8280146   test: 44.0791063    best: 42.7634073 (1)    total: 154ms    remaining: 4.13s
18: learn: 37.5405840   test: 44.1975842    best: 42.7634073 (1)    total: 157ms    remaining: 3.97s
19: learn: 36.9807034   test: 44.5076596    best: 42.7634073 (1)    total: 159ms    remaining: 3.81s
20: learn: 36.5721769   test: 44.4072676    best: 42.7634073 (1)    total: 161ms    remaining: 3.67s
21: learn: 36.0411183   test: 44.5712683    best: 42.7634073 (1)    total: 164ms    remaining: 3.56s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 42.76340726
bestIteration = 1

Shrink model to first 2 iterations.

<catboost.core.CatBoostRegressor at 0x7f6a6e20ad30>
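The overfitting detector stopped after iteration 21 because the eval-set RMSE had not improved for 20 rounds, and the model was shrunk back to its best iteration. Both values can be read off the fitted model:

# Best iteration and best eval-set score recorded by early stopping
print(cb.get_best_iteration())  # -> 1
print(cb.get_best_score())      # dict of learn/validation metrics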
# Predict on the imputed features the model was trained on
y_train_pred = cb.predict(Z_train)
y_test_pred = cb.predict(Z_test)

print(f'The score on train set -> {r2_score(y_train, y_train_pred)}.')
print(f'The score on test set -> {r2_score(y_test, y_test_pred)}.')
The score on train set -> 0.03701180390587466.
The score on test set -> 0.021314961573651203.
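A single 70/30 split is noisy on 156 rows, so before committing to the Lasso it is worth a cross-validated comparison; cross_val_score was already imported above. A minimal sketch (slow, since each fold re-runs the inner grid search):

# 5-fold CV R^2 for the two sklearn pipelines
for name, model in [('linear', pipe_linear), ('lasso', pipe_lasso)]:
    scores = cross_val_score(model, X, y, cv=5, scoring='r2')
    print(f'{name}: {scores.mean():.3f} +/- {scores.std():.3f}')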

The Lasso model is chosen as the final predictor because it has the highest score on both the train and test sets. You can visit the website here.