import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.dummy import DummyRegressor
from catboost import CatBoostRegressor, Pool
import re
from sklearn.preprocessing import LabelBinarizer
from sklearn.impute import SimpleImputer
from sklearn_pandas import DataFrameMapper, CategoricalImputer
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import r2_score
# Load the World Happiness Report 2019 data
df = pd.read_csv('data/world-happiness-report-2019.csv')
# Normalize column names: lowercase, and replace spaces and embedded newlines with _
df.columns = [c.lower().replace(" ","_").replace('\n','_') for c in df.columns]
df.head()
|   | country_(region) | ladder | sd_of_ladder | positive_affect | negative_affect | social_support | freedom | corruption | generosity | log_of_gdp_per_capita | healthy_life_expectancy |
|---|------------------|--------|--------------|-----------------|-----------------|----------------|---------|------------|------------|-----------------------|-------------------------|
| 0 | Finland | 1 | 4 | 41.0 | 10.0 | 2.0 | 5.0 | 4.0 | 47.0 | 22.0 | 27.0 |
| 1 | Denmark | 2 | 13 | 24.0 | 26.0 | 4.0 | 6.0 | 3.0 | 22.0 | 14.0 | 23.0 |
| 2 | Norway | 3 | 8 | 16.0 | 29.0 | 3.0 | 3.0 | 8.0 | 11.0 | 7.0 | 12.0 |
| 3 | Iceland | 4 | 9 | 3.0 | 3.0 | 1.0 | 7.0 | 45.0 | 3.0 | 15.0 | 13.0 |
| 4 | Netherlands | 5 | 1 | 12.0 | 25.0 | 15.0 | 19.0 | 12.0 | 7.0 | 12.0 | 18.0 |
print(df.isnull().sum())
# df[df['log_of_gdp_per_capita'].isnull()]
# df.info()
df.describe()
# There are 156 countries in this dataset. The min/max of each column show that
# the values are rank positions (1 = best), not raw scores.
country_(region) 0
ladder 0
sd_of_ladder 0
positive_affect 1
negative_affect 1
social_support 1
freedom 1
corruption 8
generosity 1
log_of_gdp_per_capita 4
healthy_life_expectancy 6
dtype: int64
|       | ladder | sd_of_ladder | positive_affect | negative_affect | social_support | freedom | corruption | generosity | log_of_gdp_per_capita | healthy_life_expectancy |
|-------|--------|--------------|-----------------|-----------------|----------------|---------|------------|------------|-----------------------|-------------------------|
| count | 156.000000 | 156.000000 | 155.000000 | 155.000000 | 155.000000 | 155.000000 | 148.000000 | 155.000000 | 152.000000 | 150.000000 |
| mean  | 78.500000 | 78.500000 | 78.000000 | 78.000000 | 78.000000 | 78.000000 | 74.500000 | 78.000000 | 76.500000 | 75.500000 |
| std   | 45.177428 | 45.177428 | 44.888751 | 44.888751 | 44.888751 | 44.888751 | 42.868014 | 44.888751 | 44.022721 | 43.445368 |
| min   | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| 25%   | 39.750000 | 39.750000 | 39.500000 | 39.500000 | 39.500000 | 39.500000 | 37.750000 | 39.500000 | 38.750000 | 38.250000 |
| 50%   | 78.500000 | 78.500000 | 78.000000 | 78.000000 | 78.000000 | 78.000000 | 74.500000 | 78.000000 | 76.500000 | 75.500000 |
| 75%   | 117.250000 | 117.250000 | 116.500000 | 116.500000 | 116.500000 | 116.500000 | 111.250000 | 116.500000 | 114.250000 | 112.750000 |
| max   | 156.000000 | 156.000000 | 155.000000 | 155.000000 | 155.000000 | 155.000000 | 148.000000 | 155.000000 | 152.000000 | 150.000000 |
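As a quick check that every column really holds ranks, a minimal sketch that asserts each column's non-null values are distinct positions between 1 and the number of rows:

for col in df.columns.drop('country_(region)'):
    vals = df[col].dropna()
    # Rank columns should hold unique positions in [1, len(df)]
    assert vals.is_unique and vals.min() == 1.0 and vals.max() <= len(df)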
plt.figure(figsize=(15,8))
sns.heatmap(df.corr(), annot=True);
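Since the columns are already ranks, the Pearson correlations above are effectively Spearman rank correlations of the underlying scores. The same heatmap can be produced explicitly with a minimal sketch that drops the country column first, which also keeps it working on newer pandas where non-numeric columns are no longer silently ignored:

plt.figure(figsize=(15,8))
sns.heatmap(df.drop(columns='country_(region)').corr(method='spearman'), annot=True);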
# Train test split
features = ['positive_affect', 'negative_affect', 'social_support','freedom','corruption','generosity',
'log_of_gdp_per_capita', 'healthy_life_expectancy']
target = 'ladder'
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Mean-impute every feature column; df_out=True keeps the DataFrame shape
mapper = DataFrameMapper(
    [([f], SimpleImputer(strategy='mean')) for f in features],
    df_out=True
)
Z_train = mapper.fit_transform(X_train)
Z_test = mapper.transform(X_test)
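sklearn_pandas is a third-party package; if it is unavailable, the same imputation can be expressed with scikit-learn's own ColumnTransformer. A minimal sketch, not what the rest of this notebook uses:

from sklearn.compose import ColumnTransformer

# One mean-imputer applied to every feature column at once
ct = ColumnTransformer(
    [('impute', SimpleImputer(strategy='mean'), features)],
    remainder='drop'
)
Z_train_alt = ct.fit_transform(X_train)   # NumPy array, not a DataFrame
Z_test_alt = ct.transform(X_test)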
# Dummy regressor as a naive baseline: always predicting the median scores an R^2
# just below 0, since R^2 = 0 corresponds to always predicting the mean
dummy = DummyRegressor(strategy="median")
dummy.fit(X_train, y_train)
print(f'Score to beat on train set -> {dummy.score(X_train, y_train)}')
print(f'Score to beat on test set -> {dummy.score(X_test, y_test)}')
Score to beat on train set -> -0.0034563367373134923
Score to beat on test set -> -0.01558583079517084
# Ignore DeprecationWarning & convergenceWarning
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)
# Using linear, lasso and catboost models
# Create pipeline for linear regression
linear_params = {
    'fit_intercept': [True, False],
    'normalize': [False, True]   # note: removed in scikit-learn 1.2; drop this key on newer versions
}
pipe_linear = make_pipeline(
mapper,
SelectFromModel(LinearRegression(), max_features=5),
GridSearchCV(LinearRegression(), linear_params, cv=3)
)
pipe_linear.fit(X_train, y_train)
print(f"The score on train set -> {pipe_linear.score(X_train, y_train)}.")
print(f"The score on test set -> {pipe_linear.score(X_test, y_test)}.")
pipe_linear['gridsearchcv'].best_estimator_
The score on train set -> 0.8290497917570301.
The score on test set -> 0.6153386199555004.
LinearRegression(copy_X=True, fit_intercept=False, n_jobs=None, normalize=False)
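To see which five features SelectFromModel actually kept, the fitted selector's mask can be read back. A sketch, using make_pipeline's lowercase-class-name step keys:

# Boolean mask over the mapper's output columns (same order as `features`)
mask = pipe_linear['selectfrommodel'].get_support()
selected = [f for f, keep in zip(features, mask) if keep]
print(selected)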
df.tail(10)
|     | country_(region) | ladder | sd_of_ladder | positive_affect | negative_affect | social_support | freedom | corruption | generosity | log_of_gdp_per_capita | healthy_life_expectancy |
|-----|------------------|--------|--------------|-----------------|-----------------|----------------|---------|------------|------------|-----------------------|-------------------------|
| 146 | Haiti | 147 | 111 | 142.0 | 119.0 | 146.0 | 152.0 | 48.0 | 20.0 | 138.0 | 125.0 |
| 147 | Botswana | 148 | 125 | 87.0 | 65.0 | 105.0 | 60.0 | 54.0 | 150.0 | 66.0 | 113.0 |
| 148 | Syria | 149 | 137 | 155.0 | 155.0 | 154.0 | 153.0 | 38.0 | 69.0 | NaN | 128.0 |
| 149 | Malawi | 150 | 132 | 129.0 | 110.0 | 150.0 | 65.0 | 64.0 | 109.0 | 147.0 | 119.0 |
| 150 | Yemen | 151 | 85 | 153.0 | 75.0 | 100.0 | 147.0 | 83.0 | 155.0 | 141.0 | 124.0 |
| 151 | Rwanda | 152 | 63 | 54.0 | 102.0 | 144.0 | 21.0 | 2.0 | 90.0 | 132.0 | 103.0 |
| 152 | Tanzania | 153 | 122 | 78.0 | 50.0 | 131.0 | 78.0 | 34.0 | 49.0 | 125.0 | 118.0 |
| 153 | Afghanistan | 154 | 25 | 152.0 | 133.0 | 151.0 | 155.0 | 136.0 | 137.0 | 134.0 | 139.0 |
| 154 | Central African Republic | 155 | 117 | 132.0 | 153.0 | 155.0 | 133.0 | 122.0 | 113.0 | 152.0 | 150.0 |
| 155 | South Sudan | 156 | 140 | 127.0 | 152.0 | 148.0 | 154.0 | 61.0 | 85.0 | 140.0 | 143.0 |
# Using South Sudan's feature ranks to predict --> how the backend creates a prediction
test = np.array([127, 152, 148, 154, 61, 85, 140, 143]).reshape(1, -1)
tester = pd.DataFrame(data=test, columns=features)
result = pipe_linear.predict(tester)
print(result)                       # roughly [148.6]
index = int(np.round(result[0]))    # round to the nearest rank
print(index)
df['country_(region)'][index - 1]   # row index is the predicted ladder rank minus 1
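The positional lookup above only works because the dataframe happens to be sorted by ladder; a lookup on the ladder column itself is more robust. A minimal sketch that also clips the rounded prediction into the valid rank range:

rank = int(np.clip(np.round(result[0]), 1, len(df)))
# Look the predicted rank up on the ladder column instead of by row position
print(df.loc[df['ladder'] == rank, 'country_(region)'].iloc[0])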
# Create pipeline for lasso regressor
lasso_params = {
'alpha' : [0.1, 1, 5, 10, 20],
'fit_intercept':[True,False],
'random_state': [42]
}
pipe_lasso = make_pipeline(
mapper,
# SelectFromModel(LinearRegression(), max_features=7),
GridSearchCV(Lasso(), lasso_params, cv=3)
)
pipe_lasso.fit(X_train, y_train)
print(f"The score on train set -> {pipe_lasso.score(X_train, y_train)}.")
print(f"The score on test set -> {pipe_lasso.score(X_test, y_test)}.")
pipe_lasso['gridsearchcv'].best_estimator_
The score on train set -> 0.8536292061235335.
The score on test set -> 0.6710755691634984.
Lasso(alpha=20, copy_X=True, fit_intercept=False, max_iter=1000,
normalize=False, positive=False, precompute=False, random_state=42,
selection='cyclic', tol=0.0001, warm_start=False)
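Because the lasso drives weak coefficients to exactly zero, the fitted model doubles as a feature selector; reading the coefficients back shows which predictors survived alpha=20. A sketch, using the same lowercase-class-name step keys as above:

best_lasso = pipe_lasso['gridsearchcv'].best_estimator_
# Zero coefficients mean the feature was dropped by the L1 penalty
for name, coef in zip(features, best_lasso.coef_):
    print(f'{name}: {coef:.3f}')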
# CatBoost -- all features here are numeric ranks, so no categorical
# features are passed to the model
print(features)
cb = CatBoostRegressor(
iterations=500,
random_seed=42,
learning_rate=0.1,
early_stopping_rounds=20
)
cb.fit(
Z_train, y_train,
eval_set=(Z_test, y_test),
plot=True
)
['positive_affect', 'negative_affect', 'social_support', 'freedom', 'corruption', 'generosity', 'log_of_gdp_per_capita', 'healthy_life_expectancy']
0: learn: 45.3598602 test: 42.7754550 best: 42.7754550 (0) total: 85.7ms remaining: 42.8s
1: learn: 44.9122021 test: 42.7634073 best: 42.7634073 (1) total: 97.2ms remaining: 24.2s
2: learn: 44.6901016 test: 42.9243994 best: 42.7634073 (1) total: 101ms remaining: 16.7s
3: learn: 44.4116697 test: 42.9927616 best: 42.7634073 (1) total: 104ms remaining: 13s
4: learn: 44.1884245 test: 43.0449422 best: 42.7634073 (1) total: 108ms remaining: 10.7s
5: learn: 43.9271426 test: 43.0432864 best: 42.7634073 (1) total: 111ms remaining: 9.13s
6: learn: 43.3937721 test: 43.2417683 best: 42.7634073 (1) total: 116ms remaining: 8.19s
7: learn: 42.9352427 test: 43.6081093 best: 42.7634073 (1) total: 119ms remaining: 7.35s
8: learn: 42.6914296 test: 43.8173266 best: 42.7634073 (1) total: 122ms remaining: 6.68s
9: learn: 41.9921409 test: 43.7553171 best: 42.7634073 (1) total: 127ms remaining: 6.22s
10: learn: 41.3418487 test: 43.6024675 best: 42.7634073 (1) total: 131ms remaining: 5.82s
11: learn: 41.0142511 test: 43.5048650 best: 42.7634073 (1) total: 134ms remaining: 5.44s
12: learn: 40.5056671 test: 43.6886770 best: 42.7634073 (1) total: 137ms remaining: 5.12s
13: learn: 40.2157179 test: 43.6707096 best: 42.7634073 (1) total: 139ms remaining: 4.83s
14: learn: 39.4477070 test: 43.9339406 best: 42.7634073 (1) total: 142ms remaining: 4.58s
15: learn: 38.6712261 test: 44.4431514 best: 42.7634073 (1) total: 145ms remaining: 4.39s
16: learn: 38.1985018 test: 44.3301780 best: 42.7634073 (1) total: 151ms remaining: 4.3s
17: learn: 37.8280146 test: 44.0791063 best: 42.7634073 (1) total: 154ms remaining: 4.13s
18: learn: 37.5405840 test: 44.1975842 best: 42.7634073 (1) total: 157ms remaining: 3.97s
19: learn: 36.9807034 test: 44.5076596 best: 42.7634073 (1) total: 159ms remaining: 3.81s
20: learn: 36.5721769 test: 44.4072676 best: 42.7634073 (1) total: 161ms remaining: 3.67s
21: learn: 36.0411183 test: 44.5712683 best: 42.7634073 (1) total: 164ms remaining: 3.56s
Stopped by overfitting detector (20 iterations wait)
bestTest = 42.76340726
bestIteration = 1
Shrink model to first 2 iterations.
<catboost.core.CatBoostRegressor at 0x7f6a6e20ad30>
# Score on the imputed matrices the model was trained on
y_train_pred = cb.predict(Z_train)
y_test_pred = cb.predict(Z_test)
print(f'The score on train set -> {r2_score(y_train, y_train_pred)}.')
print(f'The score on test set -> {r2_score(y_test, y_test_pred)}.')
The score on train set -> 0.03701180390587466.
The score on test set -> 0.021314961573651203.
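The overfitting detector fires after only two iterations, which points to a learning rate that is too aggressive for roughly 109 training rows. A gentler configuration would be the natural follow-up experiment; a sketch with hypothetical, untuned hyperparameters:

cb_slow = CatBoostRegressor(
    iterations=2000,          # give the smaller steps room to converge
    learning_rate=0.01,
    depth=4,                  # shallower trees for a small dataset
    random_seed=42,
    early_stopping_rounds=100,
    verbose=False
)
cb_slow.fit(Z_train, y_train, eval_set=(Z_test, y_test))
print(f'Retuned test R^2 -> {r2_score(y_test, cb_slow.predict(Z_test))}')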
The lasso model is chosen as the predictor because it has the highest scores on both the train and test sets. You can visit the website here.