TED Talk Viral Predictor

import pandas as pd
import numpy as np
from sklearn_pandas import DataFrameMapper, CategoricalImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, LabelBinarizer, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
df = pd.read_csv('data/ted_main.csv')
df.isnull().sum()
comments              0
description           0
duration              0
event                 0
film_date             0
languages             0
main_speaker          0
name                  0
num_speaker           0
published_date        0
ratings               0
related_talks         0
speaker_occupation    6
tags                  0
title                 0
url                   0
views                 0
dtype: int64
# Drop null
df.dropna()
df.head(3)
comments description duration event film_date languages main_speaker name num_speaker published_date ratings related_talks speaker_occupation tags title url views
0 4553 Sir Ken Robinson makes an entertaining and pro... 1164 TED2006 1140825600 60 Ken Robinson Ken Robinson: Do schools kill creativity? 1 1151367060 [{'id': 7, 'name': 'Funny', 'count': 19645}, {... [{'id': 865, 'hero': 'https://pe.tedcdn.com/im... Author/educator ['children', 'creativity', 'culture', 'dance',... Do schools kill creativity? https://www.ted.com/talks/ken_robinson_says_sc... 47227110
1 265 With the same humor and humanity he exuded in ... 977 TED2006 1140825600 43 Al Gore Al Gore: Averting the climate crisis 1 1151367060 [{'id': 7, 'name': 'Funny', 'count': 544}, {'i... [{'id': 243, 'hero': 'https://pe.tedcdn.com/im... Climate advocate ['alternative energy', 'cars', 'climate change... Averting the climate crisis https://www.ted.com/talks/al_gore_on_averting_... 3200520
2 124 New York Times columnist David Pogue takes aim... 1286 TED2006 1140739200 26 David Pogue David Pogue: Simplicity sells 1 1151367060 [{'id': 7, 'name': 'Funny', 'count': 964}, {'i... [{'id': 1725, 'hero': 'https://pe.tedcdn.com/i... Technology columnist ['computers', 'entertainment', 'interface desi... Simplicity sells https://www.ted.com/talks/david_pogue_says_sim... 1636292
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2550 entries, 0 to 2549
Data columns (total 17 columns):
comments              2550 non-null int64
description           2550 non-null object
duration              2550 non-null int64
event                 2550 non-null object
film_date             2550 non-null int64
languages             2550 non-null int64
main_speaker          2550 non-null object
name                  2550 non-null object
num_speaker           2550 non-null int64
published_date        2550 non-null int64
ratings               2550 non-null object
related_talks         2550 non-null object
speaker_occupation    2544 non-null object
tags                  2550 non-null object
title                 2550 non-null object
url                   2550 non-null object
views                 2550 non-null int64
dtypes: int64(7), object(10)
memory usage: 338.8+ KB
# Using median of views as a marker for video virality
median = (np.median(df['views']))
median

df['view'] = df.apply(lambda x: 1 if x['views'] > median else 0, axis=1)
df.head()
comments description duration event film_date languages main_speaker name num_speaker published_date ratings related_talks speaker_occupation tags title url views view
0 4553 Sir Ken Robinson makes an entertaining and pro... 1164 TED2006 1140825600 60 Ken Robinson Ken Robinson: Do schools kill creativity? 1 1151367060 [{'id': 7, 'name': 'Funny', 'count': 19645}, {... [{'id': 865, 'hero': 'https://pe.tedcdn.com/im... Author/educator ['children', 'creativity', 'culture', 'dance',... Do schools kill creativity? https://www.ted.com/talks/ken_robinson_says_sc... 47227110 1
1 265 With the same humor and humanity he exuded in ... 977 TED2006 1140825600 43 Al Gore Al Gore: Averting the climate crisis 1 1151367060 [{'id': 7, 'name': 'Funny', 'count': 544}, {'i... [{'id': 243, 'hero': 'https://pe.tedcdn.com/im... Climate advocate ['alternative energy', 'cars', 'climate change... Averting the climate crisis https://www.ted.com/talks/al_gore_on_averting_... 3200520 1
2 124 New York Times columnist David Pogue takes aim... 1286 TED2006 1140739200 26 David Pogue David Pogue: Simplicity sells 1 1151367060 [{'id': 7, 'name': 'Funny', 'count': 964}, {'i... [{'id': 1725, 'hero': 'https://pe.tedcdn.com/i... Technology columnist ['computers', 'entertainment', 'interface desi... Simplicity sells https://www.ted.com/talks/david_pogue_says_sim... 1636292 1
3 200 In an emotionally charged talk, MacArthur-winn... 1116 TED2006 1140912000 35 Majora Carter Majora Carter: Greening the ghetto 1 1151367060 [{'id': 3, 'name': 'Courageous', 'count': 760}... [{'id': 1041, 'hero': 'https://pe.tedcdn.com/i... Activist for environmental justice ['MacArthur grant', 'activism', 'business', 'c... Greening the ghetto https://www.ted.com/talks/majora_carter_s_tale... 1697550 1
4 593 You've never seen data presented like this. Wi... 1190 TED2006 1140566400 48 Hans Rosling Hans Rosling: The best stats you've ever seen 1 1151440680 [{'id': 9, 'name': 'Ingenious', 'count': 3202}... [{'id': 2056, 'hero': 'https://pe.tedcdn.com/i... Global health expert; data visionary ['Africa', 'Asia', 'Google', 'demo', 'economic... The best stats you've ever seen https://www.ted.com/talks/hans_rosling_shows_t... 12005869 1
df['clean_date'] = df['published_date'].apply(lambda x: datetime.datetime.utcfromtimestamp(x).date())
df['clean_date'] = pd.to_datetime(df['clean_date'])
df['published_day'] = [d.day_name() for d in df['clean_date']]
df['published_month'] = [d.month_name() for d in df['clean_date']]
plt.figure(figsize=(16, 6))
sns.barplot(x= 'published_day', y='views', data=df, order=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']);

png

plt.figure(figsize=(16, 6))
sns.barplot(x='published_month', y='views', data=df, order=['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']);

png

df.columns
Index(['comments', 'description', 'duration', 'event', 'film_date',
       'languages', 'main_speaker', 'name', 'num_speaker', 'published_date',
       'ratings', 'related_talks', 'speaker_occupation', 'tags', 'title',
       'url', 'views', 'view', 'clean_date', 'published_day',
       'published_month'],
      dtype='object')
X = df[['description', 'duration', 'languages', 'published_day', 'published_month', 'tags', 'title']]
y = df['view']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state=42)
from sklearn.feature_extraction.text import TfidfVectorizer
mapper = DataFrameMapper([('description',[TfidfVectorizer(stop_words='english')]),
                          (['duration'], StandardScaler()),
                          (['languages'],StandardScaler()),
                          (['published_day'],LabelBinarizer()),
                          (['published_month'],LabelBinarizer()),
                          ('tags',[TfidfVectorizer(stop_words='english')]),
                          ('title',[TfidfVectorizer(stop_words='english')])], df_out=True)
mapper.fit(X_train)
DataFrameMapper(default=False, df_out=True,
                features=[('description',
                           [TfidfVectorizer(analyzer='word', binary=False,
                                            decode_error='strict',
                                            dtype=<class 'numpy.float64'>,
                                            encoding='utf-8', input='content',
                                            lowercase=True, max_df=1.0,
                                            max_features=None, min_df=1,
                                            ngram_range=(1, 1), norm='l2',
                                            preprocessor=None, smooth_idf=True,
                                            stop_words='english',
                                            strip_accents=Non...
                                            decode_error='strict',
                                            dtype=<class 'numpy.float64'>,
                                            encoding='utf-8', input='content',
                                            lowercase=True, max_df=1.0,
                                            max_features=None, min_df=1,
                                            ngram_range=(1, 1), norm='l2',
                                            preprocessor=None, smooth_idf=True,
                                            stop_words='english',
                                            strip_accents=None,
                                            sublinear_tf=False,
                                            token_pattern='(?u)\\b\\w\\w+\\b',
                                            tokenizer=None, use_idf=True,
                                            vocabulary=None)])],
                input_df=False, sparse=False)
Z_train = mapper.fit_transform(X_train)
Z_test = mapper.transform(X_test)
Z_train.head()
description_000 description_05049 description_08 description_10 description_100 description_1000 description_100mph description_10x description_11 description_110 ... title_young title_youth title_youtube title_yup title_zap title_zero title_zika title_zodiac title_zone title_zulu
591 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
462 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1036 0.0 0.0 0.0 0.153945 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
45 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1268 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 15296 columns

log_model = LogisticRegression(solver="liblinear")
log_model.fit(Z_train, y_train)
# y_pred = model.predict(Z_test)
# print(accuracy_score(y_test, y_pred))
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)
print(log_model.score(Z_train, y_train))
print(log_model.score(Z_test, y_test))
0.9549180327868853
0.7292161520190024
import catboost as cb
model = cb.CatBoostClassifier()

Catboost model

from catboost import CatBoostClassifier
model = CatBoostClassifier(early_stopping_rounds= 20,
    iterations=100,
    random_seed=42,
    learning_rate=0.5,
    custom_loss=['AUC', 'Accuracy']
)
model.fit(
    Z_train, y_train,
    eval_set=(Z_test, y_test),
    verbose=False,
    )
<catboost.core.CatBoostClassifier at 0x7feb2452dda0>
model.best_score_
{'learn': {'Accuracy': 0.8905152224824356, 'Logloss': 0.346167848570717},
 'validation': {'Accuracy': 0.6793349168646081,
  'Logloss': 0.5950624788419,
  'AUC': 0.7483211207262959}}
from sklearn.model_selection import GridSearchCV

log_reg_params = {
    'penalty' : ['l1', 'l2'],
    'C': [0.1, 0.5, 1.0, 5, 10],
    'random_state' : [42]
}

gs_log_reg=GridSearchCV(LogisticRegression(solver="liblinear"),param_grid = log_reg_params, cv = 5)
gs_log_reg.fit(Z_train, y_train)
GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='liblinear',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.1, 0.5, 1.0, 5, 10], 'penalty': ['l1', 'l2'],
                         'random_state': [42]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)
best_est = gs_log_reg.best_estimator_
# Save the model into a pickle file for Flask app
pipe = make_pipeline(mapper, best_est)
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)
pickle.dump(pipe, open('pipe.pkl', 'wb'))

You can find the TED Talk virality predictor app here