TED Talk Viral Predictor

import pandas as pd
import numpy as np
from sklearn_pandas import DataFrameMapper, CategoricalImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, LabelBinarizer, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Load the TED talks dataset from a CSV path relative to the working directory.
df = pd.read_csv('data/ted_main.csv')

# Count missing values per column.
df.isnull().sum()

comments              0
description           0
duration              0
event                 0
film_date             0
languages             0
main_speaker          0
name                  0
num_speaker           0
published_date        0
ratings               0
related_talks         0
speaker_occupation    6
tags                  0
title                 0
url                   0
views                 0
dtype: int64

# Drop rows that contain any null value.
# DataFrame.dropna() returns a new frame instead of modifying df in place,
# so the result has to be assigned back for the drop to take effect.
df = df.dropna()

# Preview the first three rows.
df.head(3)

	comments	description	duration	event	film_date	languages	main_speaker	name	num_speaker	published_date	ratings	related_talks	speaker_occupation	tags	title	url	views
0	4553	Sir Ken Robinson makes an entertaining and pro...	1164	TED2006	1140825600	60	Ken Robinson	Ken Robinson: Do schools kill creativity?	1	1151367060	[{'id': 7, 'name': 'Funny', 'count': 19645}, {...	[{'id': 865, 'hero': 'https://pe.tedcdn.com/im...	Author/educator	['children', 'creativity', 'culture', 'dance',...	Do schools kill creativity?	https://www.ted.com/talks/ken_robinson_says_sc...	47227110
1	265	With the same humor and humanity he exuded in ...	977	TED2006	1140825600	43	Al Gore	Al Gore: Averting the climate crisis	1	1151367060	[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...	[{'id': 243, 'hero': 'https://pe.tedcdn.com/im...	Climate advocate	['alternative energy', 'cars', 'climate change...	Averting the climate crisis	https://www.ted.com/talks/al_gore_on_averting_...	3200520
2	124	New York Times columnist David Pogue takes aim...	1286	TED2006	1140739200	26	David Pogue	David Pogue: Simplicity sells	1	1151367060	[{'id': 7, 'name': 'Funny', 'count': 964}, {'i...	[{'id': 1725, 'hero': 'https://pe.tedcdn.com/i...	Technology columnist	['computers', 'entertainment', 'interface desi...	Simplicity sells	https://www.ted.com/talks/david_pogue_says_sim...	1636292

# Print per-column non-null counts and dtypes plus overall memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2550 entries, 0 to 2549
Data columns (total 17 columns):
comments              2550 non-null int64
description           2550 non-null object
duration              2550 non-null int64
event                 2550 non-null object
film_date             2550 non-null int64
languages             2550 non-null int64
main_speaker          2550 non-null object
name                  2550 non-null object
num_speaker           2550 non-null int64
published_date        2550 non-null int64
ratings               2550 non-null object
related_talks         2550 non-null object
speaker_occupation    2544 non-null object
tags                  2550 non-null object
title                 2550 non-null object
url                   2550 non-null object
views                 2550 non-null int64
dtypes: int64(7), object(10)
memory usage: 338.8+ KB

# Virality threshold: the median of the `views` column over every row in df.
median = np.median(df['views'])
median

# Binary target: 1 when a talk's views are strictly above the median, else 0.
# A vectorised comparison replaces the row-wise apply/lambda and yields the
# same 0/1 integer column.
df['view'] = (df['views'] > median).astype('int64')

# Preview the first rows, which now include the binary `view` label column.
df.head()

	comments	description	duration	event	film_date	languages	main_speaker	name	num_speaker	published_date	ratings	related_talks	speaker_occupation	tags	title	url	views	view
0	4553	Sir Ken Robinson makes an entertaining and pro...	1164	TED2006	1140825600	60	Ken Robinson	Ken Robinson: Do schools kill creativity?	1	1151367060	[{'id': 7, 'name': 'Funny', 'count': 19645}, {...	[{'id': 865, 'hero': 'https://pe.tedcdn.com/im...	Author/educator	['children', 'creativity', 'culture', 'dance',...	Do schools kill creativity?	https://www.ted.com/talks/ken_robinson_says_sc...	47227110	1
1	265	With the same humor and humanity he exuded in ...	977	TED2006	1140825600	43	Al Gore	Al Gore: Averting the climate crisis	1	1151367060	[{'id': 7, 'name': 'Funny', 'count': 544}, {'i...	[{'id': 243, 'hero': 'https://pe.tedcdn.com/im...	Climate advocate	['alternative energy', 'cars', 'climate change...	Averting the climate crisis	https://www.ted.com/talks/al_gore_on_averting_...	3200520	1
2	124	New York Times columnist David Pogue takes aim...	1286	TED2006	1140739200	26	David Pogue	David Pogue: Simplicity sells	1	1151367060	[{'id': 7, 'name': 'Funny', 'count': 964}, {'i...	[{'id': 1725, 'hero': 'https://pe.tedcdn.com/i...	Technology columnist	['computers', 'entertainment', 'interface desi...	Simplicity sells	https://www.ted.com/talks/david_pogue_says_sim...	1636292	1
3	200	In an emotionally charged talk, MacArthur-winn...	1116	TED2006	1140912000	35	Majora Carter	Majora Carter: Greening the ghetto	1	1151367060	[{'id': 3, 'name': 'Courageous', 'count': 760}...	[{'id': 1041, 'hero': 'https://pe.tedcdn.com/i...	Activist for environmental justice	['MacArthur grant', 'activism', 'business', 'c...	Greening the ghetto	https://www.ted.com/talks/majora_carter_s_tale...	1697550	1
4	593	You've never seen data presented like this. Wi...	1190	TED2006	1140566400	48	Hans Rosling	Hans Rosling: The best stats you've ever seen	1	1151440680	[{'id': 9, 'name': 'Ingenious', 'count': 3202}...	[{'id': 2056, 'hero': 'https://pe.tedcdn.com/i...	Global health expert; data visionary	['Africa', 'Asia', 'Google', 'demo', 'economic...	The best stats you've ever seen	https://www.ted.com/talks/hans_rosling_shows_t...	12005869	1

# Derive calendar features from the Unix-epoch `published_date` column.
# pd.to_datetime(..., unit='s') reads the integers as seconds since the epoch
# (UTC) and returns naive timestamps; this replaces the per-row
# datetime.utcfromtimestamp call, which is deprecated since Python 3.12.
# .dt.normalize() truncates to midnight so `clean_date` keeps only the date.
df['clean_date'] = pd.to_datetime(df['published_date'], unit='s').dt.normalize()

# Weekday and month names (e.g. 'Monday', 'January') via the vectorised
# .dt accessor instead of Python-level list comprehensions.
df['published_day'] = df['clean_date'].dt.day_name()

df['published_month'] = df['clean_date'].dt.month_name()

# Bar chart of `views` grouped by publication weekday, with the bars laid out
# Monday-first rather than in order of appearance.
weekday_order = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday',
]
plt.figure(figsize=(16, 6))
sns.barplot(data=df, x='published_day', y='views', order=weekday_order);

png

# Bar chart of `views` grouped by publication month, with the bars laid out in
# calendar order.
month_order = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
plt.figure(figsize=(16, 6))
sns.barplot(data=df, x='published_month', y='views', order=month_order);

png

# List every column name currently in df.
df.columns

Index(['comments', 'description', 'duration', 'event', 'film_date',
       'languages', 'main_speaker', 'name', 'num_speaker', 'published_date',
       'ratings', 'related_talks', 'speaker_occupation', 'tags', 'title',
       'url', 'views', 'view', 'clean_date', 'published_day',
       'published_month'],
      dtype='object')

# Model inputs: text fields, numeric fields and the derived calendar names.
feature_columns = [
    'description',
    'duration',
    'languages',
    'published_day',
    'published_month',
    'tags',
    'title',
]
X = df[feature_columns]
# Target: the binary above-median `view` label.
y = df['view']

# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

from sklearn.feature_extraction.text import TfidfVectorizer

# One (column selector, transformer) pair per input feature:
#   - free-text columns get TF-IDF with English stop words removed,
#   - numeric columns are standardised,
#   - weekday / month names are binarised.
feature_transformers = [
    ('description', [TfidfVectorizer(stop_words='english')]),
    (['duration'], StandardScaler()),
    (['languages'], StandardScaler()),
    (['published_day'], LabelBinarizer()),
    (['published_month'], LabelBinarizer()),
    ('tags', [TfidfVectorizer(stop_words='english')]),
    ('title', [TfidfVectorizer(stop_words='english')]),
]
# df_out=True makes the mapper return a DataFrame rather than a bare array.
mapper = DataFrameMapper(feature_transformers, df_out=True)

# Fit the mapper's transformers on the training features only.
# NOTE(review): mapper.fit_transform(X_train) is called again further down,
# so this separate fit looks redundant.
mapper.fit(X_train)

DataFrameMapper(default=False, df_out=True,
                features=[('description',
                           [TfidfVectorizer(analyzer='word', binary=False,
                                            decode_error='strict',
                                            dtype=<class 'numpy.float64'>,
                                            encoding='utf-8', input='content',
                                            lowercase=True, max_df=1.0,
                                            max_features=None, min_df=1,
                                            ngram_range=(1, 1), norm='l2',
                                            preprocessor=None, smooth_idf=True,
                                            stop_words='english',
                                            strip_accents=Non...
                                            decode_error='strict',
                                            dtype=<class 'numpy.float64'>,
                                            encoding='utf-8', input='content',
                                            lowercase=True, max_df=1.0,
                                            max_features=None, min_df=1,
                                            ngram_range=(1, 1), norm='l2',
                                            preprocessor=None, smooth_idf=True,
                                            stop_words='english',
                                            strip_accents=None,
                                            sublinear_tf=False,
                                            token_pattern='(?u)\\b\\w\\w+\\b',
                                            tokenizer=None, use_idf=True,
                                            vocabulary=None)])],
                input_df=False, sparse=False)

# Fit the mapper on the training features and transform them, then apply the
# already-fitted mapper to the test features.
Z_train = mapper.fit_transform(X_train)
Z_test = mapper.transform(X_test)
# Preview the transformed training matrix.
Z_train.head()

	description_10	...
591	0.000000	...
462	0.000000	...
1036	0.153945	...
45	0.000000	...
1268	0.000000	...

5 rows × 15296 columns

# Baseline classifier: logistic regression with the liblinear solver, fitted
# on the transformed training features.
log_model = LogisticRegression(solver="liblinear")
log_model.fit(Z_train, y_train)
# NOTE(review): the commented-out lines below refer to `model` and
# `accuracy_score`; neither is defined or imported above this point in the
# visible source, so they would not run as written.
# y_pred = model.predict(Z_test)
# print(accuracy_score(y_test, y_pred))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

# Print the model's score() on the training features, then on the held-out
# test features, to compare fit against generalisation.
print(log_model.score(Z_train, y_train))
print(log_model.score(Z_test, y_test))

0.9549180327868853
0.7292161520190024

import catboost as cb
# Classifier with default settings.
# NOTE(review): `model` is reassigned to a configured CatBoostClassifier in the
# next code cell, so this instance appears to be unused.
model = cb.CatBoostClassifier()

CatBoost model

from catboost import CatBoostClassifier

# Training configuration, collected in one place.
catboost_params = {
    'early_stopping_rounds': 20,
    'iterations': 100,
    'random_seed': 42,
    'learning_rate': 0.5,
    'custom_loss': ['AUC', 'Accuracy'],
}
model = CatBoostClassifier(**catboost_params)

# NOTE(review): the held-out test split is passed as the eval set while
# early stopping is enabled, so the reported validation metrics may be
# optimistic - consider a separate validation split.
model.fit(Z_train, y_train, eval_set=(Z_test, y_test), verbose=False)

<catboost.core.CatBoostClassifier at 0x7feb2452dda0>

# Show the best metric values recorded for the learn and validation sets.
model.best_score_

{'learn': {'Accuracy': 0.8905152224824356, 'Logloss': 0.346167848570717},
 'validation': {'Accuracy': 0.6793349168646081,
  'Logloss': 0.5950624788419,
  'AUC': 0.7483211207262959}}

from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the logistic regression: both penalty types across
# a range of C values, with the random seed pinned to a single value.
log_reg_params = dict(
    penalty=['l1', 'l2'],
    C=[0.1, 0.5, 1.0, 5, 10],
    random_state=[42],
)

# Exhaustive search over the grid with 5-fold cross-validation.
gs_log_reg = GridSearchCV(
    estimator=LogisticRegression(solver="liblinear"),
    param_grid=log_reg_params,
    cv=5,
)

gs_log_reg.fit(Z_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='liblinear',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [0.1, 0.5, 1.0, 5, 10], 'penalty': ['l1', 'l2'],
                         'random_state': [42]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

# Keep the estimator that the grid search refit with its best parameter combination.
best_est = gs_log_reg.best_estimator_

# Save the model into a pickle file for Flask app
# Chain the feature mapper with the tuned classifier so the saved object
# accepts the raw feature columns directly.
pipe = make_pipeline(mapper, best_est)
pipe.fit(X_train, y_train)
# Print the held-out score; as a bare expression in the middle of the cell the
# value was computed and then silently discarded.
print(pipe.score(X_test, y_test))
# A context manager guarantees the file handle is flushed and closed, which
# the previous bare open(...) call never did.
with open('pipe.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)

TED Talk Viral Predictor

You can find the TED Talk virality predictor app here