import pandas as pd
import numpy as np
from sklearn_pandas import DataFrameMapper, CategoricalImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, LabelBinarizer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
df = pd.read_csv('data/ted_main.csv')
df.isnull().sum()

comments              0
description           0
duration              0
event                 0
film_date             0
languages             0
main_speaker          0
name                  0
num_speaker           0
published_date        0
ratings               0
related_talks         0
speaker_occupation    6
tags                  0
title                 0
url                   0
views                 0
dtype: int64
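speaker_occupation is the only column with missing values (6 talks). It is not used as a model feature later, but the SimpleImputer/CategoricalImputer imports above suggest a fill was considered; a minimal sketch, assuming a constant placeholder label, would be:

# Hypothetical cleanup (not in the original notebook): fill the 6 missing occupations
df['speaker_occupation'] = df['speaker_occupation'].fillna('Unknown')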
df.head(3)

(Output: the first rows of the 17-column frame. Row 0 is Ken Robinson's "Do schools kill creativity?" with 47,227,110 views, row 1 is Al Gore's "Averting the climate crisis" with 3,200,520 views, and row 2 is David Pogue's "Simplicity sells" with 1,636,292 views, all from TED2006.)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2550 entries, 0 to 2549
Data columns (total 17 columns):
comments              2550 non-null int64
description           2550 non-null object
duration              2550 non-null int64
event                 2550 non-null object
film_date             2550 non-null int64
languages             2550 non-null int64
main_speaker          2550 non-null object
name                  2550 non-null object
num_speaker           2550 non-null int64
published_date        2550 non-null int64
ratings               2550 non-null object
related_talks         2550 non-null object
speaker_occupation    2544 non-null object
tags                  2550 non-null object
title                 2550 non-null object
url                   2550 non-null object
views                 2550 non-null int64
dtypes: int64(7), object(10)
memory usage: 338.8+ KB
# Use the median view count as the virality threshold: talks above it are labeled 1, the rest 0
median = np.median(df['views'])
median
df['view'] = (df['views'] > median).astype(int)
df.head()

(Output: the same preview as above with the new binary view column appended; all five of the earliest talks shown, including Majora Carter's "Greening the ghetto" and Hans Rosling's "The best stats you've ever seen", are labeled 1.)
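Because the label comes from a median split, the two classes should be close to 50/50; a quick check (not in the original notebook):

# Share of talks above vs. at or below the median view count
df['view'].value_counts(normalize=True)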
# Convert the Unix publish timestamp to a calendar date, then pull out day-of-week and month names
df['clean_date'] = df['published_date'].apply(lambda x: datetime.datetime.utcfromtimestamp(x).date())
df['clean_date'] = pd.to_datetime(df['clean_date'])
df['published_day'] = df['clean_date'].dt.day_name()
df['published_month'] = df['clean_date'].dt.month_name()
# Mean views by day of the week the talk was published
plt.figure(figsize=(16, 6))
sns.barplot(x='published_day', y='views', data=df, order=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']);
# Mean views by month the talk was published
plt.figure(figsize=(16, 6))
sns.barplot(x='published_month', y='views', data=df, order=['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']);
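By default seaborn's barplot shows the mean views per category with bootstrapped confidence intervals; the same aggregation can be read off numerically with a groupby, shown here as a small aside:

# Numeric counterpart of the bar plots above
df.groupby('published_day')['views'].mean().sort_values(ascending=False)
df.groupby('published_month')['views'].mean().sort_values(ascending=False)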
df.columns

Index(['comments', 'description', 'duration', 'event', 'film_date',
'languages', 'main_speaker', 'name', 'num_speaker', 'published_date',
'ratings', 'related_talks', 'speaker_occupation', 'tags', 'title',
'url', 'views', 'view', 'clean_date', 'published_day',
'published_month'],
dtype='object')
X = df[['description', 'duration', 'languages', 'published_day', 'published_month', 'tags', 'title']]
y = df['view']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
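The split above is purely random; since the target is a near-balanced binary label, a stratified variant (an alternative, not what this notebook uses) would preserve the class ratio in both sets:

# Alternative split, not used below
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y)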
from sklearn.feature_extraction.text import TfidfVectorizer

# Text columns get TF-IDF features, duration/languages are standardized, and
# the publication day and month are one-hot encoded with LabelBinarizer.
mapper = DataFrameMapper([
    ('description', [TfidfVectorizer(stop_words='english')]),
    (['duration'], StandardScaler()),
    (['languages'], StandardScaler()),
    (['published_day'], LabelBinarizer()),
    (['published_month'], LabelBinarizer()),
    ('tags', [TfidfVectorizer(stop_words='english')]),
    ('title', [TfidfVectorizer(stop_words='english')]),
], df_out=True)
(Output: the DataFrameMapper repr echoed by the notebook, listing each transformer together with the default TfidfVectorizer parameters; truncated in the original.)
# Fit the mapper on the training set only, then apply the same transformation to the test set
Z_train = mapper.fit_transform(X_train)
Z_test = mapper.transform(X_test)
Z_train.head()
(Output: a preview of the mapped training frame, 5 rows × 15296 columns, mostly zero-valued TF-IDF columns ranging from description_000 to title_zulu, alongside the scaled numeric and binarized day/month columns.)
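To see where the 15,296 columns come from, the fitted mapper records the generated column names; a small check, assuming sklearn-pandas' transformed_names_ attribute:

# Number of mapped output columns and a few example names
print(len(mapper.transformed_names_))
print(mapper.transformed_names_[:5])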
log_model = LogisticRegression(solver="liblinear")
log_model.fit(Z_train, y_train)
# y_pred = log_model.predict(Z_test)
# print(accuracy_score(y_test, y_pred))  # would need sklearn.metrics.accuracy_score
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=100,
multi_class='warn', n_jobs=None, penalty='l2',
random_state=None, solver='liblinear', tol=0.0001, verbose=0,
warm_start=False)
# Train vs. test accuracy; the gap below suggests the model overfits the sparse TF-IDF features
print(log_model.score(Z_train, y_train))
print(log_model.score(Z_test, y_test))
0.9549180327868853
0.7292161520190024
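The roughly 0.95 train versus 0.73 test accuracy points to overfitting on the sparse TF-IDF features; a cross-validation check (not in the original notebook) gives a less optimistic estimate than the training score:

# 5-fold cross-validated accuracy on the training features (sketch)
from sklearn.model_selection import cross_val_score
print(cross_val_score(LogisticRegression(solver="liblinear"), Z_train, y_train, cv=5).mean())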
CatBoost model

import catboost as cb
model = cb.CatBoostClassifier()
from catboost import CatBoostClassifier
# Stop training early if the eval_set metric has not improved for 20 rounds
model = CatBoostClassifier(early_stopping_rounds=20,
                           iterations=100,
                           random_seed=42,
                           learning_rate=0.5,
                           custom_loss=['AUC', 'Accuracy'])
model.fit(
Z_train, y_train,
eval_set=(Z_test, y_test),
verbose=False,
)
<catboost.core.CatBoostClassifier at 0x7feb2452dda0>
{'learn': {'Accuracy': 0.8905152224824356, 'Logloss': 0.346167848570717},
'validation': {'Accuracy': 0.6793349168646081,
'Logloss': 0.5950624788419,
'AUC': 0.7483211207262959}}
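The dictionary above matches the structure CatBoost returns from get_best_score(); assuming that is the call behind this output, the validation metrics can be pulled out directly:

# Best metrics recorded on the eval_set (assumes get_best_score() produced the dict above)
best = model.get_best_score()
print(best['validation']['AUC'], best['validation']['Accuracy'])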
from sklearn.model_selection import GridSearchCV

log_reg_params = {
    'penalty': ['l1', 'l2'],
    'C': [0.1, 0.5, 1.0, 5, 10],
    'random_state': [42]
}
gs_log_reg = GridSearchCV(LogisticRegression(solver="liblinear"), param_grid=log_reg_params, cv=5)
gs_log_reg.fit(Z_train, y_train)
GridSearchCV(cv=5, error_score='raise-deprecating',
estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
fit_intercept=True,
intercept_scaling=1, l1_ratio=None,
max_iter=100, multi_class='warn',
n_jobs=None, penalty='l2',
random_state=None, solver='liblinear',
tol=0.0001, verbose=0,
warm_start=False),
iid='warn', n_jobs=None,
param_grid={'C': [0.1, 0.5, 1.0, 5, 10], 'penalty': ['l1', 'l2'],
'random_state': [42]},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring=None, verbose=0)
best_est = gs_log_reg.best_estimator_
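Before building the final pipeline, it is worth recording which hyperparameters the grid search picked; a short check using GridSearchCV's standard attributes:

# Winning hyperparameters and the mean cross-validated accuracy
print(gs_log_reg.best_params_)
print(gs_log_reg.best_score_)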
# Save the fitted pipeline (mapper + best logistic regression) to a pickle file for the Flask app
pipe = make_pipeline(mapper, best_est)
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)
with open('pipe.pkl', 'wb') as f:
    pickle.dump(pipe, f)
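A minimal sketch of how the pickled pipeline might be loaded and served in the Flask app mentioned below; the route, field names, and file layout here are assumptions, not the deployed app's actual code:

# app.py (hypothetical): load the pickled pipeline and expose a predict endpoint
import pickle
import pandas as pd
from flask import Flask, request, jsonify

app = Flask(__name__)
with open('pipe.pkl', 'rb') as f:
    pipe = pickle.load(f)

@app.route('/predict', methods=['POST'])
def predict():
    # Expect one JSON record with the same columns used for training
    record = pd.DataFrame([request.get_json()],
                          columns=['description', 'duration', 'languages',
                                   'published_day', 'published_month', 'tags', 'title'])
    return jsonify({'viral': int(pipe.predict(record)[0])})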
You can find the TED Talk virality predictor app here.