Scrape Data For NBA Win Prediction
from gazpacho import get, Soup
import time
from tqdm import tqdm
import pandas as pd
import numpy as np
import re
import datetime as DT
The URL of the first results page (the first 100 games) for each season
# Base URL for season 2017-2018: first results page (no offset parameter),
# ordered by game date
season_17_18_base_url = "https://www.basketball-reference.com/play-index/tgl_finder.cgi?request=1&match=game&lg_id=NBA&team_seed_cmp=eq&opp_seed_cmp=eq&year_min=2018&year_max=2018&is_range=N&game_num_type=team&order_by=date_game"
# Base URL for season 2018-2019: same query with year_min/year_max set to 2019
season_18_19_base_url = "https://www.basketball-reference.com/play-index/tgl_finder.cgi?request=1&match=game&lg_id=NBA&team_seed_cmp=eq&opp_seed_cmp=eq&year_min=2019&year_max=2019&is_range=N&game_num_type=team&order_by=date_game"
def get_pbp_links(url):
    """Collect play-by-play page URLs from one page of game-finder results.

    Downloads ``url``, looks at every ``<td>`` whose ``data-stat`` attribute
    is ``date_game``, takes the link inside it and turns the box-score path
    into the play-by-play path by replacing ``scores/`` with ``scores/pbp/``.

    Parameters
    ----------
    url : str
        Address of a results page.

    Returns
    -------
    list of str
        Absolute play-by-play URLs in page order. Date cells that contain no
        link are skipped.
    """
    html = get(url)
    soup = Soup(html)
    # mode='all' always returns a list, even when the page holds a single
    # row or none at all (the automatic mode would not).
    date_cells = soup.find('td', {'data-stat': 'date_game'}, mode='all')
    pbp_links = []
    for cell in date_cells:
        anchors = cell.find('a', mode='all')
        if not anchors:
            # A date cell without a link has nothing to point at
            continue
        href = anchors[0].attrs['href'].replace('scores/', 'scores/pbp/')
        pbp_links.append("https://www.basketball-reference.com" + href)
    return pbp_links
Create a new list to store the urls
# Start the 2017-18 list with the links found on the first results page
pbp_links_17_18 = get_pbp_links(season_17_18_base_url)
Loop through the pages and insert the urls into the list created previously
# Walk through the remaining 2017-18 result pages; the offset advances in
# steps of 100 (100, 200, ..., 2600).
for offset in tqdm(range(100, 2624, 100)):
    season_17_18_url = f"https://www.basketball-reference.com/play-index/tgl_finder.cgi?request=1&player=&match=game&lg_id=NBA&year_min=2018&year_max=2018&team_id=&opp_id=&is_range=N&is_playoffs=&round_id=&best_of=&team_seed=&opp_seed=&team_seed_cmp=eq&opp_seed_cmp=eq&game_num_type=team&game_num_min=&game_num_max=&game_month=&game_location=&game_result=&is_overtime=&c1stat=&c1comp=&c1val=&c2stat=&c2comp=&c2val=&c3stat=&c3comp=&c3val=&c4stat=&c4comp=&c4val=&order_by=date_game&order_by_asc=&offset={offset}"
    links = get_pbp_links(season_17_18_url)
    pbp_links_17_18.extend(links)
    # Wait one second between requests (presumably to go easy on the site)
    time.sleep(1)
100%|██████████| 26/26 [01:13<00:00, 3.68s/it]
Although 2624 links were scraped, only half of them are unique (presumably because every game is listed once for each of its two teams)
# Number of scraped links, duplicates included
len(pbp_links_17_18)
2624
# np.unique returns the sorted unique links as a NumPy array, which replaces
# the original Python list
pbp_links_17_18 = np.unique(pbp_links_17_18)
# Number of links left after removing duplicates
len(pbp_links_17_18)
1312
Repeat the same step for season 18-19
# Start the 2018-19 list with the links found on the first results page
pbp_links_18_19 = get_pbp_links(season_18_19_base_url)
# Walk through the remaining 2018-19 result pages; only the offset in the
# URL changes, in steps of 100 (100, 200, ..., 2600).
for offset in tqdm(range(100, 2624, 100)):
    season_18_19_url = f"https://www.basketball-reference.com/play-index/tgl_finder.cgi?request=1&player=&match=game&lg_id=NBA&year_min=2019&year_max=2019&team_id=&opp_id=&is_range=N&is_playoffs=&round_id=&best_of=&team_seed=&opp_seed=&team_seed_cmp=eq&opp_seed_cmp=eq&game_num_type=team&game_num_min=&game_num_max=&game_month=&game_location=&game_result=&is_overtime=&c1stat=&c1comp=&c1val=&c2stat=&c2comp=&c2val=&c3stat=&c3comp=&c3val=&c4stat=&c4comp=&c4val=&order_by=date_game&order_by_asc=&offset={offset}"
    links = get_pbp_links(season_18_19_url)
    pbp_links_18_19.extend(links)
    # Wait one second between requests (presumably to go easy on the site)
    time.sleep(1)
100%|██████████| 26/26 [01:47<00:00, 4.37s/it]
# Number of unique links among the scraped 2018-19 links
len(np.unique(pbp_links_18_19))
1312
# np.unique returns the sorted unique links as a NumPy array, which replaces
# the original Python list
pbp_links_18_19 = np.unique(pbp_links_18_19)
# Display the de-duplicated links
pbp_links_18_19
array(['https://www.basketball-reference.com/boxscores/pbp/201810160BOS.html',
'https://www.basketball-reference.com/boxscores/pbp/201810160GSW.html',
'https://www.basketball-reference.com/boxscores/pbp/201810170CHO.html',
...,
'https://www.basketball-reference.com/boxscores/pbp/201906070GSW.html',
'https://www.basketball-reference.com/boxscores/pbp/201906100TOR.html',
'https://www.basketball-reference.com/boxscores/pbp/201906130GSW.html'],
dtype='<U68')
Define the helper functions that extract play-by-play data from each game
def get_data(url):
    """Build the score-by-time table of one game.

    Fetches the play-by-play rows through ``get_table_data``, converts each
    clock into minutes played with ``time_played``, drops duplicated rows,
    keeps only the rows that hold an actual score and splits that score into
    separate columns with ``strip_score``.

    Parameters
    ----------
    url : str
        Address of a play-by-play page.

    Returns
    -------
    pandas.DataFrame
        Columns ``time`` (minutes played), ``home`` and ``away`` (scores as
        returned by ``strip_score``), one row per distinct scored event row.
    """
    scores, times = get_table_data(url)
    df = pd.DataFrame({
        'time': times,
        'score': scores
    })
    # Keep only the "M:SS" / "MM:SS" part of the clock text
    df['time'] = df['time'].apply(lambda x: re.findall(r'[0-9]{1,2}:[0-9]{2}', x)[0])
    df['time'] = time_played(df['time'], df['score'])
    # remove duplicates
    df = df.drop_duplicates()
    # Keep genuine "away-home" score cells only. Testing for a bare '-' would
    # also keep neutral rows whose text merely contains a hyphen, and those
    # cannot be split into two scores.
    is_score = df['score'].str.match(r'^[0-9]{1,3}\s*-\s*[0-9]{1,3}$')
    # .copy() makes the filtered frame independent, so the column assignments
    # below do not write into a slice of the unfiltered frame.
    df = df[is_score].copy()
    home, away = strip_score(df['score'])
    df['home'] = home
    df['away'] = away
    df = df.drop('score', axis=1)
    return df
# Preview the first rows parsed from the first 2017-18 game. The function
# defined above is get_data; its columns match the output shown below.
get_data(pbp_links_17_18[0]).head()
time | home | away | |
---|---|---|---|
0 | 0.266667 | 0 | 2 |
1 | 0.550000 | 0 | 2 |
2 | 0.616667 | 0 | 2 |
3 | 0.650000 | 0 | 2 |
4 | 0.700000 | 0 | 2 |
What the function does:
Find the <table> tag with id="pbp",
and from that table collect every <td> with class="center".
These cells contain
either a score or a start/end-of-quarter marker.
Inside each table row, the first <td> contains the time.
Appending both to lists gives the scores and times lists.
def get_table_data(url):
    """Read the raw score and clock texts from a play-by-play page.

    Parameters
    ----------
    url : str
        Address of a play-by-play page.

    Returns
    -------
    scores : list of str
        Text of every ``<td class="center">`` inside the table with
        ``id="pbp"``.
    times : list of str
        Text of the first ``<td>`` of every table row that has one.
    """
    html = get(url)
    soup = Soup(html)
    table = soup.find('table', {'id': 'pbp'})
    # mode='all' always returns a list, whatever the number of matches
    center_cells = table.find('td', {'class': 'center'}, mode='all')
    scores = [cell.text for cell in center_cells]
    rows = table.find('tr', mode='all', strict=True)
    times = []
    for row in rows:
        try:
            first_cell = row.find('td', mode='first')
        except (IndexError, TypeError):
            # No <td> in this row
            continue
        if first_cell is None:
            # A missing <td> may also be reported as None instead of raising
            continue
        times.append(first_cell.text)
    return scores, times
# Put the clock texts and the cell texts side by side, one row per entry
# (assumes times and scores are the two lists returned by get_table_data)
df = pd.DataFrame({
'time': times,
'score': scores
})
Use a regex to extract the clock: one or two digits, a ":", and then two more digits (e.g. 11:44)
# Keep the first "M:SS" / "MM:SS" match found in each time string
df['time'] = df['time'].apply(lambda x:re.findall(r'[0-9]{1,2}:[0-9]{2}',x)[0])
# Replace the clock with the minutes played computed by time_played
df['time'] = time_played(df['time'],df['score'])
Convert the time string into a datetime object with the following code:
t = DT.datetime.strptime(t, '%M:%S')
Then subtract the converted datetime from the datetime that marks the end of the current period (for the first quarter, q1 = DT.datetime(1900,1,1,0,12))
to get a timedelta, and use total_seconds()/60 to find the minutes played
def time_played(times, scores):
    """Convert period countdown clocks into cumulative minutes played.

    The rows are processed in order. A row whose text contains
    ``Start of 2nd quarter``, ``Start of 3rd quarter``,
    ``Start of 4th quarter`` or both ``Start`` and ``overtime`` switches the
    current period. Every row, markers included, is then timed as
    ``end of current period - clock``. Quarters end at minutes 12, 24, 36
    and 48; each overtime adds 5 minutes. ``End of ...`` rows stay in the
    period they close, and any number of overtimes is supported.

    Parameters
    ----------
    times : iterable of str
        Time left in the period for each row, in ``%M:%S`` format.
    scores : iterable of str
        Text of the matching row (a score or a period marker).

    Returns
    -------
    list of float
        Minutes played, one entry per row.

    Raises
    ------
    ValueError
        If a clock does not match ``%M:%S``.
    """
    quarter_starts = (
        ('Start of 2nd quarter', 24),
        ('Start of 3rd quarter', 36),
        ('Start of 4th quarter', 48),
    )
    # strptime fills in the date 1900-01-01, so subtracting that date leaves
    # only the time shown on the clock.
    clock_origin = DT.datetime(1900, 1, 1)
    period_end = 12  # rows before any marker belong to the first quarter
    overtimes = 0
    minutes_played = []
    for clock, text in zip(times, scores):
        remaining = DT.datetime.strptime(clock, '%M:%S') - clock_origin
        if 'Start' in text and 'overtime' in text:
            overtimes += 1
            period_end = 48 + 5 * overtimes
        else:
            for marker, end_minute in quarter_starts:
                if marker in text:
                    period_end = end_minute
                    break
        elapsed = DT.timedelta(minutes=period_end) - remaining
        minutes_played.append(elapsed.total_seconds() / 60)
    return minutes_played
Use a regex to take the leading digits (up to 3, in front of the "-") as the away team's score,
then take the trailing digits (up to 3, after the "-") as the home team's score
def strip_score(score):
    """Split score strings into separate home and away lists.

    For every string the leading run of up to three digits is taken as the
    away score and the trailing run of up to three digits as the home score.
    Surrounding whitespace is ignored.

    Parameters
    ----------
    score : iterable of str
        Score texts such as ``"102-99"``.

    Returns
    -------
    home : list of str
        Trailing digits of every entry.
    away : list of str
        Leading digits of every entry.

    Raises
    ------
    ValueError
        If an entry does not start and end with a digit; the message quotes
        the offending text.
    """
    leading_digits = re.compile(r'^[0-9]{1,3}')
    trailing_digits = re.compile(r'[0-9]{1,3}$')
    home = []
    away = []
    for s in score:
        text = s.strip()
        away_match = leading_digits.search(text)
        home_match = trailing_digits.search(text)
        if away_match is None or home_match is None:
            raise ValueError(f"not a score: {s!r}")
        home.append(home_match.group())
        away.append(away_match.group())
    return home, away
Determine whether the home team won or lost and use that as the label (y) for the dataset
# Work on an independent copy so the assignments below never write into a
# slice of another DataFrame (the cause of SettingWithCopyWarning).
df = df.copy()
df['home'] = df['home'].astype(int)
df['away'] = df['away'].astype(int)
# The last row holds the final score of the game
home_win = df['home'].iloc[-1] > df['away'].iloc[-1]
# A scalar is broadcast to every row
df['home_win'] = home_win
/home/nthock/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
"""Entry point for launching an IPython kernel.
Lastly, combine the DataFrames of all games into one.
information about scores and the start/end of each quarter.
Inside each table row, the first <td> contains the time.
Appending them to lists gives the scores and times lists.
def get_table_data(url):
    """Read the raw score and clock texts from a play-by-play page.

    Parameters
    ----------
    url : str
        Address of a play-by-play page.

    Returns
    -------
    scores : list of str
        Text of every ``<td class="center">`` inside the table with
        ``id="pbp"``.
    times : list of str
        Text of the first ``<td>`` of every table row that has one.
    """
    html = get(url)
    soup = Soup(html)
    table = soup.find('table', {'id': 'pbp'})
    # mode='all' always returns a list, whatever the number of matches
    center_cells = table.find('td', {'class': 'center'}, mode='all')
    scores = [cell.text for cell in center_cells]
    rows = table.find('tr', mode='all', strict=True)
    times = []
    for row in rows:
        try:
            first_cell = row.find('td', mode='first')
        except (IndexError, TypeError):
            # No <td> in this row
            continue
        if first_cell is None:
            # A missing <td> may also be reported as None instead of raising
            continue
        times.append(first_cell.text)
    return scores, times
# Put the clock texts and the cell texts side by side, one row per entry
# (assumes times and scores are the two lists returned by get_table_data)
df = pd.DataFrame({
'time': times,
'score': scores
})
Use a regex to extract the clock: one or two digits, a ":", and then two more digits (e.g. 11:44)
# Keep the first "M:SS" / "MM:SS" match found in each time string
df['time'] = df['time'].apply(lambda x:re.findall(r'[0-9]{1,2}:[0-9]{2}',x)[0])
# Replace the clock with the minutes played computed by time_played
df['time'] = time_played(df['time'],df['score'])
Convert the time string into a datetime object with the following code:
t = DT.datetime.strptime(t, '%M:%S')
Then subtract the converted datetime from the datetime that marks the end of the current period (for the first quarter, q1 = DT.datetime(1900,1,1,0,12))
to get a timedelta, and use total_seconds()/60 to find the minutes played
def time_played(times, scores):
    """Convert period countdown clocks into cumulative minutes played.

    The rows are processed in order. A row whose text contains
    ``Start of 2nd quarter``, ``Start of 3rd quarter``,
    ``Start of 4th quarter`` or both ``Start`` and ``overtime`` switches the
    current period. Every row, markers included, is then timed as
    ``end of current period - clock``. Quarters end at minutes 12, 24, 36
    and 48; each overtime adds 5 minutes. ``End of ...`` rows stay in the
    period they close, and any number of overtimes is supported.

    Parameters
    ----------
    times : iterable of str
        Time left in the period for each row, in ``%M:%S`` format.
    scores : iterable of str
        Text of the matching row (a score or a period marker).

    Returns
    -------
    list of float
        Minutes played, one entry per row.

    Raises
    ------
    ValueError
        If a clock does not match ``%M:%S``.
    """
    quarter_starts = (
        ('Start of 2nd quarter', 24),
        ('Start of 3rd quarter', 36),
        ('Start of 4th quarter', 48),
    )
    # strptime fills in the date 1900-01-01, so subtracting that date leaves
    # only the time shown on the clock.
    clock_origin = DT.datetime(1900, 1, 1)
    period_end = 12  # rows before any marker belong to the first quarter
    overtimes = 0
    minutes_played = []
    for clock, text in zip(times, scores):
        remaining = DT.datetime.strptime(clock, '%M:%S') - clock_origin
        if 'Start' in text and 'overtime' in text:
            overtimes += 1
            period_end = 48 + 5 * overtimes
        else:
            for marker, end_minute in quarter_starts:
                if marker in text:
                    period_end = end_minute
                    break
        elapsed = DT.timedelta(minutes=period_end) - remaining
        minutes_played.append(elapsed.total_seconds() / 60)
    return minutes_played
Use a regex to take the leading digits (up to 3, in front of the "-") as the away team's score,
then take the trailing digits (up to 3, after the "-") as the home team's score
def strip_score(score):
    """Split score strings into separate home and away lists.

    For every string the leading run of up to three digits is taken as the
    away score and the trailing run of up to three digits as the home score.
    Surrounding whitespace is ignored.

    Parameters
    ----------
    score : iterable of str
        Score texts such as ``"102-99"``.

    Returns
    -------
    home : list of str
        Trailing digits of every entry.
    away : list of str
        Leading digits of every entry.

    Raises
    ------
    ValueError
        If an entry does not start and end with a digit; the message quotes
        the offending text.
    """
    leading_digits = re.compile(r'^[0-9]{1,3}')
    trailing_digits = re.compile(r'[0-9]{1,3}$')
    home = []
    away = []
    for s in score:
        text = s.strip()
        away_match = leading_digits.search(text)
        home_match = trailing_digits.search(text)
        if away_match is None or home_match is None:
            raise ValueError(f"not a score: {s!r}")
        home.append(home_match.group())
        away.append(away_match.group())
    return home, away
Determine whether the home team won or lost and use that as the label (y) for the dataset
# Work on an independent copy so the assignments below never write into a
# slice of another DataFrame (the cause of SettingWithCopyWarning).
df = df.copy()
df['home'] = df['home'].astype(int)
df['away'] = df['away'].astype(int)
# The last row holds the final score of the game
home_win = df['home'].iloc[-1] > df['away'].iloc[-1]
# A scalar is broadcast to every row
df['home_win'] = home_win
/home/nthock/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
"""Entry point for launching an IPython kernel.
Lastly, combine the DataFrames of all games into one.
Appending them to lists gives the scores and times lists.
def get_table_data(url):
    """Read the raw score and clock texts from a play-by-play page.

    Parameters
    ----------
    url : str
        Address of a play-by-play page.

    Returns
    -------
    scores : list of str
        Text of every ``<td class="center">`` inside the table with
        ``id="pbp"``.
    times : list of str
        Text of the first ``<td>`` of every table row that has one.
    """
    html = get(url)
    soup = Soup(html)
    table = soup.find('table', {'id': 'pbp'})
    # mode='all' always returns a list, whatever the number of matches
    center_cells = table.find('td', {'class': 'center'}, mode='all')
    scores = [cell.text for cell in center_cells]
    rows = table.find('tr', mode='all', strict=True)
    times = []
    for row in rows:
        try:
            first_cell = row.find('td', mode='first')
        except (IndexError, TypeError):
            # No <td> in this row
            continue
        if first_cell is None:
            # A missing <td> may also be reported as None instead of raising
            continue
        times.append(first_cell.text)
    return scores, times
# Put the clock texts and the cell texts side by side, one row per entry
# (assumes times and scores are the two lists returned by get_table_data)
df = pd.DataFrame({
'time': times,
'score': scores
})
Use a regex to extract the clock: one or two digits, a ":", and then two more digits (e.g. 11:44)
# Keep the first "M:SS" / "MM:SS" match found in each time string
df['time'] = df['time'].apply(lambda x:re.findall(r'[0-9]{1,2}:[0-9]{2}',x)[0])
# Replace the clock with the minutes played computed by time_played
df['time'] = time_played(df['time'],df['score'])
Convert the time string into a datetime object with the following code:
t = DT.datetime.strptime(t, '%M:%S')
Then subtract the converted datetime from the datetime that marks the end of the current period (for the first quarter, q1 = DT.datetime(1900,1,1,0,12))
to get a timedelta, and use total_seconds()/60 to find the minutes played
def time_played(times, scores):
    """Convert period countdown clocks into cumulative minutes played.

    The rows are processed in order. A row whose text contains
    ``Start of 2nd quarter``, ``Start of 3rd quarter``,
    ``Start of 4th quarter`` or both ``Start`` and ``overtime`` switches the
    current period. Every row, markers included, is then timed as
    ``end of current period - clock``. Quarters end at minutes 12, 24, 36
    and 48; each overtime adds 5 minutes. ``End of ...`` rows stay in the
    period they close, and any number of overtimes is supported.

    Parameters
    ----------
    times : iterable of str
        Time left in the period for each row, in ``%M:%S`` format.
    scores : iterable of str
        Text of the matching row (a score or a period marker).

    Returns
    -------
    list of float
        Minutes played, one entry per row.

    Raises
    ------
    ValueError
        If a clock does not match ``%M:%S``.
    """
    quarter_starts = (
        ('Start of 2nd quarter', 24),
        ('Start of 3rd quarter', 36),
        ('Start of 4th quarter', 48),
    )
    # strptime fills in the date 1900-01-01, so subtracting that date leaves
    # only the time shown on the clock.
    clock_origin = DT.datetime(1900, 1, 1)
    period_end = 12  # rows before any marker belong to the first quarter
    overtimes = 0
    minutes_played = []
    for clock, text in zip(times, scores):
        remaining = DT.datetime.strptime(clock, '%M:%S') - clock_origin
        if 'Start' in text and 'overtime' in text:
            overtimes += 1
            period_end = 48 + 5 * overtimes
        else:
            for marker, end_minute in quarter_starts:
                if marker in text:
                    period_end = end_minute
                    break
        elapsed = DT.timedelta(minutes=period_end) - remaining
        minutes_played.append(elapsed.total_seconds() / 60)
    return minutes_played
Use a regex to take the leading digits (up to 3, in front of the "-") as the away team's score,
then take the trailing digits (up to 3, after the "-") as the home team's score
def strip_score(score):
    """Split score strings into separate home and away lists.

    For every string the leading run of up to three digits is taken as the
    away score and the trailing run of up to three digits as the home score.
    Surrounding whitespace is ignored.

    Parameters
    ----------
    score : iterable of str
        Score texts such as ``"102-99"``.

    Returns
    -------
    home : list of str
        Trailing digits of every entry.
    away : list of str
        Leading digits of every entry.

    Raises
    ------
    ValueError
        If an entry does not start and end with a digit; the message quotes
        the offending text.
    """
    leading_digits = re.compile(r'^[0-9]{1,3}')
    trailing_digits = re.compile(r'[0-9]{1,3}$')
    home = []
    away = []
    for s in score:
        text = s.strip()
        away_match = leading_digits.search(text)
        home_match = trailing_digits.search(text)
        if away_match is None or home_match is None:
            raise ValueError(f"not a score: {s!r}")
        home.append(home_match.group())
        away.append(away_match.group())
    return home, away
Determine whether the home team won or lost and use that as the label (y) for the dataset
# Work on an independent copy so the assignments below never write into a
# slice of another DataFrame (the cause of SettingWithCopyWarning).
df = df.copy()
df['home'] = df['home'].astype(int)
df['away'] = df['away'].astype(int)
# The last row holds the final score of the game
home_win = df['home'].iloc[-1] > df['away'].iloc[-1]
# A scalar is broadcast to every row
df['home_win'] = home_win
/home/nthock/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
"""Entry point for launching an IPython kernel.