In [109]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import pandas as pd
import seaborn as sns
In [111]:
players_df = pd.read_csv('players.csv')
salaries_df = pd.read_csv('salaries_1985to2018.csv')
data_df = pd.merge(players_df, salaries_df, on= 'player_id')
In [113]:
data_df.head()
Out[113]:
| index_x | player_id | birthDate | birthPlace | career_AST | career_FG% | career_FG3% | career_FT% | career_G | career_PER | ... | position | shoots | weight | index_y | league | salary | season | season_end | season_start | team | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | abdelal01 | 24-Jun-68 | Cairo, Egypt | 0.3 | 50.2 | 0 | 70.1 | 256 | 13 | ... | Power Forward | Right | 240lb | 0 | NBA | 395000 | 1990-91 | 1991 | 1990 | Portland Trail Blazers |
| 1 | 0 | abdelal01 | 24-Jun-68 | Cairo, Egypt | 0.3 | 50.2 | 0 | 70.1 | 256 | 13 | ... | Power Forward | Right | 240lb | 1 | NBA | 494000 | 1991-92 | 1992 | 1991 | Portland Trail Blazers |
| 2 | 0 | abdelal01 | 24-Jun-68 | Cairo, Egypt | 0.3 | 50.2 | 0 | 70.1 | 256 | 13 | ... | Power Forward | Right | 240lb | 2 | NBA | 500000 | 1992-93 | 1993 | 1992 | Boston Celtics |
| 3 | 0 | abdelal01 | 24-Jun-68 | Cairo, Egypt | 0.3 | 50.2 | 0 | 70.1 | 256 | 13 | ... | Power Forward | Right | 240lb | 3 | NBA | 805000 | 1993-94 | 1994 | 1993 | Boston Celtics |
| 4 | 0 | abdelal01 | 24-Jun-68 | Cairo, Egypt | 0.3 | 50.2 | 0 | 70.1 | 256 | 13 | ... | Power Forward | Right | 240lb | 4 | NBA | 650000 | 1994-95 | 1995 | 1994 | Sacramento Kings |
5 rows × 32 columns
In [114]:
data_df.columns
Out[114]:
Index(['index_x', 'player_id', 'birthDate', 'birthPlace', 'career_AST',
'career_FG%', 'career_FG3%', 'career_FT%', 'career_G', 'career_PER',
'career_PTS', 'career_TRB', 'career_WS', 'career_eFG%', 'college',
'draft_pick', 'draft_round', 'draft_team', 'draft_year', 'height',
'highSchool', 'name', 'position', 'shoots', 'weight', 'index_y',
'league', 'salary', 'season', 'season_end', 'season_start', 'team'],
dtype='object')
In [117]:
data_df.isna().sum()
Out[117]:
index_x 0 player_id 0 birthDate 0 birthPlace 0 career_AST 0 career_FG% 0 career_FG3% 0 career_FT% 0 career_G 0 career_PER 0 career_PTS 0 career_TRB 0 career_WS 0 career_eFG% 0 college 1636 draft_pick 1902 draft_round 1902 draft_team 1902 draft_year 1902 height 0 highSchool 989 name 0 position 0 shoots 0 weight 0 index_y 0 league 0 salary 0 season 0 season_end 0 season_start 0 team 4 dtype: int64
In [119]:
cleaned_data_df = data_df.drop(['college', 'draft_pick', 'draft_round', 'draft_team', 'draft_year', 'highSchool'], axis = 1)
In [121]:
cleaned_data_df.isna().sum()
Out[121]:
index_x 0 player_id 0 birthDate 0 birthPlace 0 career_AST 0 career_FG% 0 career_FG3% 0 career_FT% 0 career_G 0 career_PER 0 career_PTS 0 career_TRB 0 career_WS 0 career_eFG% 0 height 0 name 0 position 0 shoots 0 weight 0 index_y 0 league 0 salary 0 season 0 season_end 0 season_start 0 team 4 dtype: int64
x = cleaned_data_df[['career_AST', 'career_FG%', 'career_PTS', 'career_FG3%','career_PER', 'career_TRB']] y = cleaned_data_df['salary']
In [183]:
x = cleaned_data_df[['career_AST','career_TRB','career_PTS','career_G']]
y = cleaned_data_df['salary']
In [185]:
x.head()
Out[185]:
| career_AST | career_TRB | career_PTS | career_G | |
|---|---|---|---|---|
| 0 | 0.3 | 3.3 | 5.7 | 256 |
| 1 | 0.3 | 3.3 | 5.7 | 256 |
| 2 | 0.3 | 3.3 | 5.7 | 256 |
| 3 | 0.3 | 3.3 | 5.7 | 256 |
| 4 | 0.3 | 3.3 | 5.7 | 256 |
In [187]:
y.head()
Out[187]:
0 395000 1 494000 2 500000 3 805000 4 650000 Name: salary, dtype: int64
In [189]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2, random_state = 42)
In [201]:
reg = LinearRegression().fit(x,y)
In [203]:
reg.coef_
Out[203]:
array([1.30123310e+05, 3.76853939e+05, 2.64915957e+05, 2.85582103e+02])
In [205]:
reg.score(x,y)
Out[205]:
0.2631573983962765
In [207]:
reg.coef_
Out[207]:
array([1.30123310e+05, 3.76853939e+05, 2.64915957e+05, 2.85582103e+02])
In [211]:
coefficients = pd.concat([pd.DataFrame(x.columns), pd.DataFrame(np.transpose(reg.coef_))], axis = 1)
In [213]:
coefficients
Out[213]:
| 0 | 0 | |
|---|---|---|
| 0 | career_AST | 130123.310205 |
| 1 | career_TRB | 376853.938933 |
| 2 | career_PTS | 264915.956753 |
| 3 | career_G | 285.582103 |