In [109]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import pandas as pd
import seaborn as sns
In [111]:
players_df = pd.read_csv('players.csv')
salaries_df = pd.read_csv('salaries_1985to2018.csv')
data_df = pd.merge(players_df, salaries_df, on= 'player_id')
In [113]:
data_df.head()
Out[113]:
index_x player_id birthDate birthPlace career_AST career_FG% career_FG3% career_FT% career_G career_PER ... position shoots weight index_y league salary season season_end season_start team
0 0 abdelal01 24-Jun-68 Cairo, Egypt 0.3 50.2 0 70.1 256 13 ... Power Forward Right 240lb 0 NBA 395000 1990-91 1991 1990 Portland Trail Blazers
1 0 abdelal01 24-Jun-68 Cairo, Egypt 0.3 50.2 0 70.1 256 13 ... Power Forward Right 240lb 1 NBA 494000 1991-92 1992 1991 Portland Trail Blazers
2 0 abdelal01 24-Jun-68 Cairo, Egypt 0.3 50.2 0 70.1 256 13 ... Power Forward Right 240lb 2 NBA 500000 1992-93 1993 1992 Boston Celtics
3 0 abdelal01 24-Jun-68 Cairo, Egypt 0.3 50.2 0 70.1 256 13 ... Power Forward Right 240lb 3 NBA 805000 1993-94 1994 1993 Boston Celtics
4 0 abdelal01 24-Jun-68 Cairo, Egypt 0.3 50.2 0 70.1 256 13 ... Power Forward Right 240lb 4 NBA 650000 1994-95 1995 1994 Sacramento Kings

5 rows × 32 columns

In [114]:
data_df.columns
Out[114]:
Index(['index_x', 'player_id', 'birthDate', 'birthPlace', 'career_AST',
       'career_FG%', 'career_FG3%', 'career_FT%', 'career_G', 'career_PER',
       'career_PTS', 'career_TRB', 'career_WS', 'career_eFG%', 'college',
       'draft_pick', 'draft_round', 'draft_team', 'draft_year', 'height',
       'highSchool', 'name', 'position', 'shoots', 'weight', 'index_y',
       'league', 'salary', 'season', 'season_end', 'season_start', 'team'],
      dtype='object')
In [117]:
data_df.isna().sum()
Out[117]:
index_x            0
player_id          0
birthDate          0
birthPlace         0
career_AST         0
career_FG%         0
career_FG3%        0
career_FT%         0
career_G           0
career_PER         0
career_PTS         0
career_TRB         0
career_WS          0
career_eFG%        0
college         1636
draft_pick      1902
draft_round     1902
draft_team      1902
draft_year      1902
height             0
highSchool       989
name               0
position           0
shoots             0
weight             0
index_y            0
league             0
salary             0
season             0
season_end         0
season_start       0
team               4
dtype: int64
In [119]:
cleaned_data_df = data_df.drop(['college', 'draft_pick', 'draft_round', 'draft_team', 'draft_year', 'highSchool'], axis = 1)
In [121]:
cleaned_data_df.isna().sum()
Out[121]:
index_x         0
player_id       0
birthDate       0
birthPlace      0
career_AST      0
career_FG%      0
career_FG3%     0
career_FT%      0
career_G        0
career_PER      0
career_PTS      0
career_TRB      0
career_WS       0
career_eFG%     0
height          0
name            0
position        0
shoots          0
weight          0
index_y         0
league          0
salary          0
season          0
season_end      0
season_start    0
team            4
dtype: int64

x = cleaned_data_df[['career_AST', 'career_FG%', 'career_PTS', 'career_FG3%','career_PER', 'career_TRB']] y = cleaned_data_df['salary']

In [183]:
x = cleaned_data_df[['career_AST','career_TRB','career_PTS','career_G']]
y = cleaned_data_df['salary']
In [185]:
x.head()
Out[185]:
career_AST career_TRB career_PTS career_G
0 0.3 3.3 5.7 256
1 0.3 3.3 5.7 256
2 0.3 3.3 5.7 256
3 0.3 3.3 5.7 256
4 0.3 3.3 5.7 256
In [187]:
y.head()
Out[187]:
0    395000
1    494000
2    500000
3    805000
4    650000
Name: salary, dtype: int64
In [189]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2, random_state = 42)
In [201]:
reg = LinearRegression().fit(x,y)
In [203]:
reg.coef_
Out[203]:
array([1.30123310e+05, 3.76853939e+05, 2.64915957e+05, 2.85582103e+02])
In [205]:
reg.score(x,y)
Out[205]:
0.2631573983962765
In [207]:
reg.coef_
Out[207]:
array([1.30123310e+05, 3.76853939e+05, 2.64915957e+05, 2.85582103e+02])
In [211]:
coefficients = pd.concat([pd.DataFrame(x.columns), pd.DataFrame(np.transpose(reg.coef_))], axis = 1)
In [213]:
coefficients
Out[213]:
0 0
0 career_AST 130123.310205
1 career_TRB 376853.938933
2 career_PTS 264915.956753
3 career_G 285.582103