In [176]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import pandas as pd
import seaborn as sns
In [178]:
players_df = pd.read_csv('players.csv')
salaries_df = pd.read_csv('salaries_1985to2018.csv')
data_df = pd.merge(players_df, salaries_df, on= 'player_id')
In [180]:
data_df.head()
Out[180]:
index_x player_id birthDate birthPlace career_AST career_FG% career_FG3% career_FT% career_G career_PER ... position shoots weight index_y league salary season season_end season_start team
0 0 abdelal01 24-Jun-68 Cairo, Egypt 0.3 50.2 0 70.1 256 13 ... Power Forward Right 240lb 0 NBA 395000 1990-91 1991 1990 Portland Trail Blazers
1 0 abdelal01 24-Jun-68 Cairo, Egypt 0.3 50.2 0 70.1 256 13 ... Power Forward Right 240lb 1 NBA 494000 1991-92 1992 1991 Portland Trail Blazers
2 0 abdelal01 24-Jun-68 Cairo, Egypt 0.3 50.2 0 70.1 256 13 ... Power Forward Right 240lb 2 NBA 500000 1992-93 1993 1992 Boston Celtics
3 0 abdelal01 24-Jun-68 Cairo, Egypt 0.3 50.2 0 70.1 256 13 ... Power Forward Right 240lb 3 NBA 805000 1993-94 1994 1993 Boston Celtics
4 0 abdelal01 24-Jun-68 Cairo, Egypt 0.3 50.2 0 70.1 256 13 ... Power Forward Right 240lb 4 NBA 650000 1994-95 1995 1994 Sacramento Kings

5 rows × 32 columns

In [182]:
data_df.columns
Out[182]:
Index(['index_x', 'player_id', 'birthDate', 'birthPlace', 'career_AST',
       'career_FG%', 'career_FG3%', 'career_FT%', 'career_G', 'career_PER',
       'career_PTS', 'career_TRB', 'career_WS', 'career_eFG%', 'college',
       'draft_pick', 'draft_round', 'draft_team', 'draft_year', 'height',
       'highSchool', 'name', 'position', 'shoots', 'weight', 'index_y',
       'league', 'salary', 'season', 'season_end', 'season_start', 'team'],
      dtype='object')
In [184]:
data_df.isna().sum()
Out[184]:
index_x            0
player_id          0
birthDate          0
birthPlace         0
career_AST         0
career_FG%         0
career_FG3%        0
career_FT%         0
career_G           0
career_PER         0
career_PTS         0
career_TRB         0
career_WS          0
career_eFG%        0
college         1636
draft_pick      1902
draft_round     1902
draft_team      1902
draft_year      1902
height             0
highSchool       989
name               0
position           0
shoots             0
weight             0
index_y            0
league             0
salary             0
season             0
season_end         0
season_start       0
team               4
dtype: int64
In [186]:
cleaned_data_df = data_df.drop(['college', 'draft_pick', 'draft_round', 'draft_team', 'draft_year', 'highSchool'], axis = 1)
In [188]:
cleaned_data_df.isna().sum()
Out[188]:
index_x         0
player_id       0
birthDate       0
birthPlace      0
career_AST      0
career_FG%      0
career_FG3%     0
career_FT%      0
career_G        0
career_PER      0
career_PTS      0
career_TRB      0
career_WS       0
career_eFG%     0
height          0
name            0
position        0
shoots          0
weight          0
index_y         0
league          0
salary          0
season          0
season_end      0
season_start    0
team            4
dtype: int64
In [190]:
sns.scatterplot(x='career_PTS', y='salary', data=data_df)
Out[190]:
<Axes: xlabel='career_PTS', ylabel='salary'>
No description has been provided for this image
In [192]:
sns.scatterplot(x='career_G', y='salary', data=data_df)
Out[192]:
<Axes: xlabel='career_G', ylabel='salary'>
No description has been provided for this image
In [194]:
X = cleaned_data_df.drop(['player_id','season','height','birthDate','birthPlace','name','position','shoots','league','team'], axis= 1)
y = cleaned_data_df['salary']
In [196]:
X.dtypes
Out[196]:
index_x           int64
career_AST      float64
career_FG%       object
career_FG3%      object
career_FT%       object
career_G          int64
career_PER       object
career_PTS      float64
career_TRB       object
career_WS        object
career_eFG%      object
weight           object
index_y           int64
salary            int64
season_end        int64
season_start      int64
dtype: object
In [198]:
X.head()
Out[198]:
index_x career_AST career_FG% career_FG3% career_FT% career_G career_PER career_PTS career_TRB career_WS career_eFG% weight index_y salary season_end season_start
0 0 0.3 50.2 0 70.1 256 13 5.7 3.3 4.8 50.2 240lb 0 395000 1991 1990
1 0 0.3 50.2 0 70.1 256 13 5.7 3.3 4.8 50.2 240lb 1 494000 1992 1991
2 0 0.3 50.2 0 70.1 256 13 5.7 3.3 4.8 50.2 240lb 2 500000 1993 1992
3 0 0.3 50.2 0 70.1 256 13 5.7 3.3 4.8 50.2 240lb 3 805000 1994 1993
4 0 0.3 50.2 0 70.1 256 13 5.7 3.3 4.8 50.2 240lb 4 650000 1995 1994
In [200]:
X = X[X != '-']
In [202]:
X['weight'] = pd.to_numeric(X['weight'].str.replace('lb', '', regex=False))
In [204]:
from sklearn.preprocessing import StandardScaler

scaled = StandardScaler().fit_transform(X)

X_scaled = pd.DataFrame(scaled, columns = X.columns, index = X.index)
In [206]:
X_scaled.head()
Out[206]:
index_x career_AST career_FG% career_FG3% career_FT% career_G career_PER career_PTS career_TRB career_WS career_eFG% weight index_y salary season_end season_start
0 -1.685683 -0.998645 0.860319 -2.031274 -0.323247 -0.962424 -0.164893 -0.663454 -0.274361 -0.790155 0.363881 0.788185 -1.731929 -0.661873 -1.382465 -1.382465
1 -1.685683 -0.998645 0.860319 -2.031274 -0.323247 -0.962424 -0.164893 -0.663454 -0.274361 -0.790155 0.363881 0.788185 -1.731684 -0.638216 -1.273347 -1.273347
2 -1.685683 -0.998645 0.860319 -2.031274 -0.323247 -0.962424 -0.164893 -0.663454 -0.274361 -0.790155 0.363881 0.788185 -1.731439 -0.636783 -1.164229 -1.164229
3 -1.685683 -0.998645 0.860319 -2.031274 -0.323247 -0.962424 -0.164893 -0.663454 -0.274361 -0.790155 0.363881 0.788185 -1.731195 -0.563901 -1.055111 -1.055111
4 -1.685683 -0.998645 0.860319 -2.031274 -0.323247 -0.962424 -0.164893 -0.663454 -0.274361 -0.790155 0.363881 0.788185 -1.730950 -0.600939 -0.945992 -0.945992
In [238]:
X_scaled = X_scaled.dropna()
y = y.loc[X_scaled.index]
In [240]:
X_scaled.isna().sum()
Out[240]:
index_x         0
career_AST      0
career_FG%      0
career_FG3%     0
career_FT%      0
career_G        0
career_PER      0
career_PTS      0
career_TRB      0
career_WS       0
career_eFG%     0
weight          0
index_y         0
salary          0
season_end      0
season_start    0
dtype: int64
In [242]:
y.isna().sum()
Out[242]:
0
In [244]:
y.head()
Out[244]:
0    395000
1    494000
2    500000
3    805000
4    650000
Name: salary, dtype: int64
In [246]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled,y, test_size = 0.3, random_state = 7)
In [248]:
lr = linear_model.LinearRegression()

lr_model = lr.fit(X_train,y_train)

lr_model.score(X_test,y_test)
Out[248]:
1.0
In [250]:
y_pred = lr_model.predict(X_test)
In [272]:
from sklearn import metrics
rmse = metrics.mean_squared_error(y_test, y_pred, squared = False)
rmse
/opt/anaconda3/lib/python3.12/site-packages/sklearn/metrics/_regression.py:483: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.
  warnings.warn(
Out[272]:
4.887643689475138e-09
In [276]:
reg = LinearRegression().fit(X_scaled,y)
In [278]:
coefficients = pd.concat([pd.DataFrame(X_scaled.columns), pd.DataFrame(np.transpose(reg.coef_))], axis = 1)
In [282]:
coefficients = pd.concat([pd.DataFrame(X.columns), pd.DataFrame(np.transpose(reg.coef_))], axis = 1)
In [284]:
coefficients
Out[284]:
0 0
0 index_x -4.947332e-09
1 career_AST 1.527951e-09
2 career_FG% 6.391474e-10
3 career_FG3% 5.902621e-10
4 career_FT% 5.147740e-10
5 career_G -1.023636e-09
6 career_PER 1.831609e-09
7 career_PTS -2.315119e-09
8 career_TRB -1.185072e-09
9 career_WS 1.040917e-09
10 career_eFG% 9.442829e-10
11 weight 1.982698e-10
12 index_y 4.481990e-09
13 salary 4.184898e+06
14 season_end 5.011316e-10
15 season_start -7.748895e-10
In [ ]: