In [176]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import pandas as pd
import seaborn as sns
In [178]:
players_df = pd.read_csv('players.csv')
salaries_df = pd.read_csv('salaries_1985to2018.csv')
data_df = pd.merge(players_df, salaries_df, on= 'player_id')
In [180]:
data_df.head()
Out[180]:
| index_x | player_id | birthDate | birthPlace | career_AST | career_FG% | career_FG3% | career_FT% | career_G | career_PER | ... | position | shoots | weight | index_y | league | salary | season | season_end | season_start | team | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | abdelal01 | 24-Jun-68 | Cairo, Egypt | 0.3 | 50.2 | 0 | 70.1 | 256 | 13 | ... | Power Forward | Right | 240lb | 0 | NBA | 395000 | 1990-91 | 1991 | 1990 | Portland Trail Blazers |
| 1 | 0 | abdelal01 | 24-Jun-68 | Cairo, Egypt | 0.3 | 50.2 | 0 | 70.1 | 256 | 13 | ... | Power Forward | Right | 240lb | 1 | NBA | 494000 | 1991-92 | 1992 | 1991 | Portland Trail Blazers |
| 2 | 0 | abdelal01 | 24-Jun-68 | Cairo, Egypt | 0.3 | 50.2 | 0 | 70.1 | 256 | 13 | ... | Power Forward | Right | 240lb | 2 | NBA | 500000 | 1992-93 | 1993 | 1992 | Boston Celtics |
| 3 | 0 | abdelal01 | 24-Jun-68 | Cairo, Egypt | 0.3 | 50.2 | 0 | 70.1 | 256 | 13 | ... | Power Forward | Right | 240lb | 3 | NBA | 805000 | 1993-94 | 1994 | 1993 | Boston Celtics |
| 4 | 0 | abdelal01 | 24-Jun-68 | Cairo, Egypt | 0.3 | 50.2 | 0 | 70.1 | 256 | 13 | ... | Power Forward | Right | 240lb | 4 | NBA | 650000 | 1994-95 | 1995 | 1994 | Sacramento Kings |
5 rows × 32 columns
In [182]:
data_df.columns
Out[182]:
Index(['index_x', 'player_id', 'birthDate', 'birthPlace', 'career_AST',
'career_FG%', 'career_FG3%', 'career_FT%', 'career_G', 'career_PER',
'career_PTS', 'career_TRB', 'career_WS', 'career_eFG%', 'college',
'draft_pick', 'draft_round', 'draft_team', 'draft_year', 'height',
'highSchool', 'name', 'position', 'shoots', 'weight', 'index_y',
'league', 'salary', 'season', 'season_end', 'season_start', 'team'],
dtype='object')
In [184]:
data_df.isna().sum()
Out[184]:
index_x 0 player_id 0 birthDate 0 birthPlace 0 career_AST 0 career_FG% 0 career_FG3% 0 career_FT% 0 career_G 0 career_PER 0 career_PTS 0 career_TRB 0 career_WS 0 career_eFG% 0 college 1636 draft_pick 1902 draft_round 1902 draft_team 1902 draft_year 1902 height 0 highSchool 989 name 0 position 0 shoots 0 weight 0 index_y 0 league 0 salary 0 season 0 season_end 0 season_start 0 team 4 dtype: int64
In [186]:
cleaned_data_df = data_df.drop(['college', 'draft_pick', 'draft_round', 'draft_team', 'draft_year', 'highSchool'], axis = 1)
In [188]:
cleaned_data_df.isna().sum()
Out[188]:
index_x 0 player_id 0 birthDate 0 birthPlace 0 career_AST 0 career_FG% 0 career_FG3% 0 career_FT% 0 career_G 0 career_PER 0 career_PTS 0 career_TRB 0 career_WS 0 career_eFG% 0 height 0 name 0 position 0 shoots 0 weight 0 index_y 0 league 0 salary 0 season 0 season_end 0 season_start 0 team 4 dtype: int64
In [190]:
sns.scatterplot(x='career_PTS', y='salary', data=data_df)
Out[190]:
<Axes: xlabel='career_PTS', ylabel='salary'>
In [192]:
sns.scatterplot(x='career_G', y='salary', data=data_df)
Out[192]:
<Axes: xlabel='career_G', ylabel='salary'>
In [194]:
X = cleaned_data_df.drop(['player_id','season','height','birthDate','birthPlace','name','position','shoots','league','team'], axis= 1)
y = cleaned_data_df['salary']
In [196]:
X.dtypes
Out[196]:
index_x int64 career_AST float64 career_FG% object career_FG3% object career_FT% object career_G int64 career_PER object career_PTS float64 career_TRB object career_WS object career_eFG% object weight object index_y int64 salary int64 season_end int64 season_start int64 dtype: object
In [198]:
X.head()
Out[198]:
| index_x | career_AST | career_FG% | career_FG3% | career_FT% | career_G | career_PER | career_PTS | career_TRB | career_WS | career_eFG% | weight | index_y | salary | season_end | season_start | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0.3 | 50.2 | 0 | 70.1 | 256 | 13 | 5.7 | 3.3 | 4.8 | 50.2 | 240lb | 0 | 395000 | 1991 | 1990 |
| 1 | 0 | 0.3 | 50.2 | 0 | 70.1 | 256 | 13 | 5.7 | 3.3 | 4.8 | 50.2 | 240lb | 1 | 494000 | 1992 | 1991 |
| 2 | 0 | 0.3 | 50.2 | 0 | 70.1 | 256 | 13 | 5.7 | 3.3 | 4.8 | 50.2 | 240lb | 2 | 500000 | 1993 | 1992 |
| 3 | 0 | 0.3 | 50.2 | 0 | 70.1 | 256 | 13 | 5.7 | 3.3 | 4.8 | 50.2 | 240lb | 3 | 805000 | 1994 | 1993 |
| 4 | 0 | 0.3 | 50.2 | 0 | 70.1 | 256 | 13 | 5.7 | 3.3 | 4.8 | 50.2 | 240lb | 4 | 650000 | 1995 | 1994 |
In [200]:
X = X[X != '-']
In [202]:
X['weight'] = pd.to_numeric(X['weight'].str.replace('lb', '', regex=False))
In [204]:
from sklearn.preprocessing import StandardScaler
scaled = StandardScaler().fit_transform(X)
X_scaled = pd.DataFrame(scaled, columns = X.columns, index = X.index)
In [206]:
X_scaled.head()
Out[206]:
| index_x | career_AST | career_FG% | career_FG3% | career_FT% | career_G | career_PER | career_PTS | career_TRB | career_WS | career_eFG% | weight | index_y | salary | season_end | season_start | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -1.685683 | -0.998645 | 0.860319 | -2.031274 | -0.323247 | -0.962424 | -0.164893 | -0.663454 | -0.274361 | -0.790155 | 0.363881 | 0.788185 | -1.731929 | -0.661873 | -1.382465 | -1.382465 |
| 1 | -1.685683 | -0.998645 | 0.860319 | -2.031274 | -0.323247 | -0.962424 | -0.164893 | -0.663454 | -0.274361 | -0.790155 | 0.363881 | 0.788185 | -1.731684 | -0.638216 | -1.273347 | -1.273347 |
| 2 | -1.685683 | -0.998645 | 0.860319 | -2.031274 | -0.323247 | -0.962424 | -0.164893 | -0.663454 | -0.274361 | -0.790155 | 0.363881 | 0.788185 | -1.731439 | -0.636783 | -1.164229 | -1.164229 |
| 3 | -1.685683 | -0.998645 | 0.860319 | -2.031274 | -0.323247 | -0.962424 | -0.164893 | -0.663454 | -0.274361 | -0.790155 | 0.363881 | 0.788185 | -1.731195 | -0.563901 | -1.055111 | -1.055111 |
| 4 | -1.685683 | -0.998645 | 0.860319 | -2.031274 | -0.323247 | -0.962424 | -0.164893 | -0.663454 | -0.274361 | -0.790155 | 0.363881 | 0.788185 | -1.730950 | -0.600939 | -0.945992 | -0.945992 |
In [238]:
X_scaled = X_scaled.dropna()
y = y.loc[X_scaled.index]
In [240]:
X_scaled.isna().sum()
Out[240]:
index_x 0 career_AST 0 career_FG% 0 career_FG3% 0 career_FT% 0 career_G 0 career_PER 0 career_PTS 0 career_TRB 0 career_WS 0 career_eFG% 0 weight 0 index_y 0 salary 0 season_end 0 season_start 0 dtype: int64
In [242]:
y.isna().sum()
Out[242]:
0
In [244]:
y.head()
Out[244]:
0 395000 1 494000 2 500000 3 805000 4 650000 Name: salary, dtype: int64
In [246]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled,y, test_size = 0.3, random_state = 7)
In [248]:
lr = linear_model.LinearRegression()
lr_model = lr.fit(X_train,y_train)
lr_model.score(X_test,y_test)
Out[248]:
1.0
In [250]:
y_pred = lr_model.predict(X_test)
In [272]:
from sklearn import metrics
rmse = metrics.mean_squared_error(y_test, y_pred, squared = False)
rmse
/opt/anaconda3/lib/python3.12/site-packages/sklearn/metrics/_regression.py:483: FutureWarning: 'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'. warnings.warn(
Out[272]:
4.887643689475138e-09
In [276]:
reg = LinearRegression().fit(X_scaled,y)
In [278]:
coefficients = pd.concat([pd.DataFrame(X_scaled.columns), pd.DataFrame(np.transpose(reg.coef_))], axis = 1)
In [282]:
coefficients = pd.concat([pd.DataFrame(X.columns), pd.DataFrame(np.transpose(reg.coef_))], axis = 1)
In [284]:
coefficients
Out[284]:
| 0 | 0 | |
|---|---|---|
| 0 | index_x | -4.947332e-09 |
| 1 | career_AST | 1.527951e-09 |
| 2 | career_FG% | 6.391474e-10 |
| 3 | career_FG3% | 5.902621e-10 |
| 4 | career_FT% | 5.147740e-10 |
| 5 | career_G | -1.023636e-09 |
| 6 | career_PER | 1.831609e-09 |
| 7 | career_PTS | -2.315119e-09 |
| 8 | career_TRB | -1.185072e-09 |
| 9 | career_WS | 1.040917e-09 |
| 10 | career_eFG% | 9.442829e-10 |
| 11 | weight | 1.982698e-10 |
| 12 | index_y | 4.481990e-09 |
| 13 | salary | 4.184898e+06 |
| 14 | season_end | 5.011316e-10 |
| 15 | season_start | -7.748895e-10 |
In [ ]: