In [1]:
!pip install scikit-learn
Requirement already satisfied: scikit-learn in /opt/anaconda3/lib/python3.12/site-packages (1.4.2)
Requirement already satisfied: numpy>=1.19.5 in /opt/anaconda3/lib/python3.12/site-packages (from scikit-learn) (1.26.4)
Requirement already satisfied: scipy>=1.6.0 in /opt/anaconda3/lib/python3.12/site-packages (from scikit-learn) (1.13.1)
Requirement already satisfied: joblib>=1.2.0 in /opt/anaconda3/lib/python3.12/site-packages (from scikit-learn) (1.4.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/anaconda3/lib/python3.12/site-packages (from scikit-learn) (2.2.0)
In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
In [5]:
water_df = pd.read_csv('water_potability.csv')
In [7]:
water_df.head()
Out[7]:
|   | ph | Hardness | Solids | Chloramines | Sulfate | Conductivity | Organic_carbon | Trihalomethanes | Turbidity | Potability |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NaN | 204.890455 | 20791.318981 | 7.300212 | 368.516441 | 564.308654 | 10.379783 | 86.990970 | 2.963135 | 0 |
| 1 | 3.716080 | 129.422921 | 18630.057858 | 6.635246 | NaN | 592.885359 | 15.180013 | 56.329076 | 4.500656 | 0 |
| 2 | 8.099124 | 224.236259 | 19909.541732 | 9.275884 | NaN | 418.606213 | 16.868637 | 66.420093 | 3.055934 | 0 |
| 3 | 8.316766 | 214.373394 | 22018.417441 | 8.059332 | 356.886136 | 363.266516 | 18.436524 | 100.341674 | 4.628771 | 0 |
| 4 | 9.092223 | 181.101509 | 17978.986339 | 6.546600 | 310.135738 | 398.410813 | 11.558279 | 31.997993 | 4.075075 | 0 |
In [9]:
water_df.columns
Out[9]:
Index(['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'Potability'],
dtype='object')
In [11]:
water_df.isna().sum()
Out[11]:
ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64
In [13]:
w_cleaned_df = water_df.copy()  # copy so the original frame is left untouched
In [15]:
# Impute each column's missing values with that column's mean.
w_cleaned_df['ph'] = water_df['ph'].fillna(water_df['ph'].mean())
w_cleaned_df['Sulfate'] = water_df['Sulfate'].fillna(water_df['Sulfate'].mean())
w_cleaned_df['Trihalomethanes'] = water_df['Trihalomethanes'].fillna(water_df['Trihalomethanes'].mean())
In [17]:
w_cleaned_df.isna().sum()
Out[17]:
ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64
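Note that the means used above are computed over the full dataset, including rows that later land in the test set; a stricter workflow splits first and imputes afterwards. A minimal sketch of that alternative using scikit-learn's `SimpleImputer` (the names `x_raw`, `x_tr`, etc. are illustrative, not from this notebook):

```python
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Split first, then learn the imputation means on the training fold only,
# so no test-set statistics leak into the model.
x_raw = water_df.drop(['Potability'], axis=1)
y_raw = water_df['Potability']
x_tr, x_te, y_tr, y_te = train_test_split(x_raw, y_raw, test_size=0.3,
                                          random_state=42)

imputer = SimpleImputer(strategy='mean')
x_tr_imputed = imputer.fit_transform(x_tr)  # fit on train only
x_te_imputed = imputer.transform(x_te)      # reuse the train means on test
```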
In [19]:
x = w_cleaned_df.drop(['Potability'], axis=1)  # features
y = w_cleaned_df['Potability']                 # target
In [21]:
x.shape, y.shape
Out[21]:
((3276, 9), (3276,))
In [23]:
from sklearn.model_selection import train_test_split
In [25]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
In [27]:
x_train.shape
Out[27]:
(2293, 9)
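Since the target turns out to be imbalanced (see the value counts below), a stratified split keeps the 0/1 proportions identical across the train and test folds. A quick sketch of that variant of the same split, for reference:

```python
# stratify=y preserves the class proportions in both folds.
x_train_s, x_test_s, y_train_s, y_test_s = train_test_split(
    x, y, test_size=0.3, random_state=42, stratify=y)
```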
In [29]:
from sklearn import tree
In [31]:
clf = tree.DecisionTreeClassifier(max_depth=5)
clf = clf.fit(x_train, y_train)
predicted_tree = clf.predict(x_test)
In [33]:
clf.score(x_test, y_test)
Out[33]:
0.6510681586978637
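A single hold-out score at one depth can be noisy. A small sketch of sweeping `max_depth` with 5-fold cross-validation on the training fold (the depth grid here is arbitrary, and `random_state=42` is added for reproducibility):

```python
from sklearn.model_selection import cross_val_score

# Mean/std of 5-fold CV accuracy for a few candidate tree depths.
for depth in [3, 5, 7, 10, None]:
    scores = cross_val_score(
        tree.DecisionTreeClassifier(max_depth=depth, random_state=42),
        x_train, y_train, cv=5)
    print(f'max_depth={depth}: {scores.mean():.3f} +/- {scores.std():.3f}')
```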
In [35]:
y.value_counts()
Out[35]:
Potability
0    1998
1    1278
Name: count, dtype: int64
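With 1998 of 3276 samples in class 0, a model that always predicts "not potable" already scores about 61% overall (roughly 63% on this particular test fold, where 617 of 983 samples are class 0, as the confusion matrices later confirm). The tree's 65% should be judged against that bar. scikit-learn's `DummyClassifier` makes the baseline explicit:

```python
from sklearn.dummy import DummyClassifier

# Majority-class baseline: always predicts the most frequent training label.
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(x_train, y_train)
print('Baseline accuracy:', dummy.score(x_test, y_test))
```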
In [37]:
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(48, 12), dpi=300)
tree.plot_tree(clf, feature_names = x.columns, filled=True)
plt.show()
In [39]:
from sklearn.ensemble import RandomForestClassifier
In [41]:
rf = RandomForestClassifier(n_estimators=100, max_depth=10, class_weight='balanced', random_state=42)
In [43]:
rf.fit(x_train, y_train)
predicted_rf = rf.predict(x_test)
In [44]:
score = rf.score(x_test, y_test)
print(score)
0.6795523906408952
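Since the forest is the strongest model so far, its impurity-based feature importances give a rough sense of which water-quality measurements drive the predictions. A sketch using the fitted `rf` (this reuses the `pd` and `plt` imports from the top of the notebook):

```python
# Impurity-based importances from the fitted forest, sorted so the
# largest bar appears at the top of the horizontal bar chart.
importances = pd.Series(rf.feature_importances_, index=x.columns)
importances.sort_values().plot(kind='barh', figsize=(8, 5),
                               title='Random Forest Feature Importances')
plt.show()
```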
In [47]:
from sklearn.metrics import accuracy_score, classification_report, f1_score
In [49]:
from sklearn.preprocessing import StandardScaler
In [51]:
scaler = StandardScaler()
In [53]:
x_train_scaled = scaler.fit_transform(x_train)
In [55]:
x_test_scaled = scaler.transform(x_test)
Testing the Naive Bayes and KNN Models
In [58]:
from sklearn.naive_bayes import GaussianNB
In [60]:
nb_model = GaussianNB()
nb_model.fit(x_train, y_train)
Out[60]:
GaussianNB()
In [62]:
predicted_nb = nb_model.predict(x_test)
In [64]:
from sklearn.neighbors import KNeighborsClassifier
In [66]:
knn_model = KNeighborsClassifier(n_neighbors=5)
In [68]:
knn_model.fit(x_train, y_train)
Out[68]:
KNeighborsClassifier()
In [70]:
predicted_knn = knn_model.predict(x_test)
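KNN is distance-based, so on unscaled features its distances are dominated by large-magnitude columns like Solids; that is the motivation for the `StandardScaler` arrays prepared earlier, which the fit above does not use. A sketch refitting KNN on the scaled data (`knn_scaled` is an illustrative name; the score will differ from the unscaled run reported below):

```python
# Refit KNN on the standardized features from the StandardScaler above.
knn_scaled = KNeighborsClassifier(n_neighbors=5)
knn_scaled.fit(x_train_scaled, y_train)
print('KNN (scaled):', knn_scaled.score(x_test_scaled, y_test))
```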
Testing All Models Against Each Other Using: Accuracy / F1 Score / Confusion Matrix
Calculate Accuracy
In [74]:
print('Decision Tree:', clf.score(x_test, y_test))
print('Random Forest:', score)
print('Naive Bayes:', accuracy_score(y_test, predicted_nb))
print('KNN:', accuracy_score(y_test, predicted_knn))
Decision Tree: 0.6510681586978637
Random Forest: 0.6795523906408952
Naive Bayes: 0.6358087487283826
KNN: 0.5483214649033571
Calculate F1 Score
In [77]:
print('Decision Tree:', f1_score(y_test, predicted_tree))
print('Random Forest:', f1_score(y_test, predicted_rf))
print('Naive Bayes:', f1_score(y_test, predicted_nb))
print('KNN:', f1_score(y_test, predicted_knn))
Decision Tree: 0.2809224318658281
Random Forest: 0.4844517184942717
Naive Bayes: 0.3035019455252918
KNN: 0.3373134328358209
Confusion Matrix
In [80]:
from sklearn.metrics import confusion_matrix
In [82]:
print('Decision Tree:', confusion_matrix(y_test, predicted_tree))
print('Random Forest:', confusion_matrix(y_test, predicted_rf))
print('Naive Bayes:', confusion_matrix(y_test, predicted_nb))
print('KNN:', confusion_matrix(y_test, predicted_knn))
Decision Tree: [[573  44]
 [299  67]]
Random Forest: [[520  97]
 [218 148]]
Naive Bayes: [[547  70]
 [288  78]]
KNN: [[426 191]
 [253 113]]
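`classification_report` (imported earlier but unused) condenses per-class precision, recall, and F1 into one table, which makes the low F1 scores above easier to diagnose: in every confusion matrix, recall on the potable class is the weak spot. For example, applied to the random forest (the `target_names` labels are illustrative):

```python
# Per-class precision/recall/F1 for the random forest predictions.
print(classification_report(y_test, predicted_rf,
                            target_names=['Not Potable', 'Potable']))
```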
Accuracy
In [87]:
# Rounded from the accuracy scores printed above.
accuracy_scores = {'Naive Bayes': 0.64, 'KNN': 0.55, 'Decision Tree': 0.65,
                   'Random Forest': 0.68}
plt.figure(figsize=(8, 5))
plt.bar(accuracy_scores.keys(), accuracy_scores.values(), color=['blue', 'green', 'orange', 'purple'])
plt.title('Accuracy Comparison Between Models')
plt.xlabel('Models')
plt.ylabel('Accuracy Score')
plt.ylim(0, 1)
plt.show()
F1 Score
In [90]:
# Rounded from the F1 scores printed above.
f1_scores = {'Naive Bayes': 0.30, 'KNN': 0.34, 'Decision Tree': 0.28,
             'Random Forest': 0.48}
plt.figure(figsize=(8, 5))
plt.bar(f1_scores.keys(), f1_scores.values(), color=['blue', 'green', 'orange', 'purple'])
plt.title('F1-Score Comparison Between Models')
plt.xlabel('Models')
plt.ylabel('F1 Score')
plt.ylim(0, 1)
plt.show()
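Hard-coding the rounded values into the chart dictionaries invites drift between the charts and the printed metrics; computing them from the stored predictions keeps everything in sync. A sketch that could replace both hard-coded dictionaries:

```python
# Build both chart dictionaries directly from the saved predictions.
predictions = {'Naive Bayes': predicted_nb, 'KNN': predicted_knn,
               'Decision Tree': predicted_tree, 'Random Forest': predicted_rf}
accuracy_scores = {name: accuracy_score(y_test, p) for name, p in predictions.items()}
f1_scores = {name: f1_score(y_test, p) for name, p in predictions.items()}
```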
Confusion Matrices
In [93]:
fig, axs = plt.subplots(2, 2, figsize=(12, 10))
sns.heatmap(confusion_matrix(y_test, predicted_nb), annot=True, fmt='d', cmap='Blues', ax=axs[0, 0])
axs[0, 0].set_title("Naive Bayes Confusion Matrix")
sns.heatmap(confusion_matrix(y_test, predicted_knn), annot=True, fmt='d', cmap='Greens', ax=axs[0, 1])
axs[0, 1].set_title("KNN Confusion Matrix")
sns.heatmap(confusion_matrix(y_test, predicted_tree), annot=True, fmt='d', cmap='Oranges', ax=axs[1, 0])
axs[1, 0].set_title("Decision Tree Confusion Matrix")
sns.heatmap(confusion_matrix(y_test, predicted_rf), annot=True, fmt='d', cmap='Purples', ax=axs[1, 1])
axs[1, 1].set_title("Random Forest Confusion Matrix")
plt.show()