In [1]:
!pip install scikit-learn
Requirement already satisfied: scikit-learn in /opt/anaconda3/lib/python3.12/site-packages (1.4.2)
Requirement already satisfied: numpy>=1.19.5 in /opt/anaconda3/lib/python3.12/site-packages (from scikit-learn) (1.26.4)
Requirement already satisfied: scipy>=1.6.0 in /opt/anaconda3/lib/python3.12/site-packages (from scikit-learn) (1.13.1)
Requirement already satisfied: joblib>=1.2.0 in /opt/anaconda3/lib/python3.12/site-packages (from scikit-learn) (1.4.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/anaconda3/lib/python3.12/site-packages (from scikit-learn) (2.2.0)
In [2]:
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
In [5]:
water_df = pd.read_csv('water_potability.csv')
In [7]:
water_df.head()
Out[7]:
ph Hardness Solids Chloramines Sulfate Conductivity Organic_carbon Trihalomethanes Turbidity Potability
0 NaN 204.890455 20791.318981 7.300212 368.516441 564.308654 10.379783 86.990970 2.963135 0
1 3.716080 129.422921 18630.057858 6.635246 NaN 592.885359 15.180013 56.329076 4.500656 0
2 8.099124 224.236259 19909.541732 9.275884 NaN 418.606213 16.868637 66.420093 3.055934 0
3 8.316766 214.373394 22018.417441 8.059332 356.886136 363.266516 18.436524 100.341674 4.628771 0
4 9.092223 181.101509 17978.986339 6.546600 310.135738 398.410813 11.558279 31.997993 4.075075 0
In [9]:
water_df.columns
Out[9]:
Index(['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
       'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'Potability'],
      dtype='object')
In [11]:
water_df.isna().sum()
Out[11]:
ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64
In [13]:
w_cleaned_df = water_df.copy()  # copy so the original DataFrame stays unmodified
In [15]:
# Mean-impute the three columns with missing values
w_cleaned_df['ph'] = w_cleaned_df['ph'].fillna(w_cleaned_df['ph'].mean())
w_cleaned_df['Sulfate'] = w_cleaned_df['Sulfate'].fillna(w_cleaned_df['Sulfate'].mean())
w_cleaned_df['Trihalomethanes'] = w_cleaned_df['Trihalomethanes'].fillna(w_cleaned_df['Trihalomethanes'].mean())
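
The means above are computed over the full dataset before the train/test split, which leaks test-set statistics into the imputation. A minimal leakage-free sketch, assuming scikit-learn's SimpleImputer and that the split happens first (the x_train_raw / x_test_raw names are illustrative):

In [ ]:
from sklearn.impute import SimpleImputer

# Fit imputation statistics on training rows only, then apply to both splits
# (x_train_raw / x_test_raw are hypothetical pre-imputation splits)
imputer = SimpleImputer(strategy='mean')
x_train_imputed = imputer.fit_transform(x_train_raw)
x_test_imputed = imputer.transform(x_test_raw)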
In [17]:
w_cleaned_df.isna().sum()
Out[17]:
ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64
In [19]:
x = w_cleaned_df.drop(['Potability'], axis=1)
y = w_cleaned_df['Potability']
In [21]:
x.shape, y.shape
Out[21]:
((3276, 9), (3276,))
In [23]:
from sklearn.model_selection import train_test_split
In [25]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
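
Because the classes turn out to be imbalanced (see the value_counts output below), a stratified split keeps the Potability ratio consistent across train and test. A minimal variant of the same call (the _s names are illustrative):

In [ ]:
# Hypothetical stratified variant of the split above
x_train_s, x_test_s, y_train_s, y_test_s = train_test_split(
    x, y, test_size=0.3, random_state=42, stratify=y)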
In [27]:
x_train.shape
Out[27]:
(2293, 9)
In [29]:
from sklearn import tree
In [31]:
clf = tree.DecisionTreeClassifier(max_depth=5)
clf = clf.fit(x_train, y_train)
predicted_tree = clf.predict(x_test)
In [33]:
clf.score(x_test, y_test)
Out[33]:
0.6510681586978637
In [35]:
y.value_counts()
Out[35]:
Potability
0    1998
1    1278
Name: count, dtype: int64
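
With 1998 of 3276 samples in class 0, a model that always predicts "not potable" already scores about 0.61, so the tree's 0.65 accuracy is only a modest improvement over that baseline. A quick check:

In [ ]:
# Majority-class baseline accuracy: share of the most common class
print(y.value_counts(normalize=True).max())  # ~0.61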
In [37]:
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(48, 12), dpi=300)
tree.plot_tree(clf, feature_names = x.columns, filled=True)
plt.show() 
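
A depth-5 tree rendered as a figure can be hard to read; scikit-learn's export_text prints the same split rules as plain text. A minimal sketch using the already-fitted classifier:

In [ ]:
from sklearn.tree import export_text

# Text dump of the fitted tree's split rules
print(export_text(clf, feature_names=list(x.columns)))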
In [39]:
from sklearn.ensemble import RandomForestClassifier
In [41]:
rf = RandomForestClassifier(n_estimators=100, max_depth=10, class_weight='balanced', random_state=42)
In [43]:
rf.fit(x_train, y_train)
predicted_rf = rf.predict(x_test)
In [44]:
score = rf.score(x_test, y_test)
print(score)
0.6795523906408952
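
The fitted forest also exposes impurity-based feature importances, which hint at which water-quality measurements drive its predictions. A minimal sketch using attributes the fitted model already has:

In [ ]:
# Impurity-based importances from the fitted forest
importances = pd.Series(rf.feature_importances_, index=x.columns)
print(importances.sort_values(ascending=False))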
In [47]:
from sklearn.metrics import accuracy_score, classification_report, f1_score
In [49]:
from sklearn.preprocessing import StandardScaler
In [51]:
scaler = StandardScaler()
In [53]:
x_train_scaled = scaler.fit_transform(x_train)
In [55]:
x_test_scaled = scaler.transform(x_test)
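
Note that these scaled arrays are never passed to any of the models in this notebook, which are all fit on the raw features. KNN is distance-based and therefore the model most likely to benefit from scaling; a minimal sketch of refitting it on the standardized data (the knn_scaled name is illustrative):

In [ ]:
from sklearn.neighbors import KNeighborsClassifier

# Hypothetical KNN refit on standardized features; the unscaled fit
# below is what the reported results use
knn_scaled = KNeighborsClassifier(n_neighbors=5)
knn_scaled.fit(x_train_scaled, y_train)
print(knn_scaled.score(x_test_scaled, y_test))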

Testing with Naive Bayes and KNN Models

In [58]:
from sklearn.naive_bayes import GaussianNB
In [60]:
nb_model = GaussianNB()
nb_model.fit(x_train, y_train)
Out[60]:
GaussianNB()
In [62]:
predicted_nb = nb_model.predict(x_test)
In [64]:
from sklearn.neighbors import KNeighborsClassifier
In [66]:
knn_model = KNeighborsClassifier(n_neighbors=5)
In [68]:
knn_model.fit(x_train, y_train)
Out[68]:
KNeighborsClassifier()
In [70]:
predicted_knn = knn_model.predict(x_test)

Testing All Models Against Each Other Using Accuracy, F1 Score, and Confusion Matrices

Calculate Accuracy

In [74]:
print('Decision Tree:', clf.score(x_test, y_test))
print('Random Forest:', score)
print('Naive Bayes:', accuracy_score(y_test, predicted_nb))
print('KNN:', accuracy_score(y_test, predicted_knn))
Decision Tree: 0.6510681586978637
Random Forest: 0.6795523906408952
Naive Bayes: 0.6358087487283826
KNN: 0.5483214649033571
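
KNN's low accuracy here is consistent with fitting on unscaled features: Solids is on the order of 20,000 while the other columns are at most in the hundreds, so unscaled Euclidean distances are dominated by that one column (see the scaling sketch above).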

Calculate F1 Score

In [77]:
print('Decision Tree:', f1_score(y_test,predicted_tree))
print('Random Forest:', f1_score(y_test,predicted_rf))
print('Naive Bayes:', f1_score(y_test,predicted_nb))
print('KNN:', f1_score(y_test,predicted_knn))
Decision Tree: 0.2809224318658281
Random Forest: 0.4844517184942717
Naive Bayes: 0.3035019455252918
KNN: 0.3373134328358209

Confusion Matrix

In [80]:
from sklearn.metrics import confusion_matrix
In [82]:
print('Decision Tree:', confusion_matrix(y_test,predicted_tree))
print('Random Forest:', confusion_matrix(y_test,predicted_rf))
print('Naive Bayes:', confusion_matrix(y_test,predicted_nb))
print('KNN:', confusion_matrix(y_test,predicted_knn))
Decision Tree: [[573  44]
 [299  67]]
Random Forest: [[520  97]
 [218 148]]
Naive Bayes: [[547  70]
 [288  78]]
KNN: [[426 191]
 [253 113]]
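
classification_report, imported earlier but not yet used, summarizes per-class precision, recall, and F1 in one table; for example, for the random forest:

In [ ]:
# Per-class precision, recall, and F1 for the best-scoring model
print(classification_report(y_test, predicted_rf))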

Accuracy

In [87]:
accuracy_scores = {'Naive Bayes': accuracy_score(y_test, predicted_nb),
    'KNN': accuracy_score(y_test, predicted_knn),
    'Decision Tree': clf.score(x_test, y_test),
    'Random Forest': score}

plt.figure(figsize=(8, 5))
plt.bar(accuracy_scores.keys(), accuracy_scores.values(), color=['blue', 'green', 'orange', 'purple'])
plt.title('Accuracy Comparison Between Models')
plt.xlabel('Models')
plt.ylabel('Accuracy Score')
plt.ylim(0, 1)
plt.show()

F1 Score

In [90]:
f1_scores = {'Naive Bayes': f1_score(y_test, predicted_nb),
    'KNN': f1_score(y_test, predicted_knn),
    'Decision Tree': f1_score(y_test, predicted_tree),
    'Random Forest': f1_score(y_test, predicted_rf)}
plt.figure(figsize=(8, 5))
plt.bar(f1_scores.keys(), f1_scores.values(), color=['blue', 'green', 'orange', 'purple'])
plt.title('F1-Score Comparison Between Models')
plt.xlabel('Models')
plt.ylabel('F1 Score')
plt.ylim(0, 1)
plt.show()

Confusion Matrices

In [93]:
fig, axs = plt.subplots(2, 2, figsize=(12, 10))

sns.heatmap(confusion_matrix(y_test, predicted_nb), annot=True, fmt='d', cmap='Blues', ax=axs[0, 0])
axs[0, 0].set_title("Naive Bayes Confusion Matrix")

sns.heatmap(confusion_matrix(y_test, predicted_knn), annot=True, fmt='d', cmap='Greens', ax=axs[0, 1])
axs[0, 1].set_title("KNN Confusion Matrix")

sns.heatmap(confusion_matrix(y_test, predicted_tree), annot=True, fmt='d', cmap='Oranges', ax=axs[1, 0])
axs[1, 0].set_title("Decision Tree Confusion Matrix")

sns.heatmap(confusion_matrix(y_test, predicted_rf), annot=True, fmt='d', cmap='Purples', ax=axs[1, 1])
axs[1, 1].set_title("Random Forest Confusion Matrix")

plt.show()