In [1]:
!pip install scikit-learn
Requirement already satisfied: scikit-learn in /opt/anaconda3/lib/python3.12/site-packages (1.4.2)
Requirement already satisfied: numpy>=1.19.5 in /opt/anaconda3/lib/python3.12/site-packages (from scikit-learn) (1.26.4)
Requirement already satisfied: scipy>=1.6.0 in /opt/anaconda3/lib/python3.12/site-packages (from scikit-learn) (1.13.1)
Requirement already satisfied: joblib>=1.2.0 in /opt/anaconda3/lib/python3.12/site-packages (from scikit-learn) (1.4.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/anaconda3/lib/python3.12/site-packages (from scikit-learn) (2.2.0)
In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
In [5]:
water_df = pd.read_csv('water_potability.csv')
In [7]:
water_df.head()
Out[7]:
|   | ph | Hardness | Solids | Chloramines | Sulfate | Conductivity | Organic_carbon | Trihalomethanes | Turbidity | Potability |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NaN | 204.890455 | 20791.318981 | 7.300212 | 368.516441 | 564.308654 | 10.379783 | 86.990970 | 2.963135 | 0 |
| 1 | 3.716080 | 129.422921 | 18630.057858 | 6.635246 | NaN | 592.885359 | 15.180013 | 56.329076 | 4.500656 | 0 |
| 2 | 8.099124 | 224.236259 | 19909.541732 | 9.275884 | NaN | 418.606213 | 16.868637 | 66.420093 | 3.055934 | 0 |
| 3 | 8.316766 | 214.373394 | 22018.417441 | 8.059332 | 356.886136 | 363.266516 | 18.436524 | 100.341674 | 4.628771 | 0 |
| 4 | 9.092223 | 181.101509 | 17978.986339 | 6.546600 | 310.135738 | 398.410813 | 11.558279 | 31.997993 | 4.075075 | 0 |
In [9]:
water_df.columns
Out[9]:
Index(['ph', 'Hardness', 'Solids', 'Chloramines', 'Sulfate', 'Conductivity',
'Organic_carbon', 'Trihalomethanes', 'Turbidity', 'Potability'],
dtype='object')
In [11]:
water_df.isna().sum()
Out[11]:
ph                 491
Hardness             0
Solids               0
Chloramines          0
Sulfate            781
Conductivity         0
Organic_carbon       0
Trihalomethanes    162
Turbidity            0
Potability           0
dtype: int64
In [13]:
w_cleaned_df = water_df.copy()  # copy so the original frame is left untouched
In [15]:
# Impute each column's missing values with that column's mean.
w_cleaned_df['ph'] = water_df['ph'].fillna(water_df['ph'].mean())
w_cleaned_df['Sulfate'] = water_df['Sulfate'].fillna(water_df['Sulfate'].mean())
w_cleaned_df['Trihalomethanes'] = water_df['Trihalomethanes'].fillna(water_df['Trihalomethanes'].mean())
In [17]:
w_cleaned_df.isna().sum()
Out[17]:
ph                 0
Hardness           0
Solids             0
Chloramines        0
Sulfate            0
Conductivity       0
Organic_carbon     0
Trihalomethanes    0
Turbidity          0
Potability         0
dtype: int64
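Note that the means used above are computed over the full dataset, including rows that later land in the test set; a stricter workflow splits first and imputes afterwards. A minimal sketch of that alternative using scikit-learn's `SimpleImputer` (the names `x_raw`, `x_tr`, etc. are illustrative, not from this notebook):

```python
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Split first, then learn the imputation means on the training fold only,
# so no test-set statistics leak into the model.
x_raw = water_df.drop(['Potability'], axis=1)
y_raw = water_df['Potability']
x_tr, x_te, y_tr, y_te = train_test_split(x_raw, y_raw, test_size=0.3,
                                          random_state=42)

imputer = SimpleImputer(strategy='mean')
x_tr_imputed = imputer.fit_transform(x_tr)  # fit on train only
x_te_imputed = imputer.transform(x_te)      # reuse the train means on test
```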
In [19]:
x = w_cleaned_df.drop(['Potability'], axis=1)  # features
y = w_cleaned_df['Potability']                 # target
In [21]:
x.shape, y.shape
Out[21]:
((3276, 9), (3276,))
In [23]:
from sklearn.model_selection import train_test_split
In [25]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
In [27]:
x_train.shape
Out[27]:
(2293, 9)
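Since the target turns out to be imbalanced (see the value counts below), a stratified split keeps the 0/1 proportions identical across the train and test folds. A quick sketch of that variant of the same split, for reference:

```python
# stratify=y preserves the class proportions in both folds.
x_train_s, x_test_s, y_train_s, y_test_s = train_test_split(
    x, y, test_size=0.3, random_state=42, stratify=y)
```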
In [29]:
from sklearn import tree
In [31]:
clf = tree.DecisionTreeClassifier(max_depth=5)
clf = clf.fit(x_train, y_train)
predicted_tree = clf.predict(x_test)
In [33]:
clf.score(x_test, y_test)
Out[33]:
0.6510681586978637
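A single hold-out score at one depth can be noisy. A small sketch of sweeping `max_depth` with 5-fold cross-validation on the training fold (the depth grid here is arbitrary, and `random_state=42` is added for reproducibility):

```python
from sklearn.model_selection import cross_val_score

# Mean/std of 5-fold CV accuracy for a few candidate tree depths.
for depth in [3, 5, 7, 10, None]:
    scores = cross_val_score(
        tree.DecisionTreeClassifier(max_depth=depth, random_state=42),
        x_train, y_train, cv=5)
    print(f'max_depth={depth}: {scores.mean():.3f} +/- {scores.std():.3f}')
```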
In [35]:
y.value_counts()
Out[35]:
Potability
0    1998
1    1278
Name: count, dtype: int64
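With 1998 of 3276 samples in class 0, a model that always predicts "not potable" already scores about 61% overall (roughly 63% on this particular test fold, where 617 of 983 samples are class 0, as the confusion matrices later confirm). The tree's 65% should be judged against that bar. scikit-learn's `DummyClassifier` makes the baseline explicit:

```python
from sklearn.dummy import DummyClassifier

# Majority-class baseline: always predicts the most frequent training label.
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(x_train, y_train)
print('Baseline accuracy:', dummy.score(x_test, y_test))
```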
In [37]:
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(48, 12), dpi=300)
tree.plot_tree(clf, feature_names = x.columns, filled=True)
plt.show()
In [39]:
from sklearn.ensemble import RandomForestClassifier
In [41]:
rf = RandomForestClassifier(n_estimators=100, max_depth=10, class_weight='balanced', random_state=42)
In [43]:
rf.fit(x_train, y_train)
predicted_rf = rf.predict(x_test)
In [44]:
score = rf.score(x_test, y_test)
print(score)
0.6795523906408952
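Since the forest is the strongest model so far, its impurity-based feature importances give a rough sense of which water-quality measurements drive the predictions. A sketch using the fitted `rf` (this reuses the `pd` and `plt` imports from the top of the notebook):

```python
# Impurity-based importances from the fitted forest, sorted so the
# largest bar appears at the top of the horizontal bar chart.
importances = pd.Series(rf.feature_importances_, index=x.columns)
importances.sort_values().plot(kind='barh', figsize=(8, 5),
                               title='Random Forest Feature Importances')
plt.show()
```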
In [47]:
from sklearn.metrics import accuracy_score, classification_report, f1_score
In [49]:
from sklearn.preprocessing import StandardScaler
In [51]:
scaler = StandardScaler()
In [53]:
x_train_scaled = scaler.fit_transform(x_train)
In [55]:
x_test_scaled = scaler.transform(x_test)
Testing the Naive Bayes and KNN Models
In [58]:
from sklearn.naive_bayes import GaussianNB
In [60]:
nb_model = GaussianNB()
nb_model.fit(x_train, y_train)
Out[60]:
GaussianNB()
In [62]:
predicted_nb = nb_model.predict(x_test)
In [64]:
from sklearn.neighbors import KNeighborsClassifier
In [66]:
knn_model = KNeighborsClassifier(n_neighbors=5)
In [68]:
knn_model.fit(x_train, y_train)
Out[68]:
KNeighborsClassifier()
In [70]:
predicted_knn = knn_model.predict(x_test)
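KNN is distance-based, so on unscaled features its distances are dominated by large-magnitude columns like Solids; that is the motivation for the `StandardScaler` arrays prepared earlier, which the fit above does not use. A sketch refitting KNN on the scaled data (`knn_scaled` is an illustrative name; the score will differ from the unscaled run reported below):

```python
# Refit KNN on the standardized features from the StandardScaler above.
knn_scaled = KNeighborsClassifier(n_neighbors=5)
knn_scaled.fit(x_train_scaled, y_train)
print('KNN (scaled):', knn_scaled.score(x_test_scaled, y_test))
```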
Testing All Models Against Each Other Using: Accuracy / F1 Score / Confusion Matrix
Calculate Accuracy
In [74]:
print('Decision Tree:', clf.score(x_test, y_test))
print('Random Forest:', score)
print('Naive Bayes:', accuracy_score(y_test, predicted_nb))
print('KNN:', accuracy_score(y_test, predicted_knn))
Decision Tree: 0.6510681586978637
Random Forest: 0.6795523906408952
Naive Bayes: 0.6358087487283826
KNN: 0.5483214649033571
Calculate F1 Score
In [77]:
print('Decision Tree:', f1_score(y_test, predicted_tree))
print('Random Forest:', f1_score(y_test, predicted_rf))
print('Naive Bayes:', f1_score(y_test, predicted_nb))
print('KNN:', f1_score(y_test, predicted_knn))
Decision Tree: 0.2809224318658281
Random Forest: 0.4844517184942717
Naive Bayes: 0.3035019455252918
KNN: 0.3373134328358209
Confusion Matrix
In [80]:
from sklearn.metrics import confusion_matrix
In [82]:
print('Decision Tree:', confusion_matrix(y_test, predicted_tree))
print('Random Forest:', confusion_matrix(y_test, predicted_rf))
print('Naive Bayes:', confusion_matrix(y_test, predicted_nb))
print('KNN:', confusion_matrix(y_test, predicted_knn))
Decision Tree: [[573  44]
 [299  67]]
Random Forest: [[520  97]
 [218 148]]
Naive Bayes: [[547  70]
 [288  78]]
KNN: [[426 191]
 [253 113]]
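`classification_report` (imported earlier but unused) condenses per-class precision, recall, and F1 into one table, which makes the low F1 scores above easier to diagnose: in every confusion matrix, recall on the potable class is the weak spot. For example, applied to the random forest (the `target_names` labels are illustrative):

```python
# Per-class precision/recall/F1 for the random forest predictions.
print(classification_report(y_test, predicted_rf,
                            target_names=['Not Potable', 'Potable']))
```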
Accuracy
In [87]:
# Rounded from the accuracy scores printed above.
accuracy_scores = {'Naive Bayes': 0.64, 'KNN': 0.55, 'Decision Tree': 0.65,
                   'Random Forest': 0.68}
plt.figure(figsize=(8, 5))
plt.bar(accuracy_scores.keys(), accuracy_scores.values(), color=['blue', 'green', 'orange', 'purple'])
plt.title('Accuracy Comparison Between Models')
plt.xlabel('Models')
plt.ylabel('Accuracy Score')
plt.ylim(0, 1)
plt.show()
F1 Score
In [90]:
# Rounded from the F1 scores printed above.
f1_scores = {'Naive Bayes': 0.30, 'KNN': 0.34, 'Decision Tree': 0.28,
             'Random Forest': 0.48}
plt.figure(figsize=(8, 5))
plt.bar(f1_scores.keys(), f1_scores.values(), color=['blue', 'green', 'orange', 'purple'])
plt.title('F1-Score Comparison Between Models')
plt.xlabel('Models')
plt.ylabel('F1 Score')
plt.ylim(0, 1)
plt.show()
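Hard-coding the rounded values into the chart dictionaries invites drift between the charts and the printed metrics; computing them from the stored predictions keeps everything in sync. A sketch that could replace both hard-coded dictionaries:

```python
# Build both chart dictionaries directly from the saved predictions.
predictions = {'Naive Bayes': predicted_nb, 'KNN': predicted_knn,
               'Decision Tree': predicted_tree, 'Random Forest': predicted_rf}
accuracy_scores = {name: accuracy_score(y_test, p) for name, p in predictions.items()}
f1_scores = {name: f1_score(y_test, p) for name, p in predictions.items()}
```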
Confusion Matrices
In [93]:
fig, axs = plt.subplots(2, 2, figsize=(12, 10))
sns.heatmap(confusion_matrix(y_test, predicted_nb), annot=True, fmt='d', cmap='Blues', ax=axs[0, 0])
axs[0, 0].set_title("Naive Bayes Confusion Matrix")
sns.heatmap(confusion_matrix(y_test, predicted_knn), annot=True, fmt='d', cmap='Greens', ax=axs[0, 1])
axs[0, 1].set_title("KNN Confusion Matrix")
sns.heatmap(confusion_matrix(y_test, predicted_tree), annot=True, fmt='d', cmap='Oranges', ax=axs[1, 0])
axs[1, 0].set_title("Decision Tree Confusion Matrix")
sns.heatmap(confusion_matrix(y_test, predicted_rf), annot=True, fmt='d', cmap='Purples', ax=axs[1, 1])
axs[1, 1].set_title("Random Forest Confusion Matrix")
plt.show()