K-MEANS CLUSTERING

K-Means clustering is an unsupervised, centroid-based algorithm. It iteratively assigns each point to its nearest centroid and updates the centroids so as to minimize the distance between the points in a cluster and that cluster's centroid.

The dataset used here is the seeds dataset from: https://archive.ics.uci.edu/ml/datasets/seeds


import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

# Read the seeds dataset from a CSV file in the current working directory.
df = pd.read_csv('seeds_dataset.csv')

# Preview of the first rows; the result is only rendered in an interactive
# (notebook/REPL) session and is otherwise discarded.
df.head()

# Keep the two feature columns at positions 2 and 3 as a NumPy array.
# NOTE(review): the tick ranges used when plotting suggest position 2 holds
# perimeter and position 3 holds compactness - confirm against the CSV header.
feature_positions = [2, 3]
z = df.iloc[:, feature_positions].values

# Elbow method: fit K-Means for k = 1..10 and record each model's inertia
# (within-cluster sum of squared distances) to help pick a suitable number
# of clusters, i.e. the point where the curve stops dropping sharply.
from sklearn.cluster import KMeans

k_values = range(1, 11)

# n_init is set explicitly because scikit-learn changed its default from 10
# to 'auto' in version 1.4; pinning it keeps results consistent across versions.
elbow_list = [
    KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=42).fit(z).inertia_
    for k in k_values
]

plt.plot(k_values, elbow_list)
plt.title('The Elbow Method Graph')
plt.xlabel('Number of clusters(k)')
# Label the axis with the plotted quantity rather than the variable name.
plt.ylabel('Inertia (WCSS)')
plt.show()


# Final model: 5 clusters, the value chosen from the elbow plot.
# One plot colour per cluster; the list length also sets the cluster count so
# the two cannot drift apart.
cluster_colors = ['blue', 'red', 'green', 'pink', 'black']

# n_init is set explicitly because scikit-learn changed its default from 10
# to 'auto' in version 1.4; pinning it keeps results consistent across versions.
kmeans = KMeans(n_clusters=len(cluster_colors), init='k-means++', n_init=10, random_state=42)
y_predict = kmeans.fit_predict(z)

# Visualization of the k-means clustering: one scatter series per cluster.
for cluster_id, color in enumerate(cluster_colors):
    members = z[y_predict == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s=100, c=color, label=f'Cluster {cluster_id + 1}')

# Overlay the fitted centroids with a larger marker.
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1], s=300, c='yellow', label='Centroid')

plt.yticks(np.arange(0.8, 1, 0.08))
plt.xticks(np.arange(11, 17, 1))
plt.title('Clusters of seeds based on perimeter and compactness')
# The tick ranges above (x: 11-16, y: 0.8-0.96) match perimeter on the x axis
# and compactness on the y axis, so the labels are assigned accordingly.
# NOTE(review): confirm the column order against the CSV header.
plt.xlabel('Perimeter')
plt.ylabel('Compactness')
plt.legend()
plt.show()




Comments