K-MEANS CLUSTERING
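K-Means clustering is an unsupervised, centroid-based algorithm. It repeatedly assigns each point to its nearest centroid and then recomputes the centroids, so as to minimize the total squared distance between the points in each cluster and that cluster's centroid.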
The dataset I used is the seeds dataset from the UCI Machine Learning Repository: https://archive.ics.uci.edu/ml/datasets/seeds
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Read the seeds dataset from a CSV file in the current working directory.
df = pd.read_csv('seeds_dataset.csv')
# Preview the first rows; the result is only displayed in a notebook/REPL cell.
df.head()
# Use the columns at positions 2 and 3 as the two clustering features.
# NOTE(review): the original author describes these as the compactness and
# perimeter columns - confirm against the CSV header.
feature_positions = [2, 3]
z = df.iloc[:, feature_positions].values
# Elbow method: fit K-Means for k = 1..10 and record each model's inertia
# (within-cluster sum of squared distances). The k where the curve bends
# ("the elbow") is a reasonable choice for the number of clusters.
from sklearn.cluster import KMeans
k_values = range(1, 11)
elbow_list = []
for k in k_values:
    # A fixed random_state keeps the k-means++ initialisation reproducible.
    kmeans = KMeans(n_clusters=k, init='k-means++', random_state=42)
    kmeans.fit(z)
    elbow_list.append(kmeans.inertia_)
plt.plot(k_values, elbow_list)
plt.title('The Elbow Method Graph')
plt.xlabel('Number of clusters(k)')
# Label the axis with the plotted quantity rather than the variable name.
plt.ylabel('Inertia')
plt.show()
# Final model: the author chose k = 5 after inspecting the elbow plot.
# One colour per cluster; the number of clusters is tied to this list so the
# model and the plot cannot drift apart.
cluster_colors = ['blue', 'red', 'green', 'pink', 'black']
kmeans = KMeans(n_clusters=len(cluster_colors), init='k-means++', random_state=42)
# y_predict holds the cluster index (0..4) assigned to each row of z.
y_predict = kmeans.fit_predict(z)
# Visualization of the k-means clustering: one scatter series per cluster,
# with column 0 of z on the x-axis and column 1 on the y-axis.
for cluster_id, color in enumerate(cluster_colors):
    in_cluster = y_predict == cluster_id
    plt.scatter(z[in_cluster, 0], z[in_cluster, 1], s=100, c=color,
                label=f'Cluster {cluster_id + 1}')
# Overlay the fitted centroids with a larger marker.
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1], s=300, c='yellow', label='Centroid')
plt.yticks(np.arange(0.8, 1, 0.08))
plt.xticks(np.arange(11, 17, 1))
plt.title('Clusters of seeds based on perimeter and compactness')
# The x ticks (11-16) match the perimeter range and the y ticks (0.8-0.96)
# match the compactness range, so the axis labels were swapped originally.
# NOTE(review): confirm against the CSV header that column 2 is perimeter.
plt.xlabel('Perimeter')
plt.ylabel('Compactness')
plt.legend()
plt.show()



Comments
Post a Comment