Find The "elbow Point" On An Optimization Curve With Python
i have a list of points which are the inertia values of a kmeans algorithm. To determine the optimum amount of clusters i need to find the point, where this curve starts to flatten
Solution 1:
I worked on a Python package modeled after the Kneedle algorithm. It finds x=5
as the point where the curve starts to flatten. The documentation and the paper discuss the algorithm for choosing the knee point in more detail.
y= [7342.1301373073857, 6881.7109460930769, 6531.1657905495022,
6356.2255554679778, 6209.8382535595829, 6094.9052166741121,
5980.0191582610196, 5880.1869867848218, 5779.8957906367368,
5691.1879324562778, 5617.5153566271356, 5532.2613232619951,
5467.352265375117, 5395.4493783888756, 5345.3459908298091,
5290.6769823693812, 5243.5271656371888, 5207.2501206569532,
5164.9617535255456]
x=range(1,len(y)+1)fromkneedimportKneeLocatorkn=KneeLocator(x,y,curve='convex',direction='decreasing')print(kn.knee)5importmatplotlib.pyplotaspltplt.xlabel('numberofclustersk')plt.ylabel('Sumofsquareddistances')plt.plot(x,y,'bx-')plt.vlines(kn.knee,plt.ylim()[0],plt.ylim()[1],linestyles='dashed')
Solution 2:
For all those who want to do this on their own, here is a little and basic implementation. It is highly adapted to my use case (200 clusters as border for the calculation) and the calculation of the distance is very basic and based to point->point in a 2D space, but it can be adapted to any other amount of figures. I think Kevin's library is technically more up to date and better implemented.
import KMeansClusterer
from math import sqrt, fabs
from matplotlib import pyplot as plp
import multiprocessing as mp
import numpy as np
classClusterCalculator:
m = 0
b = 0
sum_squared_dist = []
derivates = []
distances = []
line_coordinates = []
def__init__(self, calc_border, data):
self.calc_border = calc_border
self.data = data
defcalculate_optimum_clusters(self, option_parser):
if(option_parser.multiProcessing):
self.calc_mp()
else:
self.calculate_squared_dist()
self.init_opt_line()
self.calc_distances()
self.calc_line_coordinates()
opt_clusters = self.get_optimum_clusters()
print("Evaluated", opt_clusters, "as optimum number of clusters")
self.plot_results()
return opt_clusters
defcalculate_squared_dist(self):
for k inrange(1, self.calc_border):
print("Calculating",k, "of", self.calc_border, "\n", (self.calc_border - k), "to go!")
kmeans = KMeansClusterer.KMeansClusterer(k, self.data)
ine = kmeans.calc_custom_params(self.data, k).inertia_
print("inertia in round", k, ": ", ine)
self.sum_squared_dist.append(ine)
definit_opt_line(self):
self. m = (self.sum_squared_dist[0] - self.sum_squared_dist[-1]) / (1 - self.calc_border)
self.b = (1 * self.sum_squared_dist[0] - self.calc_border*self.sum_squared_dist[0]) / (1 - self.calc_border)
defcalc_y_value(self, x_calc):
return self.m * x_calc + self.b
defcalc_line_coordinates(self):
for i inrange(0, len(self.sum_squared_dist)):
self.line_coordinates.append(self.calc_y_value(i))
defcalc_distances(self):
for i inrange(0, self.calc_border):
y_value = self.calc_y_value(i)
d = sqrt(fabs(self.sum_squared_dist[i] - self.calc_y_value(i)))
length_list = len(self.sum_squared_dist)
self.distances.append(sqrt(fabs(self.sum_squared_dist[i] - self.calc_y_value(i))))
print("For border", self.calc_border, ", calculated the following distances: \n", self.distances)
defget_optimum_clusters(self):
return self.distances.index((max(self.distances)))
defplot_results(self):
plp.plot(range(0, self.calc_border), self.sum_squared_dist, "bx-")
plp.plot(range(0, self.calc_border), self.line_coordinates, "bx-")
plp.xlabel("Number of clusters")
plp.ylabel("Sum of squared distances")
plp.show()
defcalculate_squared_dist_sliced_data(self,output, proc_numb, start, end):
temp = []
for k inrange(start, end + 1):
kmeans = KMeansClusterer.KMeansClusterer(k, self.data)
ine = kmeans.calc_custom_params(self.data, k).inertia_
print("Process", proc_numb,"had the CPU,", "calculated", ine, "in round", k)
temp.append(ine)
output.put((proc_numb, temp))
defsort_result_queue(self, result):
result.sort()
result = [r[1] for r in result]
flat_list= [item for sl in result for item in sl]
return flat_list
defcalc_mp(self):
output = mp.Queue()
processes = []
processes.append(mp.Process(target=self.calculate_squared_dist_sliced_data, args=(output, 1, 1, 50)))
processes.append(mp.Process(target=self.calculate_squared_dist_sliced_data, args=(output, 2, 51, 100)))
processes.append(mp.Process(target=self.calculate_squared_dist_sliced_data, args=(output, 3, 101, 150)))
processes.append(mp.Process(target=self.calculate_squared_dist_sliced_data, args=(output, 4, 151, 200)))
for p in processes:
p.start()
#lock code and wait for all processes to finsishfor p in processes:
p.join()
results = [output.get() for p in processes]
self.sum_squared_dist = self.sort_result_queue(results)
Post a Comment for "Find The "elbow Point" On An Optimization Curve With Python"