Source code for pyradar.classifiers.isodata

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright 2012 - 2013
# Matías Herranz <matiasherranz@gmail.com>
# Joaquín Tita <joaquintita@gmail.com>
#
# https://github.com/PyRadar/pyradar
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 3 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library. If not, see <http://www.gnu.org/licenses/>.


import numpy as np
from scipy.cluster import vq

#from pyradar.utils import take_snapshot


def initialize_parameters(parameters=None):
    """Auxiliary function that sets default values for all the
    parameters not given a value by the user.
    """
    parameters = {} if not parameters else parameters

    def safe_pull_value(parameters, key, default):
        return parameters.get(key, default)

    # number of clusters desired
    K = safe_pull_value(parameters, 'K', 5)

    # maximum number of iterations
    I = safe_pull_value(parameters, 'I', 100)

    # maximum number of pairs of clusters which can be merged
    P = safe_pull_value(parameters, 'P', 4)

    # threshold value for the minimum number of samples in each cluster
    # (discarding clusters)
    THETA_M = safe_pull_value(parameters, 'THETA_M', 10)

    # threshold value for standard deviation (for split)
    THETA_S = safe_pull_value(parameters, 'THETA_S', 1)

    # threshold value for pairwise distances (for merge)
    THETA_C = safe_pull_value(parameters, 'THETA_C', 20)

    # percentage of change in clusters between iterations
    # (to stop the algorithm)
    THETA_O = 0.05

    # number of starting clusters; can be fixed or random
    #k = np.random.randint(1, K)
    k = safe_pull_value(parameters, 'k', K)

    # export every parameter above as a module-level global so the
    # other functions can read it
    ret = locals()
    ret.pop('safe_pull_value')
    ret.pop('parameters')
    globals().update(ret)

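# Example (hedged sketch): overriding a subset of the defaults; any key
# omitted from the dict keeps its default value. 'my_params' is an
# illustrative name only, not part of the library API.
#
#   my_params = {'K': 10, 'THETA_C': 15}
#   initialize_parameters(my_params)
#   print K, THETA_C   # -> 10 15 (now module-level globals)
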
def quit_low_change_in_clusters(centers, last_centers, iter):
    """Stop the algorithm when there is little change in the cluster
    centers between iterations.

    :returns: True if the algorithm should stop, otherwise False.
    """
    quit = False
    if centers.shape == last_centers.shape:
        thresholds = np.abs((centers - last_centers) / (last_centers + 1))

        if np.all(thresholds <= THETA_O):  # percent of change in [0:1]
            quit = True
            # print "Isodata(info): Stopped by low threshold at the centers."
            # print "Iteration step: %s" % iter

    return quit

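# Example (hedged sketch): with the default THETA_O = 0.05, centers
# that moved by less than 5% relative to the previous iteration trigger
# the stop. The "+ 1" in the denominator guards against zero centers.
#
#   >>> centers = np.array([10.2, 50.1])
#   >>> last_centers = np.array([10.0, 50.0])
#   >>> np.abs((centers - last_centers) / (last_centers + 1))
#   array([ 0.01818182,  0.00196078])   # both <= 0.05 => stop
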
def merge_clusters(img_class_flat, centers, clusters_list):
    """
    Merge pairs of clusters in 'below_threshold' to form new clusters.
    """
    pair_dists = compute_pairwise_distances(centers)

    first_p_elements = pair_dists[:P]

    below_threshold = [(c1, c2) for d, (c1, c2) in first_p_elements
                       if d < THETA_C]

    if below_threshold:
        k = centers.size
        count_per_cluster = np.zeros(k)
        to_add = np.array([])  # new clusters to add
        to_delete = np.array([])  # clusters to delete

        for cluster in xrange(0, k):
            result = np.where(img_class_flat == clusters_list[cluster])
            indices = result[0]
            count_per_cluster[cluster] = indices.size

        for c1, c2 in below_threshold:
            c1_count = float(count_per_cluster[c1]) + 1
            c2_count = float(count_per_cluster[c2])
            factor = 1.0 / (c1_count + c2_count)
            weight_c1 = c1_count * centers[c1]
            weight_c2 = c2_count * centers[c2]

            value = round(factor * (weight_c1 + weight_c2))

            to_add = np.append(to_add, value)
            to_delete = np.append(to_delete, [c1, c2])

        # delete the old clusters and their indices from the available arrays
        centers = np.delete(centers, to_delete)
        clusters_list = np.delete(clusters_list, to_delete)

        # generate new indices for the new clusters,
        # starting from the max index, 'to_add.size' times
        start = int(clusters_list.max())
        end = to_add.size + start

        centers = np.append(centers, to_add)
        clusters_list = np.append(clusters_list, xrange(start, end))

        centers, clusters_list = sort_arrays_by_first(centers, clusters_list)

    return centers, clusters_list

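# Worked example (hedged): a merged pair is replaced by the rounded,
# count-weighted mean of the two centers; note the "+ 1" the code adds
# to the first count. With centers 10.0 and 20.0 holding n1 = 40 and
# n2 = 60 samples:
#
#   value = round((41 * 10.0 + 60 * 20.0) / (41 + 60))
#         = round(1610.0 / 101) = round(15.94...) = 16.0
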
def compute_pairwise_distances(centers):
    """
    Compute the pairwise distances 'pair_dists' between every two cluster
    centers and return them sorted.

    Returns:
        - a list of tuples, where every tuple has in its first coordinate
          the distance between two clusters, and in its second coordinate
          a tuple with the numbers of the clusters measured.

    Output example:
        [(d1, (cluster_1, cluster_2)),
         (d2, (cluster_3, cluster_4)),
         ...
         (dn, (cluster_n, cluster_n+1))]
    """
    pair_dists = []
    size = centers.size

    for i in xrange(0, size):
        for j in xrange(0, size):
            if i > j:
                d = np.abs(centers[i] - centers[j])
                pair_dists.append((d, (i, j)))

    # return it sorted on the first element
    return sorted(pair_dists)

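# Example (hedged): for centers [0, 10, 25] each unordered pair is
# measured once (i > j) and the list comes back sorted by distance:
#
#   >>> compute_pairwise_distances(np.array([0., 10., 25.]))
#   [(10.0, (1, 0)), (15.0, (2, 1)), (25.0, (2, 0))]
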
def split_clusters(img_flat, img_class_flat, centers, clusters_list):
    """
    Split clusters to form new clusters.
    """
    assert centers.size == clusters_list.size, \
        "ERROR: split() centers and clusters_list size are different"

    delta = 10
    k = centers.size
    count_per_cluster = np.zeros(k)
    stddev = np.array([])

    avg_dists_to_clusters = compute_avg_distance(img_flat, img_class_flat,
                                                 centers, clusters_list)
    d = compute_overall_distance(img_class_flat, avg_dists_to_clusters,
                                 clusters_list)

    # compute the standard deviation of each cluster
    for cluster in xrange(0, k):
        indices = np.where(img_class_flat == clusters_list[cluster])[0]
        count_per_cluster[cluster] = indices.size
        value = ((img_flat[indices] - centers[cluster]) ** 2).sum()
        value /= count_per_cluster[cluster]
        value = np.sqrt(value)
        stddev = np.append(stddev, value)

    cluster = stddev.argmax()
    max_stddev = stddev[cluster]
    max_clusters_list = int(clusters_list.max())

    if max_stddev > THETA_S:
        if avg_dists_to_clusters[cluster] >= d:
            if count_per_cluster[cluster] > (2.0 * THETA_M):
                old_cluster = centers[cluster]
                new_cluster_1 = old_cluster + delta
                new_cluster_2 = old_cluster - delta

                centers = np.delete(centers, cluster)
                clusters_list = np.delete(clusters_list, cluster)

                centers = np.append(centers, [new_cluster_1, new_cluster_2])
                clusters_list = np.append(clusters_list,
                                          [max_clusters_list,
                                           (max_clusters_list + 1)])

                centers, clusters_list = sort_arrays_by_first(centers,
                                                              clusters_list)

    assert centers.size == clusters_list.size, \
        "ERROR: split() centers and clusters_list size are different"

    return centers, clusters_list

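# Hedged sketch of the split rule above: the cluster with the largest
# standard deviation is split only when all three conditions hold --
# its standard deviation exceeds THETA_S, its average intra-cluster
# distance is at least the overall distance d, and it holds more than
# 2 * THETA_M samples. Its center c is then replaced by c + delta and
# c - delta (delta = 10 here), e.g. a center at 120 splits into 130
# and 110.
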
def compute_overall_distance(img_class_flat, avg_dists_to_clusters,
                             clusters_list):
    """
    Computes the overall distance of the samples from their respective
    cluster centers.
    """
    k = avg_dists_to_clusters.size
    total = img_class_flat.size

    count_per_cluster = np.zeros(k)

    for cluster in xrange(0, k):
        indices = np.where(img_class_flat == clusters_list[cluster])[0]
        count_per_cluster[cluster] = indices.size

    d = ((count_per_cluster / total) * avg_dists_to_clusters).sum()

    return d

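# Worked example (hedged): this is the count-weighted mean of the
# per-cluster average distances, d = sum_j (n_j / N) * d_j. With two
# clusters of 30 and 70 samples (N = 100) and average distances 2.0
# and 4.0:
#
#   d = (30/100) * 2.0 + (70/100) * 4.0 = 0.6 + 2.8 = 3.4
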
def compute_avg_distance(img_flat, img_class_flat, centers, clusters_list):
    """
    Computes all the average distances to the center in each cluster.
    """
    k = centers.size
    avg_dists_to_clusters = np.array([])

    for cluster in xrange(0, k):
        indices = np.where(img_class_flat == clusters_list[cluster])[0]

        total_per_cluster = indices.size + 1
        sum_per_cluster = (np.abs(img_flat[indices] - centers[cluster])).sum()

        dj = (sum_per_cluster / float(total_per_cluster))

        avg_dists_to_clusters = np.append(avg_dists_to_clusters, dj)

    return avg_dists_to_clusters

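# Example (hedged): the per-cluster average distance is
# d_j = (1 / (n_j + 1)) * sum_i |x_i - c_j|, where the "+ 1" in the
# denominator guards against empty clusters. For samples [8, 10, 12]
# around center 10:
#
#   d_j = (|8-10| + |10-10| + |12-10|) / (3 + 1) = 4 / 4 = 1.0
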
def discard_clusters(img_class_flat, centers, clusters_list):
    """
    Discard clusters with THETA_M samples or fewer.
    """
    k = centers.shape[0]
    to_delete = np.array([])

    assert centers.size == clusters_list.size, \
        "ERROR: discard_clusters() centers and clusters_list size are different"

    for cluster in xrange(0, k):
        indices = np.where(img_class_flat == clusters_list[cluster])[0]
        total_per_cluster = indices.size
        if total_per_cluster <= THETA_M:
            to_delete = np.append(to_delete, cluster)

    if to_delete.size:
        new_centers = np.delete(centers, to_delete)
        new_clusters_list = np.delete(clusters_list, to_delete)
    else:
        new_centers = centers
        new_clusters_list = clusters_list

    new_centers, new_clusters_list = sort_arrays_by_first(new_centers,
                                                          new_clusters_list)

    # shape_bef = centers.shape[0]
    # shape_aft = new_centers.shape[0]
    # print "Isodata(info): Discarded %s clusters." % (shape_bef - shape_aft)
    # if to_delete.size:
    #     print "Clusters discarded %s" % to_delete

    assert new_centers.size == new_clusters_list.size, \
        "ERROR: discard_clusters() centers and clusters_list size are different"

    return new_centers, new_clusters_list

def update_clusters(img_flat, img_class_flat, centers, clusters_list):
    """
    Update the cluster centers from the samples assigned to them.
    """
    k = centers.shape[0]
    new_centers = np.array([])
    new_clusters_list = np.array([])

    assert centers.size == clusters_list.size, \
        "ERROR: update_clusters() centers and clusters_list size are different"

    for cluster in xrange(0, k):
        indices = np.where(img_class_flat == clusters_list[cluster])[0]
        # get the whole cluster
        cluster_values = img_flat[indices]
        # sum and count the values
        sum_per_cluster = cluster_values.sum()
        total_per_cluster = (cluster_values.size) + 1
        # compute the new center of the cluster
        new_cluster = sum_per_cluster / total_per_cluster

        new_centers = np.append(new_centers, new_cluster)
        new_clusters_list = np.append(new_clusters_list, cluster)

    new_centers, new_clusters_list = sort_arrays_by_first(new_centers,
                                                          new_clusters_list)

    assert new_centers.size == new_clusters_list.size, \
        "ERROR: update_clusters() centers and clusters_list size are different"

    return new_centers, new_clusters_list

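# Worked example (hedged): note the "+ 1" in the denominator, kept from
# the original code as a guard against empty clusters; it biases the
# updated center slightly toward zero. For cluster samples [10, 20, 30]:
#
#   new_center = (10 + 20 + 30) / (3 + 1) = 15   (the plain mean is 20)
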
def initial_clusters(img_flat, k, method="linspace"):
    """
    Define the initial cluster centers at startup.
    By default, the method is "linspace". The other available method is
    "random".
    """
    methods_available = ["linspace", "random"]

    assert method in methods_available, "ERROR: method %s is not valid." \
                                        " Methods available: %s" \
                                        % (method, methods_available)
    if method == "linspace":
        max, min = img_flat.max(), img_flat.min()
        centers = np.linspace(min, max, k)
    elif method == "random":
        start, end = 0, img_flat.size
        indices = np.random.randint(start, end, k)
        centers = img_flat.take(indices)

    return centers

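# Example (hedged): "linspace" spreads k centers evenly over the range
# of the image values, while "random" samples k pixel values directly.
#
#   >>> initial_clusters(np.array([0., 25., 60., 100.]), 5, "linspace")
#   array([   0.,   25.,   50.,   75.,  100.])
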
def sort_arrays_by_first(centers, clusters_list):
    """
    Sort the array 'centers', and use the indices of the sorted centers
    to reorder the array 'clusters_list'.

    Example:
        centers=[22, 33, 0, 11] and clusters_list=[7, 6, 5, 4]
        returns (array([ 0, 11, 22, 33]), array([5, 4, 7, 6]))
    """
    assert centers.size == clusters_list.size, \
        "ERROR: sort_arrays_by_first centers and clusters_list size are not equal"

    indices = np.argsort(centers)

    sorted_centers = centers[indices]
    sorted_clusters_list = clusters_list[indices]

    return sorted_centers, sorted_clusters_list

def isodata_classification(img, parameters=None):
    """
    Classify a numpy 'img' using the Isodata algorithm.

    Parameters:
        - img: an input numpy array that contains the image to classify.
        - parameters: a dictionary with the initial values.
          If 'parameters' is not specified, the algorithm uses the
          defaults set in initialize_parameters():
              + number of clusters desired.                   K = 5
              + max number of iterations.                     I = 100
              + max number of pairs of clusters which can
                be merged.                                    P = 4
              + threshold value for min number of samples
                in each cluster.                              THETA_M = 10
              + threshold value for standard deviation
                (for split).                                  THETA_S = 1
              + threshold value for pairwise distances
                (for merge).                                  THETA_C = 20
              + threshold change in the clusters between
                each iteration.                               THETA_O = 0.05

          Note: if some (or all) parameters are not provided, default
          values will be used.

    Returns:
        - img_class: a numpy array with the classification.
    """
    global K, I, P, THETA_M, THETA_S, THETA_C, THETA_O, k
    initialize_parameters(parameters)

    N, M = img.shape  # for reshaping at the end
    img_flat = img.flatten()
    clusters_list = np.arange(k)  # number of clusters available

    print "Isodata(info): Starting algorithm with %s classes" % k
    centers = initial_clusters(img_flat, k, "linspace")

    for iter in xrange(0, I):
        # print "Isodata(info): Iteration:%s Num Clusters:%s" % (iter, k)
        last_centers = centers.copy()
        # assign each of the samples to the closest cluster center
        img_class_flat, dists = vq.vq(img_flat, centers)

        centers, clusters_list = discard_clusters(img_class_flat,
                                                  centers, clusters_list)
        centers, clusters_list = update_clusters(img_flat,
                                                 img_class_flat,
                                                 centers, clusters_list)
        k = centers.size

        if k <= (K / 2.0):  # too few clusters => split clusters
            centers, clusters_list = split_clusters(img_flat, img_class_flat,
                                                    centers, clusters_list)
        elif k > (K * 2.0):  # too many clusters => merge clusters
            centers, clusters_list = merge_clusters(img_class_flat, centers,
                                                    clusters_list)
        else:  # neither split nor merge is needed
            pass

        k = centers.size

        if quit_low_change_in_clusters(centers, last_centers, iter):
            break

        # take_snapshot(img_class_flat.reshape(N, M), iteration_step=iter)

    print "Isodata(info): Finished with %s classes" % k
    print "Isodata(info): Number of Iterations: %s" % (iter + 1)

    return img_class_flat.reshape(N, M)

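if __name__ == '__main__':
    # Hedged usage sketch: classify a synthetic 128x128 test image with
    # a few overridden parameters. The values below are illustrative
    # only, not tuned recommendations.
    test_img = np.random.randint(0, 255, (128, 128)).astype('float64')
    test_params = {'K': 8, 'I': 50, 'THETA_C': 15}
    img_class = isodata_classification(test_img, test_params)

    # 'img_class' has the same shape as the input; each pixel holds the
    # index of the cluster it was assigned to.
    print "Classes found: %s" % np.unique(img_class).size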