"""
A module of classification schemes for choropleth mapping.
"""
import numpy as np
import scipy.stats as stats
import copy
from sklearn.cluster import KMeans as KMEANS
from warnings import warn as Warn
__author__ = "Sergio J. Rey"
__all__ = [
"MapClassifier",
"quantile",
"BoxPlot",
"EqualInterval",
"FisherJenks",
"FisherJenksSampled",
"JenksCaspall",
"JenksCaspallForced",
"JenksCaspallSampled",
"HeadTailBreaks",
"MaxP",
"MaximumBreaks",
"NaturalBreaks",
"Quantiles",
"Percentiles",
"StdMean",
"UserDefined",
"gadf",
"KClassifiers",
"CLASSIFIERS",
]
CLASSIFIERS = (
"BoxPlot",
"EqualInterval",
"FisherJenks",
"FisherJenksSampled",
"HeadTailBreaks",
"JenksCaspall",
"JenksCaspallForced",
"JenksCaspallSampled",
"MaxP",
"MaximumBreaks",
"NaturalBreaks",
"Quantiles",
"Percentiles",
"StdMean",
"UserDefined",
)
K = 5 # default number of classes in any map scheme with this as an argument
SEEDRANGE = 1000000  # range for drawing random ints for Natural Breaks
FMT = "{:.2f}"
try:
from numba import jit
except ImportError:
def jit(func):
return func
def _format_intervals(mc, fmt="{:.0f}"):
"""
    Helper method to format legend intervals
Parameters
----------
mc: MapClassifier
fmt: str
specification of formatting for legend
Returns
-------
tuple:
edges: list
k strings for class intervals
max_width: int
length of largest interval string
lower_open: boolean
True: lower bound of first interval is open
False: lower bound of first interval is closed
Notes
-----
    For some classifiers, the upper bound of the first interval can be less than the minimum value of the attribute being classified. In these cases ``lower_open=True`` and the lower bound of the first interval is set to ``-np.inf``.
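    Examples
    --------
    A quick sketch of the returned tuple (values shown are illustrative
    and not verified, hence the skip):

    >>> import mapclassify as mc
    >>> cal = mc.load_example()
    >>> mc.classifiers._format_intervals(mc.StdMean(cal))  # doctest: +SKIP
    (['-inf', '-967', '-421', '673', '1219', '4111'], 4, True)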
"""
lowest = mc.y.min()
lower_open = False
if lowest > mc.bins[0]:
        lowest = -np.inf
lower_open = True
edges = [lowest]
edges.extend(mc.bins)
edges = [fmt.format(edge) for edge in edges]
max_width = max([len(edge) for edge in edges])
return edges, max_width, lower_open
def _get_mpl_labels(mc, fmt="{:.1f}"):
"""
Helper method to format legend intervals for matplotlib (and geopandas)
Parameters
----------
mc: MapClassifier
fmt: str
specification of formatting for legend
Returns
-------
intervals: list
k strings for class intervals
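    Examples
    --------
    A quick sketch (interval strings depend on the data and fmt; output
    illustrative, hence the skip):

    >>> import mapclassify as mc
    >>> cal = mc.load_example()
    >>> mc.classifiers._get_mpl_labels(mc.Quantiles(cal, k=5))  # doctest: +SKIP
    ['[   0.1,    1.5]', '(   1.5,    5.8]', '(   5.8,   13.3]', '(  13.3,   54.6]', '(  54.6, 4111.4]']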
"""
edges, max_width, lower_open = _format_intervals(mc, fmt)
k = len(edges) - 1
left = ["["]
if lower_open:
left = ["("]
left.extend("(" * k)
right = "]" * (k + 1)
lower = ["{:>{width}}".format(edges[i], width=max_width) for i in range(k)]
upper = ["{:>{width}}".format(edges[i], width=max_width) for i in range(1, k + 1)]
lower = [l + r for l, r in zip(left, lower)]
upper = [l + r for l, r in zip(upper, right)]
intervals = [l + ", " + r for l, r in zip(lower, upper)]
return intervals
def _get_table(mc, fmt="{:.2f}"):
"""
Helper function to generate tabular classification report
Parameters
----------
mc: MapClassifier
fmt: str
specification of formatting for legend
Returns
-------
table: string
formatted table of classification results
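    Examples
    --------
    A sketch of the report layout (values illustrative, hence the skip):

    >>> import mapclassify as mc
    >>> cal = mc.load_example()
    >>> print(mc.classifiers._get_table(mc.Quantiles(cal, k=5)))  # doctest: +SKIP
    Quantiles
    <BLANKLINE>
         Interval        Count
    --------------------------
    [   0.13,    1.46] |    12
    ...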
"""
intervals = _get_mpl_labels(mc, fmt)
interval_width = len(intervals[0])
counts = list(map(str, mc.counts))
count_width = max([len(count) for count in counts])
count_width = max(count_width, len("count"))
interval_width = max(interval_width, len("interval"))
header = "{:^{width}}".format("Interval", width=interval_width)
header += " " + "{:>{width}}".format("Count", width=count_width)
title = "{:<{width}}".format(mc.name, width=len(header))
header += "\n" + "-" * len(header)
table = [title, "", header]
for i, interval in enumerate(intervals):
row = interval + " | " + "{:>{width}}".format(counts[i], width=count_width)
table.append(row)
return "\n".join(table)
def head_tail_breaks(values, cuts):
"""
head tail breaks helper function
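
    A small worked example: the first cut is the mean of all values, the
    next is the mean of the values above it, and so on until the head has
    a single unique value (skipped because the repr of NumPy scalars
    varies across versions):

    >>> import mapclassify as mc
    >>> mc.classifiers.head_tail_breaks([1, 2, 3, 4, 5, 100], [])  # doctest: +SKIP
    [19.166666666666668, 100.0]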
"""
values = np.array(values)
mean = np.mean(values)
cuts.append(mean)
if len(set(values)) > 1:
return head_tail_breaks(values[values > mean], cuts)
return cuts
def quantile(y, k=4):
"""
Calculates the quantiles for an array
Parameters
----------
y : array
(n,1), values to classify
k : int
number of quantiles
Returns
-------
q : array
        (k,1), quantile values
Examples
--------
>>> import numpy as np
>>> import mapclassify as mc
>>> x = np.arange(1000)
>>> mc.classifiers.quantile(x)
array([249.75, 499.5 , 749.25, 999. ])
>>> mc.classifiers.quantile(x, k = 3)
array([333., 666., 999.])
Note that if there are enough ties that the quantile values repeat, we
collapse to pseudo quantiles in which case the number of classes will be
less than k
>>> x = [1.0] * 100
>>> x.extend([3.0] * 40)
>>> len(x)
140
>>> y = np.array(x)
>>> mc.classifiers.quantile(y)
array([1., 3.])
"""
w = 100.0 / k
p = np.arange(w, 100 + w, w)
if p[-1] > 100.0:
p[-1] = 100.0
q = np.array([stats.scoreatpercentile(y, pct) for pct in p])
q = np.unique(q)
k_q = len(q)
if k_q < k:
Warn(
"Warning: Not enough unique values in array to form k classes", UserWarning
)
Warn("Warning: setting k to %d" % k_q, UserWarning)
return q
def binC(y, bins):
"""
Bin categorical/qualitative data
Parameters
----------
y : array
(n,q), categorical values
bins : array
(k,1), unique values associated with each bin
    Returns
    -------
b : array
(n,q), bin membership, values between 0 and k-1
Examples
--------
>>> import numpy as np
>>> import mapclassify as mc
>>> np.random.seed(1)
>>> x = np.random.randint(2, 8, (10, 3))
>>> bins = list(range(2, 8))
>>> x
array([[7, 5, 6],
[2, 3, 5],
[7, 2, 2],
[3, 6, 7],
[6, 3, 4],
[6, 7, 4],
[6, 5, 6],
[4, 6, 7],
[4, 6, 3],
[3, 2, 7]])
>>> y = mc.classifiers.binC(x, bins)
>>> y
array([[5, 3, 4],
[0, 1, 3],
[5, 0, 0],
[1, 4, 5],
[4, 1, 2],
[4, 5, 2],
[4, 3, 4],
[2, 4, 5],
[2, 4, 1],
[1, 0, 5]])
"""
if np.ndim(y) == 1:
k = 1
n = np.shape(y)[0]
else:
n, k = np.shape(y)
b = np.zeros((n, k), dtype="int")
for i, bin in enumerate(bins):
b[np.nonzero(y == bin)] = i
# check for non-binned items and warn if needed
vals = set(y.flatten())
for val in vals:
if val not in bins:
Warn("value not in bin: {}".format(val), UserWarning)
Warn("bins: {}".format(bins), UserWarning)
return b
def bin(y, bins):
"""
bin interval/ratio data
Parameters
----------
y : array
(n,q), values to bin
bins : array
(k,1), upper bounds of each bin (monotonic)
Returns
-------
b : array
        (n,q), bin membership, values between 0 and k-1
Examples
--------
>>> import numpy as np
>>> import mapclassify as mc
>>> np.random.seed(1)
>>> x = np.random.randint(2, 20, (10, 3))
>>> bins = [10, 15, 20]
>>> b = mc.classifiers.bin(x, bins)
>>> x
array([[ 7, 13, 14],
[10, 11, 13],
[ 7, 17, 2],
[18, 3, 14],
[ 9, 15, 8],
[ 7, 13, 12],
[16, 6, 11],
[19, 2, 15],
[11, 11, 9],
[ 3, 2, 19]])
>>> b
array([[0, 1, 1],
[0, 1, 1],
[0, 2, 0],
[2, 0, 1],
[0, 1, 0],
[0, 1, 1],
[2, 0, 1],
[2, 0, 1],
[1, 1, 0],
[0, 0, 2]])
"""
if np.ndim(y) == 1:
k = 1
n = np.shape(y)[0]
else:
n, k = np.shape(y)
b = np.zeros((n, k), dtype="int")
i = len(bins)
    if not isinstance(bins, list):
bins = bins.tolist()
binsc = copy.copy(bins)
while binsc:
i -= 1
c = binsc.pop(-1)
b[np.nonzero(y <= c)] = i
return b
def bin1d(x, bins):
"""
Place values of a 1-d array into bins and determine counts of values in
each bin
Parameters
----------
x : array
(n, 1), values to bin
bins : array
(k,1), upper bounds of each bin (monotonic)
Returns
-------
binIds : array
1-d array of integer bin Ids
    counts : array
        (k,1), number of elements of x falling in each bin
Examples
--------
>>> import numpy as np
>>> import mapclassify as mc
>>> x = np.arange(100, dtype = 'float')
>>> bins = [25, 74, 100]
>>> binIds, counts = mc.classifiers.bin1d(x, bins)
>>> binIds
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
>>> counts
array([26, 49, 25])
"""
left = [-float("inf")]
left.extend(bins[0:-1])
right = bins
cuts = list(zip(left, right))
k = len(bins)
binIds = np.zeros(x.shape, dtype="int")
while cuts:
k -= 1
l, r = cuts.pop(-1)
binIds += (x > l) * (x <= r) * k
counts = np.bincount(binIds, minlength=len(bins))
return (binIds, counts)
def load_example():
"""
Helper function for doc tests
"""
from .datasets import calemp
return calemp.load()
def _kmeans(y, k=5, n_init=10):
"""
Helper function to do k-means in one dimension
Parameters
----------
y : array
(n,1), values to classify
k : int
number of classes to form
n_init : int, default: 10
        number of initial solutions. Best of initial results is returned.

    Returns
    -------
    tuple
        (class_ids, cuts, fit, centroids) : the class membership of each
        value, the sorted class maxima, the total squared deviation from
        the class centroids, and the centroids themselves.
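    Examples
    --------
    A sketch (assignments depend on the random initialization; output
    illustrative, hence the skip):

    >>> import numpy as np
    >>> from mapclassify.classifiers import _kmeans
    >>> ids, cuts, fit, centroids = _kmeans(np.arange(12), k=3)
    >>> cuts  # doctest: +SKIP
    array([ 3.,  7., 11.])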
"""
y = y * 1.0 # KMEANS needs float or double dtype
y.shape = (-1, 1)
result = KMEANS(n_clusters=k, init="k-means++", n_init=n_init).fit(y)
class_ids = result.labels_
centroids = result.cluster_centers_
binning = []
for c in range(k):
values = y[class_ids == c]
binning.append([values.max(), len(values)])
binning = np.array(binning)
binning = binning[binning[:, 0].argsort()]
cuts = binning[:, 0]
y_cent = np.zeros_like(y)
for c in range(k):
y_cent[class_ids == c] = centroids[c]
diffs = y - y_cent
diffs *= diffs
return class_ids, cuts, diffs.sum(), centroids
def natural_breaks(values, k=5, init=10):
"""
natural breaks helper function
Jenks natural breaks is kmeans in one dimension
Parameters
----------
values : array
(n, 1) values to bin
k : int
Number of classes
init: int, default:10
        Number of different solutions to obtain using different centroids.
        The best solution is returned.

    Returns
    -------
    tuple
        (sids, class_ids, fit, cuts) : the k-means centroids, the class
        membership of each value, the total squared deviation from the
        class centroids, and the sorted class maxima.
"""
values = np.array(values)
uv = np.unique(values)
uvk = len(uv)
if uvk < k:
Warn(
"Warning: Not enough unique values in array to form k classes", UserWarning
)
Warn("Warning: setting k to %d" % uvk, UserWarning)
k = uvk
kres = _kmeans(values, k, n_init=init)
sids = kres[-1] # centroids
fit = kres[-2]
class_ids = kres[0]
cuts = kres[1]
return (sids, class_ids, fit, cuts)
@jit
def _fisher_jenks_means(values, classes=5, sort=True):
"""
Jenks Optimal (Natural Breaks) algorithm implemented in Python.
Notes
-----
The original Python code comes from here:
http://danieljlewis.org/2010/06/07/jenks-natural-breaks-algorithm-in-python/
and is based on a JAVA and Fortran code available here:
https://stat.ethz.ch/pipermail/r-sig-geo/2006-March/000811.html
Returns class breaks such that classes are internally homogeneous while
assuring heterogeneity among classes.
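    Examples
    --------
    A minimal sketch: with two well separated groups the single interior
    break should fall at the top of the lower group (output illustrative,
    hence the skip):

    >>> import numpy as np
    >>> from mapclassify.classifiers import _fisher_jenks_means
    >>> _fisher_jenks_means(np.array([1, 2, 3, 12, 13, 14]), classes=2)  # doctest: +SKIP
    array([ 1,  3, 14])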
"""
if sort:
values.sort()
n_data = len(values)
mat1 = np.zeros((n_data + 1, classes + 1), dtype=np.int32)
mat2 = np.zeros((n_data + 1, classes + 1), dtype=np.float32)
mat1[1, 1:] = 1
mat2[2:, 1:] = np.inf
v = np.float32(0)
for l in range(2, len(values) + 1):
s1 = np.float32(0)
s2 = np.float32(0)
w = np.float32(0)
for m in range(1, l + 1):
i3 = l - m + 1
val = np.float32(values[i3 - 1])
s2 += val * val
s1 += val
w += np.float32(1)
v = s2 - (s1 * s1) / w
i4 = i3 - 1
if i4 != 0:
for j in range(2, classes + 1):
if mat2[l, j] >= (v + mat2[i4, j - 1]):
mat1[l, j] = i3
mat2[l, j] = v + mat2[i4, j - 1]
mat1[l, 1] = 1
mat2[l, 1] = v
k = len(values)
kclass = np.zeros(classes + 1, dtype=values.dtype)
kclass[classes] = values[len(values) - 1]
kclass[0] = values[0]
for countNum in range(classes, 1, -1):
pivot = mat1[k, countNum]
id = int(pivot - 2)
kclass[countNum - 1] = values[id]
k = int(pivot - 1)
return kclass
class MapClassifier(object):
r"""
Abstract class for all map classifications :cite:`Slocum_2009`
For an array :math:`y` of :math:`n` values, a map classifier places each
value :math:`y_i` into one of :math:`k` mutually exclusive and exhaustive
classes. Each classifer defines the classes based on different criteria,
but in all cases the following hold for the classifiers in PySAL:
.. math:: C_j^l < y_i \le C_j^u \ \forall i \in C_j
where :math:`C_j` denotes class :math:`j` which has lower bound
:math:`C_j^l` and upper bound :math:`C_j^u`.
Map Classifiers Supported
* :class:`mapclassify.classifiers.BoxPlot`
* :class:`mapclassify.classifiers.EqualInterval`
* :class:`mapclassify.classifiers.FisherJenks`
* :class:`mapclassify.classifiers.FisherJenksSampled`
* :class:`mapclassify.classifiers.HeadTailBreaks`
* :class:`mapclassify.classifiers.JenksCaspall`
* :class:`mapclassify.classifiers.JenksCaspallForced`
* :class:`mapclassify.classifiers.JenksCaspallSampled`
* :class:`mapclassify.classifiers.MaxP`
* :class:`mapclassify.classifiers.MaximumBreaks`
* :class:`mapclassify.classifiers.NaturalBreaks`
* :class:`mapclassify.classifiers.Quantiles`
* :class:`mapclassify.classifiers.Percentiles`
* :class:`mapclassify.classifiers.StdMean`
* :class:`mapclassify.classifiers.UserDefined`
Utilities:
In addition to the classifiers, there are several utility functions that
can be used to evaluate the properties of a specific classifier,
or for automatic selection of a classifier and
number of classes.
* :func:`mapclassify.classifiers.gadf`
    * :class:`mapclassify.classifiers.KClassifiers`
"""
def __init__(self, y):
y = np.asarray(y).flatten()
self.name = "Map Classifier"
self.fmt = FMT
self.y = y
self._classify()
self._summary()
def get_fmt(self):
return self._fmt
def set_fmt(self, fmt):
self._fmt = fmt
fmt = property(get_fmt, set_fmt)
def _summary(self):
yb = self.yb
self.classes = [np.nonzero(yb == c)[0].tolist() for c in range(self.k)]
self.tss = self.get_tss()
self.adcm = self.get_adcm()
self.gadf = self.get_gadf()
def _classify(self):
if min(self.y) == max(self.y):
raise ValueError("Minimum and maximum of input data are equal, cannot create bins.")
else:
self._set_bins()
self.yb, self.counts = bin1d(self.y, self.bins)
def _update(self, data, *args, **kwargs):
"""
The only thing that *should* happen in this function is
1. input sanitization for pandas
2. classification/reclassification.
Using their __init__ methods, all classifiers can re-classify given
different input parameters or additional data.
        If you've got a cleverer updating equation than the initial estimation
equation, remove the call to self.__init__ below and replace it with
the updating function.
"""
if data is not None:
data = np.asarray(data).flatten()
data = np.append(data.flatten(), self.y)
else:
data = self.y
self.__init__(data, *args, **kwargs)
@classmethod
def make(cls, *args, **kwargs):
"""
Configure and create a classifier that will consume data and produce
classifications, given the configuration options specified by this
function.
        Note that this is like a *partial application* of the relevant class
constructor. `make` creates a function that returns classifications; it
does not actually do the classification.
If you want to classify data directly, use the appropriate class
        constructor, like Quantiles, MaximumBreaks, etc.
If you *have* a classifier object, but want to find which bins new data
falls into, use find_bin.
Parameters
----------
*args : required positional arguments
all positional arguments required by the classifier,
excluding the input data.
rolling : bool
a boolean configuring the outputted classifier to use
a rolling classifier rather than a new classifier for
each input. If rolling, this adds the current data to
all of the previous data in the classifier, and
rebalances the bins, like a running median
computation.
return_object : bool
a boolean configuring the outputted classifier to
return the classifier object or not
return_bins : bool
a boolean configuring the outputted classifier to
return the bins/breaks or not
return_counts : bool
a boolean configuring the outputted classifier to
return the histogram of objects falling into each bin
or not
Returns
-------
A function that consumes data and returns their bins (and object,
bins/breaks, or counts, if requested).
Note
----
This is most useful when you want to run a classifier many times
with a given configuration, such as when classifying many columns of an
array or dataframe using the same configuration.
Examples
--------
>>> import libpysal as ps
>>> import mapclassify as mc
>>> import geopandas as gpd
>>> df = gpd.read_file(ps.examples.get_path('columbus.dbf'))
>>> classifier = mc.Quantiles.make(k=9)
>>> cl = df[['HOVAL', 'CRIME', 'INC']].apply(classifier)
>>> cl["HOVAL"].values[:10]
array([8, 7, 2, 4, 1, 3, 8, 5, 7, 8])
>>> cl["CRIME"].values[:10]
array([0, 1, 3, 4, 6, 2, 0, 5, 3, 4])
>>> cl["INC"].values[:10]
array([7, 8, 5, 0, 3, 5, 0, 3, 6, 4])
>>> import pandas as pd; from numpy import linspace as lsp
>>> data = [lsp(3,8,num=10), lsp(10, 0, num=10), lsp(-5, 15, num=10)]
>>> data = pd.DataFrame(data).T
>>> data
0 1 2
0 3.000000 10.000000 -5.000000
1 3.555556 8.888889 -2.777778
2 4.111111 7.777778 -0.555556
3 4.666667 6.666667 1.666667
4 5.222222 5.555556 3.888889
5 5.777778 4.444444 6.111111
6 6.333333 3.333333 8.333333
7 6.888889 2.222222 10.555556
8 7.444444 1.111111 12.777778
9 8.000000 0.000000 15.000000
>>> data.apply(mc.Quantiles.make(rolling=True))
0 1 2
0 0 4 0
1 0 4 0
2 1 4 0
3 1 3 0
4 2 2 1
5 2 1 2
6 3 0 4
7 3 0 4
8 4 0 4
9 4 0 4
>>> dbf = ps.io.open(ps.examples.get_path('baltim.dbf'))
>>> data = dbf.by_col_array('PRICE', 'LOTSZ', 'SQFT')
>>> my_bins = [1, 10, 20, 40, 80]
>>> cl = [mc.UserDefined.make(bins=my_bins)(a) for a in data.T]
>>> len(cl)
3
>>> cl[0][:10]
array([4, 5, 5, 5, 4, 4, 5, 4, 4, 5])
"""
# only flag overrides return flag
to_annotate = copy.deepcopy(kwargs)
return_object = kwargs.pop("return_object", False)
return_bins = kwargs.pop("return_bins", False)
return_counts = kwargs.pop("return_counts", False)
rolling = kwargs.pop("rolling", False)
if rolling:
# just initialize a fake classifier
data = list(range(10))
cls_instance = cls(data, *args, **kwargs)
# and empty it, since we'll be using the update
cls_instance.y = np.array([])
else:
cls_instance = None
# wrap init in a closure to make a consumer.
# Qc Na: "Objects/Closures are poor man's Closures/Objects"
def classifier(data, cls_instance=cls_instance):
if rolling:
cls_instance.update(data, inplace=True, **kwargs)
yb = cls_instance.find_bin(data)
else:
cls_instance = cls(data, *args, **kwargs)
yb = cls_instance.yb
outs = [yb, None, None, None]
outs[1] = cls_instance if return_object else None
outs[2] = cls_instance.bins if return_bins else None
outs[3] = cls_instance.counts if return_counts else None
outs = [a for a in outs if a is not None]
if len(outs) == 1:
return outs[0]
else:
return outs
# for debugging/jic, keep around the kwargs.
# in future, we might want to make this a thin class, so that we can
# set a custom repr. Call the class `Binner` or something, that's a
# pre-configured Classifier that just consumes data, bins it, &
# possibly updates the bins.
classifier._options = to_annotate
return classifier
def update(self, y=None, inplace=False, **kwargs):
"""
Add data or change classification parameters.
Parameters
----------
y : array
(n,1) array of data to classify
inplace : bool
whether to conduct the update in place or to return a copy
estimated from the additional specifications.
Additional parameters provided in **kwargs are passed to the init
function of the class. For documentation, check the class constructor.
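        Examples
        --------
        A minimal sketch of non-destructive reclassification:

        >>> import mapclassify as mc
        >>> cal = mc.load_example()
        >>> q5 = mc.Quantiles(cal, k=5)
        >>> q4 = q5.update(k=4)  # returns a reclassified copy
        >>> q4.k, q5.k
        (4, 5)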
"""
kwargs.update({"k": kwargs.pop("k", self.k)})
if inplace:
self._update(y, **kwargs)
else:
new = copy.deepcopy(self)
new._update(y, **kwargs)
return new
def __str__(self):
return self.table()
def __repr__(self):
return self.table()
def table(self):
fmt = self.fmt
return _get_table(self, fmt=fmt)
def __call__(self, *args, **kwargs):
"""
This will allow the classifier to be called like it's a function.
Whether or not we want to make this be "find_bin" or "update" is a
design decision.
I like this as find_bin, since a classifier's job should be to classify
the data given to it using the rules estimated from the `_classify()`
function.
"""
return self.find_bin(*args)
def get_tss(self):
"""
Total sum of squares around class means
        Returns the total sum of squared deviations around the class means.
"""
tss = 0
for class_def in self.classes:
if len(class_def) > 0:
yc = self.y[class_def]
css = yc - yc.mean()
css *= css
tss += sum(css)
return tss
def _set_bins(self):
pass
def get_adcm(self):
"""
Absolute deviation around class median (ADCM).
Calculates the absolute deviations of each observation about its class
median as a measure of fit for the classification method.
Returns sum of ADCM over all classes
"""
adcm = 0
for class_def in self.classes:
if len(class_def) > 0:
yc = self.y[class_def]
yc_med = np.median(yc)
ycd = np.abs(yc - yc_med)
adcm += sum(ycd)
return adcm
def get_gadf(self):
"""
        Goodness of absolute deviation of fit (GADF); see
        :func:`mapclassify.classifiers.gadf` for the definition.
"""
adam = (np.abs(self.y - np.median(self.y))).sum()
gadf = 1 - self.adcm / adam
return gadf
def _table_string(self, width=12, decimal=3):
labels, largest = self.get_legend_classes(table=True)
h1 = "Lower"
h1 = h1.center(largest)
h2 = " "
h2 = h2.center(10)
h3 = "Upper"
h3 = h3.center(largest + 1)
largest = "%d" % max(self.counts)
largest = len(largest) + 15
h4 = "Count"
h4 = h4.rjust(largest)
table = []
header = h1 + h2 + h3 + h4
table.append(header)
table.append("=" * len(header))
for i, label in enumerate(labels):
left, right = label.split()
if i == 0:
left = " " * largest
left += " x[i] <= "
else:
left += " < x[i] <= "
row = left + right
cnt = "%d" % self.counts[i]
cnt = cnt.rjust(largest)
row += cnt
table.append(row)
name = self.name
top = name.center(len(row))
table.insert(0, top)
table.insert(1, " ")
table = "\n".join(table)
return table
def find_bin(self, x):
"""
Sort input or inputs according to the current bin estimate
Parameters
----------
x : array or numeric
a value or array of values to fit within the estimated
bins
Returns
-------
a bin index or array of bin indices that classify the input into one of
the classifiers' bins.
        Note that this differs from the similar functionality in
        numpy.digitize(x, self.bins, right=True).
        find_bin always provides the closest bin, so data "outside" the
        classifier, above or below the max/min breaks, are classified into
        the nearest bin. numpy.digitize instead returns len(bins) for data
        greater than the largest bin, but retains 0 for data below the
        lowest bin.
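        Examples
        --------
        A small illustration of the clamping behavior (bins chosen here
        purely for exposition):

        >>> import numpy as np
        >>> import mapclassify as mc
        >>> ud = mc.UserDefined(np.arange(10), bins=[3, 6, 9])
        >>> ud.find_bin([2, 5, 100])  # 100 is clamped into the last bin
        array([0, 1, 2])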
"""
x = np.asarray(x).flatten()
right = np.digitize(x, self.bins, right=True)
if right.max() == len(self.bins):
right[right == len(self.bins)] = len(self.bins) - 1
return right
def get_legend_classes(self, fmt=FMT):
"""
Format the strings for the classes on the legend
Parameters
        ----------
fmt : string
formatting specification
Returns
        -------
        classes : list
k strings with class interval definitions
"""
return _get_mpl_labels(self, fmt)
def plot(
self,
gdf,
border_color="lightgrey",
border_width=0.10,
title=None,
legend=False,
cmap="YlGnBu",
axis_on=True,
legend_kwds={"loc": "lower right", "fmt": FMT},
file_name=None,
dpi=600,
ax=None,
):
"""
        Plot a MapClassifier

        NOTE: Requires matplotlib, and implicitly requires a geopandas
        GeoDataFrame as input.
        Parameters
        ----------
        gdf : geopandas geodataframe
            Contains the geometry column for the choropleth map
        border_color : string, optional
            matplotlib color string to use for polygon border
            (Default: lightgrey)
        border_width : float, optional
            width of polygon border
            (Default: 0.10)
        title : string, optional
            Title of map
            (Default: None)
        legend : boolean, optional
            Show a legend for the classes
            (Default: False)
        cmap : string, optional
            matplotlib color string for color map to fill polygons
            (Default: YlGnBu)
        axis_on : boolean, optional
            Show coordinate axes
            (Default: True)
        legend_kwds : dict, optional
            options for ax.legend()
            (Default: {"loc": "lower right", 'fmt': FMT})
file_name : string, optional
Name of file to save figure to.
(Default: None)
dpi : int, optional
Dots per inch for saved figure
(Default: 600)
ax : matplotlib axis, optional
axis on which to plot the choropleth.
(Default: None, so plots on the current figure)
Returns
-------
f,ax : tuple
matplotlib figure, axis on which the plot is made.
Examples
--------
>>> import libpysal as lp
>>> import geopandas
>>> import mapclassify
>>> gdf = geopandas.read_file(lp.examples.get_path("columbus.shp"))
>>> q5 = mapclassify.Quantiles(gdf.CRIME)
>>> q5.plot(gdf) # doctest: +SKIP
"""
try:
import matplotlib.pyplot as plt
except ImportError:
raise ImportError(
"Mapclassify.plot depends on matplotlib.pyplot, and this was"
"not able to be imported. \nInstall matplotlib to"
"plot spatial classifier."
)
if ax is None:
f = plt.figure()
ax = plt.gca()
else:
f = plt.gcf()
ax = gdf.assign(_cl=self.y).plot(
column="_cl",
ax=ax,
cmap=cmap,
edgecolor=border_color,
linewidth=border_width,
scheme=self.name,
legend=legend,
legend_kwds=legend_kwds,
)
if not axis_on:
ax.axis("off")
if title:
f.suptitle(title)
if file_name:
plt.savefig(file_name, dpi=dpi)
return f, ax
class HeadTailBreaks(MapClassifier):
"""
Head/tail Breaks Map Classification for Heavy-tailed Distributions
Parameters
----------
y : array
(n,1), values to classify
Attributes
----------
yb : array
(n,1), bin ids for observations,
bins : array
(k,1), the upper bounds of each class
k : int
the number of classes
counts : array
(k,1), the number of observations falling in each class
Examples
--------
>>> import numpy as np
>>> import mapclassify as mc
>>> np.random.seed(10)
>>> cal = mc.load_example()
>>> htb = mc.HeadTailBreaks(cal)
>>> htb.k
3
>>> htb.counts
array([50, 7, 1])
>>> htb.bins
array([ 125.92810345, 811.26 , 4111.45 ])
>>> np.random.seed(123456)
>>> x = np.random.lognormal(3, 1, 1000)
>>> htb = mc.HeadTailBreaks(x)
>>> htb.bins
array([ 32.26204423, 72.50205622, 128.07150107, 190.2899093 ,
264.82847377, 457.88157946, 576.76046949])
>>> htb.counts
array([695, 209, 62, 22, 10, 1, 1])
Notes
-----
Head/tail Breaks is a relatively new classification method developed
for data with a heavy-tailed distribution.
Implementation based on contributions by Alessandra Sozzi <alessandra.sozzi@gmail.com>.
For theoretical details see :cite:`Jiang_2013`.
"""
    def __init__(self, y):
MapClassifier.__init__(self, y)
self.name = "HeadTailBreaks"
def _set_bins(self):
x = self.y.copy()
bins = []
bins = head_tail_breaks(x, bins)
self.bins = np.array(bins)
self.k = len(self.bins)
class EqualInterval(MapClassifier):
"""
Equal Interval Classification
Parameters
----------
y : array
(n,1), values to classify
k : int
number of classes required
Attributes
----------
yb : array
(n,1), bin ids for observations,
each value is the id of the class the observation belongs to
yb[i] = j for j>=1 if bins[j-1] < y[i] <= bins[j], yb[i] = 0
otherwise
bins : array
(k,1), the upper bounds of each class
k : int
the number of classes
counts : array
(k,1), the number of observations falling in each class
Examples
--------
>>> import mapclassify as mc
>>> cal = mc.load_example()
>>> ei = mc.EqualInterval(cal, k = 5)
>>> ei.k
5
>>> ei.counts
array([57, 0, 0, 0, 1])
>>> ei.bins
array([ 822.394, 1644.658, 2466.922, 3289.186, 4111.45 ])
Notes
-----
Intervals defined to have equal width:
.. math::
bins_j = min(y)+w*(j+1)
    with :math:`w=\\frac{max(y)-min(y)}{k}`
"""
    def __init__(self, y, k=K):
"""
see class docstring
"""
self.k = k
MapClassifier.__init__(self, y)
self.name = "EqualInterval"
def _set_bins(self):
y = self.y
k = self.k
max_y = max(y)
min_y = min(y)
rg = max_y - min_y
width = rg * 1.0 / k
cuts = np.arange(min_y + width, max_y + width, width)
if len(cuts) > self.k: # handle overshooting
cuts = cuts[0:k]
cuts[-1] = max_y
bins = cuts.copy()
self.bins = bins
class Percentiles(MapClassifier):
"""
Percentiles Map Classification
Parameters
----------
y : array
attribute to classify
pct : array
percentiles default=[1,10,50,90,99,100]
Attributes
----------
yb : array
bin ids for observations (numpy array n x 1)
bins : array
the upper bounds of each class (numpy array k x 1)
k : int
the number of classes
    counts : array
        (k,1), the number of observations falling in each class
Examples
--------
>>> import mapclassify as mc
>>> cal = mc.load_example()
>>> p = mc.Percentiles(cal)
>>> p.bins
array([1.357000e-01, 5.530000e-01, 9.365000e+00, 2.139140e+02,
2.179948e+03, 4.111450e+03])
>>> p.counts
array([ 1, 5, 23, 23, 5, 1])
>>> p2 = mc.Percentiles(cal, pct = [50, 100])
>>> p2.bins
array([ 9.365, 4111.45 ])
>>> p2.counts
array([29, 29])
>>> p2.k
2
"""
    def __init__(self, y, pct=[1, 10, 50, 90, 99, 100]):
self.pct = pct
MapClassifier.__init__(self, y)
self.name = "Percentiles"
def _set_bins(self):
y = self.y
pct = self.pct
self.bins = np.array([stats.scoreatpercentile(y, p) for p in pct])
self.k = len(self.bins)
    def update(self, y=None, inplace=False, **kwargs):
"""
Add data or change classification parameters.
Parameters
----------
y : array
(n,1) array of data to classify
inplace : bool
whether to conduct the update in place or to return a copy
estimated from the additional specifications.
Additional parameters provided in **kwargs are passed to the init
function of the class. For documentation, check the class constructor.
"""
kwargs.update({"pct": kwargs.pop("pct", self.pct)})
if inplace:
self._update(y, **kwargs)
else:
new = copy.deepcopy(self)
new._update(y, **kwargs)
return new
class BoxPlot(MapClassifier):
"""
BoxPlot Map Classification
Parameters
----------
y : array
attribute to classify
hinge : float
multiplier for IQR
Attributes
----------
yb : array
(n,1), bin ids for observations
bins : array
(n,1), the upper bounds of each class (monotonic)
k : int
the number of classes
counts : array
(k,1), the number of observations falling in each class
low_outlier_ids : array
indices of observations that are low outliers
high_outlier_ids : array
indices of observations that are high outliers
Notes
-----
The bins are set as follows::
bins[0] = q[0]-hinge*IQR
bins[1] = q[0]
bins[2] = q[1]
bins[3] = q[2]
bins[4] = q[2]+hinge*IQR
        bins[5] = max(y) (see Notes)
where q is an array of the first three quartiles of y and
IQR=q[2]-q[0]
If q[2]+hinge*IQR > max(y) there will only be 5 classes and no high
outliers, otherwise, there will be 6 classes and at least one high
outlier.
Examples
--------
>>> import mapclassify as mc
>>> cal = mc.load_example()
>>> bp = mc.BoxPlot(cal)
>>> bp.bins
array([-5.287625e+01, 2.567500e+00, 9.365000e+00, 3.953000e+01,
9.497375e+01, 4.111450e+03])
>>> bp.counts
array([ 0, 15, 14, 14, 6, 9])
>>> bp.high_outlier_ids
array([ 0, 6, 18, 29, 33, 36, 37, 40, 42])
>>> cal[bp.high_outlier_ids].values
array([ 329.92, 181.27, 370.5 , 722.85, 192.05, 110.74, 4111.45,
317.11, 264.93])
    >>> import numpy as np
    >>> bx = mc.BoxPlot(np.arange(100))
>>> bx.bins
array([-49.5 , 24.75, 49.5 , 74.25, 148.5 ])
"""
    def __init__(self, y, hinge=1.5):
"""
Parameters
----------
y : array (n,1)
attribute to classify
hinge : float
multiple of inter-quartile range (default=1.5)
"""
self.hinge = hinge
MapClassifier.__init__(self, y)
self.name = "BoxPlot"
def _set_bins(self):
y = self.y
pct = [25, 50, 75, 100]
bins = [stats.scoreatpercentile(y, p) for p in pct]
iqr = bins[-2] - bins[0]
self.iqr = iqr
pivot = self.hinge * iqr
left_fence = bins[0] - pivot
right_fence = bins[-2] + pivot
if right_fence < bins[-1]:
bins.insert(-1, right_fence)
else:
bins[-1] = right_fence
bins.insert(0, left_fence)
self.bins = np.array(bins)
self.k = len(bins)
def _classify(self):
MapClassifier._classify(self)
self.low_outlier_ids = np.nonzero(self.yb == 0)[0]
self.high_outlier_ids = np.nonzero(self.yb == 5)[0]
    def update(self, y=None, inplace=False, **kwargs):
"""
Add data or change classification parameters.
Parameters
----------
y : array
(n,1) array of data to classify
inplace : bool
whether to conduct the update in place or to return a
copy estimated from the additional specifications.
Additional parameters provided in **kwargs are passed to the init
function of the class. For documentation, check the class constructor.
"""
kwargs.update({"hinge": kwargs.pop("hinge", self.hinge)})
if inplace:
self._update(y, **kwargs)
else:
new = copy.deepcopy(self)
new._update(y, **kwargs)
return new
class Quantiles(MapClassifier):
"""
Quantile Map Classification
Parameters
----------
y : array
(n,1), values to classify
k : int
number of classes required
Attributes
----------
yb : array
(n,1), bin ids for observations,
each value is the id of the class the observation belongs to
yb[i] = j for j>=1 if bins[j-1] < y[i] <= bins[j], yb[i] = 0
otherwise
bins : array
(k,1), the upper bounds of each class
k : int
the number of classes
counts : array
(k,1), the number of observations falling in each class
Examples
--------
>>> import mapclassify as mc
>>> cal = mc.load_example()
>>> q = mc.Quantiles(cal, k = 5)
>>> q.bins
array([1.46400e+00, 5.79800e+00, 1.32780e+01, 5.46160e+01, 4.11145e+03])
>>> q.counts
array([12, 11, 12, 11, 12])
"""
    def __init__(self, y, k=K):
self.k = k
MapClassifier.__init__(self, y)
self.name = "Quantiles"
def _set_bins(self):
y = self.y
k = self.k
self.bins = quantile(y, k=k)
class StdMean(MapClassifier):
"""
Standard Deviation and Mean Map Classification
Parameters
----------
y : array
(n,1), values to classify
multiples : array
the multiples of the standard deviation to add/subtract from
the sample mean to define the bins, default=[-2,-1,1,2]
Attributes
----------
yb : array
(n,1), bin ids for observations,
bins : array
(k,1), the upper bounds of each class
k : int
the number of classes
counts : array
(k,1), the number of observations falling in each class
Examples
--------
>>> import mapclassify as mc
>>> cal = mc.load_example()
>>> st = mc.StdMean(cal)
>>> st.k
5
>>> st.bins
array([-967.36235382, -420.71712519, 672.57333208, 1219.21856072,
4111.45 ])
>>> st.counts
array([ 0, 0, 56, 1, 1])
>>>
>>> st3 = mc.StdMean(cal, multiples = [-3, -1.5, 1.5, 3])
>>> st3.bins
array([-1514.00758246, -694.03973951, 945.8959464 , 1765.86378936,
4111.45 ])
>>> st3.counts
array([ 0, 0, 57, 0, 1])
"""
    def __init__(self, y, multiples=[-2, -1, 1, 2]):
self.multiples = multiples
MapClassifier.__init__(self, y)
self.name = "StdMean"
def _set_bins(self):
y = self.y
s = y.std(ddof=1)
m = y.mean()
cuts = [m + s * w for w in self.multiples]
y_max = y.max()
if cuts[-1] < y_max:
cuts.append(y_max)
self.bins = np.array(cuts)
self.k = len(cuts)
    def update(self, y=None, inplace=False, **kwargs):
"""
Add data or change classification parameters.
Parameters
----------
y : array
(n,1) array of data to classify
inplace : bool
whether to conduct the update in place or to return a copy
estimated from the additional specifications.
Additional parameters provided in **kwargs are passed to the init
function of the class. For documentation, check the class constructor.
"""
kwargs.update({"multiples": kwargs.pop("multiples", self.multiples)})
if inplace:
self._update(y, **kwargs)
else:
new = copy.deepcopy(self)
new._update(y, **kwargs)
return new
class MaximumBreaks(MapClassifier):
"""
Maximum Breaks Map Classification
Parameters
----------
y : array
(n, 1), values to classify
k : int
number of classes required
mindiff : float
The minimum difference between class breaks
Attributes
----------
yb : array
(n, 1), bin ids for observations
bins : array
(k, 1), the upper bounds of each class
k : int
the number of classes
counts : array
(k, 1), the number of observations falling in each class (numpy
array k x 1)
Examples
--------
>>> import mapclassify as mc
>>> cal = mc.load_example()
>>> mb = mc.MaximumBreaks(cal, k = 5)
>>> mb.k
5
>>> mb.bins
array([ 146.005, 228.49 , 546.675, 2417.15 , 4111.45 ])
>>> mb.counts
array([50, 2, 4, 1, 1])
"""
    def __init__(self, y, k=5, mindiff=0):
self.k = k
self.mindiff = mindiff
MapClassifier.__init__(self, y)
self.name = "MaximumBreaks"
def _set_bins(self):
xs = self.y.copy()
k = self.k
xs.sort()
min_diff = self.mindiff
diffs = xs[1:] - xs[:-1]
idxs = np.argsort(diffs)
k1 = k - 1
ud = np.unique(diffs)
if len(ud) < k1:
print("Insufficient number of unique diffs. Breaks are random.")
mp = []
for c in range(1, k):
idx = idxs[-c]
cp = (xs[idx] + xs[idx + 1]) / 2.0
mp.append(cp)
mp.append(xs[-1])
mp.sort()
self.bins = np.array(mp)
    def update(self, y=None, inplace=False, **kwargs):
"""
Add data or change classification parameters.
Parameters
----------
y : array
(n,1) array of data to classify
inplace : bool
whether to conduct the update in place or to return a copy
estimated from the additional specifications.
Additional parameters provided in **kwargs are passed to the init
function of the class. For documentation, check the class constructor.
"""
kwargs.update({"k": kwargs.pop("k", self.k)})
kwargs.update({"mindiff": kwargs.pop("mindiff", self.mindiff)})
if inplace:
self._update(y, **kwargs)
else:
new = copy.deepcopy(self)
new._update(y, **kwargs)
return new
class NaturalBreaks(MapClassifier):
"""
Natural Breaks Map Classification
Parameters
----------
y : array
(n,1), values to classify
k : int
number of classes required
initial : int, default: 10
Number of initial solutions generated with different centroids. Best of initial results is returned.
Attributes
----------
yb : array
(n,1), bin ids for observations,
bins : array
(k,1), the upper bounds of each class
k : int
the number of classes
counts : array
(k,1), the number of observations falling in each class
Examples
--------
>>> import numpy as np
>>> import mapclassify as mc
>>> np.random.seed(123456)
>>> cal = mc.load_example()
>>> nb = mc.NaturalBreaks(cal, k=5)
>>> nb.k
5
>>> nb.counts
array([49, 3, 4, 1, 1])
>>> nb.bins
array([ 75.29, 192.05, 370.5 , 722.85, 4111.45])
>>> x = np.array([1] * 50)
>>> x[-1] = 20
>>> nb = mc.NaturalBreaks(x, k = 5)
Warning: Not enough unique values in array to form k classes
Warning: setting k to 2
>>> nb.bins
array([ 1, 20])
>>> nb.counts
array([49, 1])
"""
    def __init__(self, y, k=K, initial=10):
self.k = k
self.init = initial
MapClassifier.__init__(self, y)
self.name = "NaturalBreaks"
def _set_bins(self):
x = self.y.copy()
k = self.k
values = np.array(x)
uv = np.unique(values)
uvk = len(uv)
if uvk < k:
ms = "Warning: Not enough unique values in array to form k classes"
Warn(ms, UserWarning)
Warn("Warning: setting k to %d" % uvk, UserWarning)
k = uvk
uv.sort()
# we set the bins equal to the sorted unique values and ramp k
# downwards. no need to call kmeans.
self.bins = uv
self.k = k
else:
res0 = natural_breaks(x, k, init=self.init)
fit = res0[2]
self.bins = np.array(res0[-1])
self.k = len(self.bins)
    def update(self, y=None, inplace=False, **kwargs):
"""
Add data or change classification parameters.
Parameters
----------
y : array
(n,1) array of data to classify
inplace : bool
whether to conduct the update in place or to return a
copy estimated from the additional specifications.
Additional parameters provided in **kwargs are passed to the init
function of the class. For documentation, check the class constructor.
"""
kwargs.update({"k": kwargs.pop("k", self.k)})
if inplace:
self._update(y, **kwargs)
else:
new = copy.deepcopy(self)
new._update(y, **kwargs)
return new
class FisherJenks(MapClassifier):
"""
Fisher Jenks optimal classifier - mean based
Parameters
----------
y : array
(n,1), values to classify
k : int
number of classes required
Attributes
----------
yb : array
(n,1), bin ids for observations
bins : array
(k,1), the upper bounds of each class
k : int
the number of classes
counts : array
(k,1), the number of observations falling in each class
Examples
--------
>>> import mapclassify as mc
>>> cal = mc.load_example()
>>> fj = mc.FisherJenks(cal)
>>> fj.adcm
799.24
>>> fj.bins
array([ 75.29, 192.05, 370.5 , 722.85, 4111.45])
>>> fj.counts
array([49, 3, 4, 1, 1])
>>>
"""
    def __init__(self, y, k=K):
nu = len(np.unique(y))
if nu < k:
raise ValueError("Fewer unique values than specified classes.")
self.k = k
MapClassifier.__init__(self, y)
self.name = "FisherJenks"
def _set_bins(self):
x = self.y.copy()
self.bins = np.array(_fisher_jenks_means(x, classes=self.k)[1:])
class FisherJenksSampled(MapClassifier):
"""
Fisher Jenks optimal classifier - mean based using random sample
Parameters
----------
y : array
(n,1), values to classify
k : int
number of classes required
pct : float
The percentage of n that should form the sample
If pct is specified such that n*pct > 1000, then
pct = 1000./n, unless truncate is False
truncate : boolean
truncate pct in cases where pct * n > 1000., (Default True)
Attributes
----------
yb : array
(n,1), bin ids for observations
bins : array
(k,1), the upper bounds of each class
k : int
the number of classes
counts : array
(k,1), the number of observations falling in each class
Notes
-----
For theoretical details see :cite:`Rey_2016`.
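    Examples
    --------
    A sketch of the sampling behavior (the breaks vary with the random
    draw, so only deterministic attributes are checked here):

    >>> import numpy as np
    >>> import mapclassify as mc
    >>> x = np.random.exponential(size=10000)
    >>> fjs = mc.FisherJenksSampled(x, k=5)
    >>> fjs.k
    5
    >>> fjs.yb.size == x.size
    True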
"""
    def __init__(self, y, k=K, pct=0.10, truncate=True):
self.k = k
n = y.size
if (pct * n > 1000) and truncate:
pct = 1000.0 / n
ids = np.random.randint(0, n, int(n * pct))
yr = y[ids]
yr[-1] = max(y) # make sure we have the upper bound
yr[0] = min(y) # make sure we have the min
self.original_y = y
self.pct = pct
self._truncated = truncate
self.yr = yr
self.yr_n = yr.size
MapClassifier.__init__(self, yr)
self.yb, self.counts = bin1d(y, self.bins)
self.name = "FisherJenksSampled"
self.y = y
self._summary() # have to recalculate summary stats
def _set_bins(self):
fj = FisherJenks(self.y, self.k)
self.bins = fj.bins
    def update(self, y=None, inplace=False, **kwargs):
"""
Add data or change classification parameters.
Parameters
----------
y : array
(n,1) array of data to classify
inplace : bool
whether to conduct the update in place or to return a
copy estimated from the additional specifications.
Additional parameters provided in **kwargs are passed to the init
function of the class. For documentation, check the class constructor.
"""
kwargs.update({"k": kwargs.pop("k", self.k)})
kwargs.update({"pct": kwargs.pop("pct", self.pct)})
kwargs.update({"truncate": kwargs.pop("truncate", self._truncated)})
if inplace:
self._update(y, **kwargs)
else:
new = copy.deepcopy(self)
new._update(y, **kwargs)
return new
class JenksCaspall(MapClassifier):
"""
Jenks Caspall Map Classification
Parameters
----------
y : array
(n,1), values to classify
k : int
number of classes required
Attributes
----------
yb : array
(n,1), bin ids for observations,
bins : array
(k,1), the upper bounds of each class
k : int
the number of classes
counts : array
(k,1), the number of observations falling in each class
Examples
--------
>>> import mapclassify as mc
>>> cal = mc.load_example()
>>> jc = mc.JenksCaspall(cal, k = 5)
>>> jc.bins
array([1.81000e+00, 7.60000e+00, 2.98200e+01, 1.81270e+02, 4.11145e+03])
>>> jc.counts
array([14, 13, 14, 10, 7])
"""
    def __init__(self, y, k=K):
self.k = k
MapClassifier.__init__(self, y)
self.name = "JenksCaspall"
def _set_bins(self):
x = self.y.copy()
k = self.k
# start with quantiles
q = quantile(x, k)
solving = True
xb, cnts = bin1d(x, q)
# class means
if x.ndim == 1:
x.shape = (x.size, 1)
n, k = x.shape
xm = [np.median(x[xb == i]) for i in np.unique(xb)]
xb0 = xb.copy()
q = xm
it = 0
rk = list(range(self.k))
while solving:
xb = np.zeros(xb0.shape, int)
d = abs(x - q)
xb = d.argmin(axis=1)
if (xb0 == xb).all():
solving = False
else:
xb0 = xb
it += 1
q = np.array([np.median(x[xb == i]) for i in rk])
cuts = np.array([max(x[xb == i]) for i in np.unique(xb)])
cuts.shape = (len(cuts),)
self.bins = cuts
self.iterations = it
class JenksCaspallSampled(MapClassifier):
"""
Jenks Caspall Map Classification using a random sample
Parameters
----------
y : array
(n,1), values to classify
k : int
number of classes required
pct : float
The percentage of n that should form the sample
If pct is specified such that n*pct > 1000, then pct = 1000./n
Attributes
----------
yb : array
(n,1), bin ids for observations,
bins : array
(k,1), the upper bounds of each class
k : int
the number of classes
counts : array
(k,1), the number of observations falling in each class
Examples
--------
>>> import mapclassify as mc
>>> cal = mc.load_example()
    >>> import numpy as np
    >>> x = np.random.random(100000)
>>> jc = mc.JenksCaspall(x)
>>> jcs = mc.JenksCaspallSampled(x)
>>> jc.bins
array([0.1988721 , 0.39624334, 0.59441487, 0.79624357, 0.99999251])
>>> jcs.bins
array([0.20998558, 0.42112792, 0.62752937, 0.80543819, 0.99999251])
>>> jc.counts
array([19943, 19510, 19547, 20297, 20703])
>>> jcs.counts
array([21039, 20908, 20425, 17813, 19815])
# not for testing since we get different times on different hardware
# just included for documentation of likely speed gains
    #>>> t1 = time.time(); jc = mc.JenksCaspall(x); t2 = time.time()
    #>>> t1s = time.time(); jcs = mc.JenksCaspallSampled(x); t2s = time.time()
#>>> t2 - t1; t2s - t1s
#1.8292930126190186
#0.061631917953491211
Notes
-----
    This is intended for large n problems. The logic is to apply
    JenksCaspall to a random subset of the y space and then bin the
    complete vector y on the bins obtained from the subset. This
    trades off some "accuracy" for a gain in speed.
"""
    def __init__(self, y, k=K, pct=0.10):
self.k = k
n = y.size
if pct * n > 1000:
pct = 1000.0 / n
ids = np.random.randint(0, n, int(n * pct))
yr = y[ids]
yr[0] = max(y) # make sure we have the upper bound
self.original_y = y
self.pct = pct
self.yr = yr
self.yr_n = yr.size
MapClassifier.__init__(self, yr)
self.yb, self.counts = bin1d(y, self.bins)
self.name = "JenksCaspallSampled"
self.y = y
self._summary() # have to recalculate summary stats
def _set_bins(self):
jc = JenksCaspall(self.y, self.k)
self.bins = jc.bins
self.iterations = jc.iterations
    def update(self, y=None, inplace=False, **kwargs):
"""
Add data or change classification parameters.
Parameters
----------
y : array
(n,1) array of data to classify
inplace : bool
whether to conduct the update in place or to return a
copy estimated from the additional specifications.
Additional parameters provided in **kwargs are passed to the init
function of the class. For documentation, check the class constructor.
"""
kwargs.update({"k": kwargs.pop("k", self.k)})
kwargs.update({"pct": kwargs.pop("pct", self.pct)})
if inplace:
self._update(y, **kwargs)
else:
new = copy.deepcopy(self)
new._update(y, **kwargs)
return new
class JenksCaspallForced(MapClassifier):
"""
Jenks Caspall Map Classification with forced movements
Parameters
----------
y : array
(n,1), values to classify
k : int
number of classes required
Attributes
----------
yb : array
(n,1), bin ids for observations
bins : array
(k,1), the upper bounds of each class
k : int
the number of classes
counts : array
(k,1), the number of observations falling in each class
Examples
--------
>>> import mapclassify as mc
>>> cal = mc.load_example()
>>> jcf = mc.JenksCaspallForced(cal, k = 5)
>>> jcf.k
5
>>> jcf.bins
array([1.34000e+00, 5.90000e+00, 1.67000e+01, 5.06500e+01, 4.11145e+03])
>>> jcf.counts
array([12, 12, 13, 9, 12])
>>> jcf4 = mc.JenksCaspallForced(cal, k = 4)
>>> jcf4.k
4
>>> jcf4.bins
array([2.51000e+00, 8.70000e+00, 3.66800e+01, 4.11145e+03])
>>> jcf4.counts
array([15, 14, 14, 15])
"""
    def __init__(self, y, k=K):
self.k = k
MapClassifier.__init__(self, y)
self.name = "JenksCaspallForced"
def _set_bins(self):
x = self.y.copy()
k = self.k
q = quantile(x, k)
solving = True
xb, cnt = bin1d(x, q)
# class means
if x.ndim == 1:
x.shape = (x.size, 1)
n, tmp = x.shape
xm = [x[xb == i].mean() for i in np.unique(xb)]
q = xm
xbar = np.array([xm[xbi] for xbi in xb])
xbar.shape = (n, 1)
ss = x - xbar
ss *= ss
ss = sum(ss)
down_moves = up_moves = 0
solving = True
it = 0
while solving:
# try upward moves first
moving_up = True
while moving_up:
class_ids = np.unique(xb)
nk = [sum(xb == j) for j in class_ids]
candidates = nk[:-1]
i = 0
up_moves = 0
while candidates:
nki = candidates.pop(0)
if nki > 1:
ids = np.nonzero(xb == class_ids[i])
mover = max(ids[0])
tmp = xb.copy()
tmp[mover] = xb[mover] + 1
tm = [x[tmp == j].mean() for j in np.unique(tmp)]
txbar = np.array([tm[xbi] for xbi in tmp])
txbar.shape = (n, 1)
tss = x - txbar
tss *= tss
tss = sum(tss)
if tss < ss:
xb = tmp
ss = tss
candidates = []
up_moves += 1
i += 1
if not up_moves:
moving_up = False
moving_down = True
while moving_down:
class_ids = np.unique(xb)
nk = [sum(xb == j) for j in class_ids]
candidates = nk[1:]
i = 1
down_moves = 0
while candidates:
nki = candidates.pop(0)
if nki > 1:
ids = np.nonzero(xb == class_ids[i])
mover = min(ids[0])
mover_class = xb[mover]
target_class = mover_class - 1
tmp = xb.copy()
tmp[mover] = target_class
tm = [x[tmp == j].mean() for j in np.unique(tmp)]
txbar = np.array([tm[xbi] for xbi in tmp])
txbar.shape = (n, 1)
tss = x - txbar
tss *= tss
tss = sum(tss)
if tss < ss:
xb = tmp
ss = tss
candidates = []
down_moves += 1
i += 1
if not down_moves:
moving_down = False
if not up_moves and not down_moves:
solving = False
it += 1
cuts = [max(x[xb == c]) for c in np.unique(xb)]
cuts = np.reshape(np.array(cuts), (k,))
self.bins = cuts
self.iterations = it
class UserDefined(MapClassifier):
"""
User Specified Binning
Parameters
----------
y : array
(n,1), values to classify
bins : array
        (k,1), upper bounds of classes (have to be monotonically increasing)
Attributes
----------
yb : array
(n,1), bin ids for observations,
bins : array
(k,1), the upper bounds of each class
k : int
the number of classes
counts : array
(k,1), the number of observations falling in each class
Examples
--------
>>> import mapclassify as mc
>>> cal = mc.load_example()
>>> bins = [20, max(cal)]
>>> bins
[20, 4111.45]
>>> ud = mc.UserDefined(cal, bins)
>>> ud.bins
array([ 20. , 4111.45])
>>> ud.counts
array([37, 21])
>>> bins = [20, 30]
>>> ud = mc.UserDefined(cal, bins)
>>> ud.bins
array([ 20. , 30. , 4111.45])
>>> ud.counts
array([37, 4, 17])
Notes
-----
If upper bound of user bins does not exceed max(y) we append an
additional bin.
"""
    def __init__(self, y, bins):
if bins[-1] < max(y):
bins = np.append(bins, max(y))
self.k = len(bins)
self.bins = np.array(bins)
self.y = y
MapClassifier.__init__(self, y)
self.name = "UserDefined"
def _set_bins(self):
pass
def _update(self, y=None, bins=None):
if y is not None:
if hasattr(y, "values"):
y = y.values
y = np.append(y.flatten(), self.y)
else:
y = self.y
if bins is None:
bins = self.bins
self.__init__(y, bins)
    def update(self, y=None, inplace=False, **kwargs):
"""
Add data or change classification parameters.
Parameters
----------
y : array
(n,1) array of data to classify
inplace : bool
whether to conduct the update in place or to return a
copy estimated from the additional specifications.
Additional parameters provided in **kwargs are passed to the init
function of the class. For documentation, check the class constructor.
"""
bins = kwargs.pop("bins", self.bins)
if inplace:
self._update(y=y, bins=bins, **kwargs)
else:
new = copy.deepcopy(self)
new._update(y, bins, **kwargs)
return new
# We have to override the plot method for additional kwargs for UserDefined
    def plot(
self,
gdf,
border_color="lightgrey",
border_width=0.10,
title=None,
legend=False,
cmap="YlGnBu",
axis_on=True,
legend_kwds={"loc": "lower right", "fmt": FMT},
file_name=None,
dpi=600,
ax=None,
):
try:
import matplotlib.pyplot as plt
except ImportError:
raise ImportError(
"Mapclassify.plot depends on matplotlib.pyplot, and this was"
"not able to be imported. \nInstall matplotlib to"
"plot spatial classifier."
)
if ax is None:
f = plt.figure()
ax = plt.gca()
else:
f = plt.gcf()
fmt = FMT
if "fmt" in legend_kwds:
fmt = legend_kwds.pop("fmt")
ax = gdf.assign(_cl=self.y).plot(
column="_cl",
ax=ax,
cmap=cmap,
edgecolor=border_color,
linewidth=border_width,
scheme=self.name,
legend=legend,
legend_kwds=legend_kwds,
classification_kwds={"bins": self.bins}, # for UserDefined
)
if not axis_on:
ax.axis("off")
if title:
f.suptitle(title)
if file_name:
plt.savefig(file_name, dpi=dpi)
return f, ax
class MaxP(MapClassifier):
"""
MaxP Map Classification
    Based on the Max-p regionalization algorithm
Parameters
----------
y : array
(n,1), values to classify
k : int
number of classes required
initial : int
number of initial solutions to use prior to swapping
Attributes
----------
yb : array
(n,1), bin ids for observations,
bins : array
(k,1), the upper bounds of each class
k : int
the number of classes
counts : array
(k,1), the number of observations falling in each class
Examples
--------
>>> import mapclassify as mc
>>> cal = mc.load_example()
>>> mp = mc.MaxP(cal)
>>> mp.bins
array([ 8.7 , 16.7 , 20.47, 66.26, 4111.45])
>>> mp.counts
array([29, 8, 1, 10, 10])
"""
    def __init__(self, y, k=K, initial=1000):
self.k = k
self.initial = initial
MapClassifier.__init__(self, y)
self.name = "MaxP"
def _set_bins(self):
x = self.y.copy()
k = self.k
q = quantile(x, k)
if x.ndim == 1:
x.shape = (x.size, 1)
n, tmp = x.shape
x.sort(axis=0)
# find best of initial solutions
solution = 0
best_tss = x.var() * x.shape[0]
tss_all = np.zeros((self.initial, 1))
while solution < self.initial:
remaining = list(range(n))
seeds = [
np.nonzero(di == min(di))[0][0] for di in [np.abs(x - qi) for qi in q]
]
rseeds = np.random.permutation(list(range(k))).tolist()
[remaining.remove(seed) for seed in seeds]
self.classes = classes = []
[classes.append([seed]) for seed in seeds]
while rseeds:
seed_id = rseeds.pop()
current = classes[seed_id]
growing = True
while growing:
current = classes[seed_id]
low = current[0]
high = current[-1]
left = low - 1
right = high + 1
move_made = False
if left in remaining:
current.insert(0, left)
remaining.remove(left)
move_made = True
if right in remaining:
current.append(right)
remaining.remove(right)
move_made = True
if move_made:
classes[seed_id] = current
else:
growing = False
tss = _fit(self.y, classes)
tss_all[solution] = tss
if tss < best_tss:
best_solution = classes
best_it = solution
best_tss = tss
solution += 1
classes = best_solution
self.best_it = best_it
self.tss = best_tss
self.a2c = a2c = {}
self.tss_all = tss_all
for r, cl in enumerate(classes):
for a in cl:
a2c[a] = r
swapping = True
while swapping:
rseeds = np.random.permutation(list(range(k))).tolist()
total_moves = 0
while rseeds:
id = rseeds.pop()
growing = True
total_moves = 0
while growing:
target = classes[id]
left = target[0] - 1
right = target[-1] + 1
n_moves = 0
if left in a2c:
left_class = classes[a2c[left]]
if len(left_class) > 1:
a = left_class[-1]
if self._swap(left_class, target, a):
target.insert(0, a)
left_class.remove(a)
a2c[a] = id
n_moves += 1
if right in a2c:
right_class = classes[a2c[right]]
if len(right_class) > 1:
a = right_class[0]
if self._swap(right_class, target, a):
target.append(a)
right_class.remove(a)
n_moves += 1
a2c[a] = id
if not n_moves:
growing = False
total_moves += n_moves
if not total_moves:
swapping = False
xs = self.y.copy()
xs.sort()
self.bins = np.array([xs[cl][-1] for cl in classes])
def _ss(self, class_def):
"""calculates sum of squares for a class"""
yc = self.y[class_def]
css = yc - yc.mean()
css *= css
return sum(css)
def _swap(self, class1, class2, a):
"""evaluate cost of moving a from class1 to class2"""
ss1 = self._ss(class1)
ss2 = self._ss(class2)
tss1 = ss1 + ss2
class1c = copy.copy(class1)
class2c = copy.copy(class2)
class1c.remove(a)
class2c.append(a)
ss1 = self._ss(class1c)
ss2 = self._ss(class2c)
tss2 = ss1 + ss2
if tss1 < tss2:
return False
else:
return True
    def update(self, y=None, inplace=False, **kwargs):
"""
Add data or change classification parameters.
Parameters
----------
y : array
(n,1) array of data to classify
inplace : bool
whether to conduct the update in place or to return a
copy estimated from the additional specifications.
Additional parameters provided in **kwargs are passed to the init
function of the class. For documentation, check the class constructor.
"""
kwargs.update({"initial": kwargs.pop("initial", self.initial)})
        if inplace:
            self._update(y, **kwargs)
        else:
            new = copy.deepcopy(self)
            new._update(y, **kwargs)
            return new
def _fit(y, classes):
"""Calculate the total sum of squares for a vector y classified into
classes
Parameters
----------
y : array
(n,1), variable to be classified
    classes : list
        each element is a list of the indices of the observations
        assigned to that class
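    Examples
    --------
    A tiny worked case: the class {1., 2., 3.} has mean 2 and squared
    deviations 1 + 0 + 1, the class {10.} contributes 0, so tss is 2:

    >>> import numpy as np
    >>> from mapclassify.classifiers import _fit
    >>> float(_fit(np.array([1.0, 2.0, 3.0, 10.0]), [[0, 1, 2], [3]]))
    2.0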
"""
tss = 0
for class_def in classes:
yc = y[class_def]
css = yc - yc.mean()
css *= css
tss += sum(css)
return tss
kmethods = {}
kmethods["Quantiles"] = Quantiles
kmethods["FisherJenks"] = FisherJenks
kmethods["NaturalBreaks"] = NaturalBreaks
kmethods["MaximumBreaks"] = MaximumBreaks
[docs]def gadf(y, method="Quantiles", maxk=15, pct=0.8):
r"""
Evaluate the Goodness of Absolute Deviation Fit of a Classifier
Finds the minimum value of k for which gadf>pct
Parameters
----------
y : array
(n, 1) values to be classified
    method : {'Quantiles', 'FisherJenks', 'MaximumBreaks', 'NaturalBreaks'}
        classification method to use
maxk : int
maximum value of k to evaluate
pct : float
The percentage of GADF to exceed
Returns
-------
k : int
number of classes
cl : object
instance of the classifier at k
gadf : float
goodness of absolute deviation fit
Examples
--------
>>> import mapclassify as mc
>>> cal = mc.load_example()
>>> qgadf = mc.classifiers.gadf(cal)
>>> qgadf[0]
15
>>> qgadf[-1]
0.3740257590909283
Quantiles fail to exceed 0.80 before 15 classes. If we lower the bar to
0.2 we see quintiles as a result
>>> qgadf2 = mc.classifiers.gadf(cal, pct = 0.2)
>>> qgadf2[0]
5
>>> qgadf2[-1]
0.21710231966462412
>>>
Notes
-----
The GADF is defined as:
.. math::
GADF = 1 - \sum_c \sum_{i \in c}
|y_i - y_{c,med}| / \sum_i |y_i - y_{med}|
where :math:`y_{med}` is the global median and :math:`y_{c,med}` is
the median for class :math:`c`.
See Also
--------
KClassifiers
"""
y = np.array(y)
adam = (np.abs(y - np.median(y))).sum()
for k in range(2, maxk + 1):
cl = kmethods[method](y, k)
gadf = 1 - cl.adcm / adam
if gadf > pct:
break
return (k, cl, gadf)
class KClassifiers(object):
"""
    Evaluate all k-classifiers and pick the optimal one based on k and GADF
Parameters
----------
y : array
(n,1), values to be classified
pct : float
The percentage of GADF to exceed
Attributes
----------
best : object
instance of the optimal MapClassifier
results : dictionary
keys are classifier names, values are the MapClassifier
        instances with the best pct for each classifier
Examples
--------
>>> import mapclassify as mc
>>> cal = mc.load_example()
>>> ks = mc.classifiers.KClassifiers(cal)
>>> ks.best.name
'FisherJenks'
>>> ks.best.k
4
>>> ks.best.gadf
0.8481032719908105
Notes
-----
This can be used to suggest a classification scheme.
See Also
--------
gadf
"""
    def __init__(self, y, pct=0.8):
results = {}
best = gadf(y, "FisherJenks", maxk=len(y) - 1, pct=pct)
        k0 = best[0]  # gadf returns (k, classifier, gadf)
        pct0 = best[-1]
keys = list(kmethods.keys())
keys.remove("FisherJenks")
results["FisherJenks"] = best
for method in keys:
results[method] = gadf(y, method, maxk=len(y) - 1, pct=pct)
k1 = results[method][0]
pct1 = results[method][-1]
if (k1 < k0) or (k1 == k0 and pct0 < pct1):
best = results[method]
k0 = k1
pct0 = pct1
self.results = results
self.best = best[1]