-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLatticeCover.py
404 lines (364 loc) · 20.5 KB
/
LatticeCover.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import check_array
from sklearn.utils.validation import check_is_fitted
from gtda.utils.validation import validate_params
from gtda.utils.intervals import Interval
import warnings
from gtda.mapper.utils._cover import _remove_empty_and_duplicate_intervals
def cartesian_product(arrays):
la = len(arrays)
dtype = np.result_type(*arrays)
arr = np.empty([len(a) for a in arrays] + [la])
for i, a in enumerate(np.ix_(*arrays)):
arr[...,i] = a
return arr.reshape(-1, la)
class LatticeCover(BaseEstimator, TransformerMixin):
''' Cover data with balls centered at points from a chosen root lattice.
In :meth: `fit`, given a training array `X` representing a collection of
real numbers, a cover of the plane by balls or hypercubes centered at a
root lattice is constructed based on the distribution of values in `X`.
In :meth: `transform`, the cover is applied to a new array `X'` to yield
a cover of `X'`.
Two kinds of cover are currently available: "triangular" and "cubical". The cubical cover is constructed from hypercubes whose centers lie on a scaled integer root lattice :math: `\mathbb{Z}^n`. This will yield hypercubes of equal length in all dimensions. The triangular cover is generated from the dual root lattice :math: `A_n^*`, also known as the *triangular lattice* in dimension 2 and the *body-centered cubic lattice* in dimension 3. The centers of all spheres from both types of covers are constructed from scaled generating matrices :math: `M`, and the number of points in each dimension of the lattice is determined by :math: `\max_{1\leq i \leq N+1}\frac{1}{K} |M_i - m_i|`, where :math:`K` is the parameter `n_intervals`.
Parameters
----------
n_intervals : int, optional, default: 10.
The number of intervals to construct the lattice.
overlap_frac : float, optional, default 0.3.
The overlap fraction between cover sets.
kind : ``'triangular'`` | ``'cubical'``, default ``'triangular'``.
Determines the underlying root lattice for center points, with ``'triangular'`` constructing :math: `A_n^*` and ``'cubical'`` constructing :math: `\mathbb{Z}^n`.
special : bool, default False. If `True`, `kind = 'triangular'`, and the filtered data is 2 or 3 dimensional, then the cover centers will be constructed with the special generating matrices
$$M =
\begin{bmatrix}
1 & 0 \\
\frac{-1}{2} & \frac{\sqrt{3}}{2} \\
\end{bmatrix},\qquad
M =
\begin{bmatrix}
2 & 0 & 0 \\
0 & 2 & 0 \\
1 & 1 & 1 \\
\end{bmatrix}
$$
Attributes
-----------
dim : int
Number of features in the data set computed in :meth: `fit`.
left_limits_ : ndarray of shape (dim,)
Minimum limits of the bounding box in all dimensions computed in :meth: `fit`.
right_limits_ : ndarray of shape (dim,)
ball_centers_ : ndarray of shape at most ( large product see notes , dim + 1)
For ``'triangular'`` cover only.
Centers of every ball in the cover. The number of balls are dependent on `n_intervals`, `left_limits_`, and `right_limits_`.
ball_radius_ : float
The radius for each ball in the cover. For ``'triangular'`` cover only.
left_interval_limits_ : ndarray of shape (c, dim)
For ``'cubical'`` cover only.
Left limits for each hypercube in the cover. The number of hypercubes
are dependent on `n_intervals`, `left_limits_` and `right_limits_`.
right_interval_limits_ : ndarray of shape (c, dim)
For ``'cubical'`` cover only. The number of hypercubes
are dependent on `n_intervals`, `left_limits_` and `right_limits_`.
'''
# Parameters
_hyperparameters = {
'kind': {'type': str, 'in': ['triangular', 'cubical']},
'n_intervals': {'type': int, 'in': Interval(1, np.inf, closed='left')},
'overlap_frac': {'type': float, 'in': Interval(0, 1, closed = 'neither')},
'special': {'type': bool}
}
def __init__(self, n_intervals = 10, overlap_frac = 0.3, kind = 'triangular', special = False):
self.n_intervals = n_intervals
self.overlap_frac = overlap_frac
self.special = special
self.kind = kind
def _fit_triangular(self, X):
self.dim = self._check_dim(X)
if self.special is False:
X = self._hyperplane_embed(X)
self.left_limits_, self.right_limits_ = self._find_bounding_box(X,
self.dim,
self.n_intervals,
self.special,
self.kind
)
self.ball_centers_, self.ball_radius_ = self._triangular_lattice_cover_limits(self,
self.left_limits_,
self.right_limits_,
self.dim,
self.n_intervals,
self.overlap_frac,
self.special
)
return self
def _fit_cubical(self, X):
self.dim = self._check_dim(X)
self.left_limits_, self.right_limits_ = self._find_bounding_box(X,
self.dim,
self.n_intervals,
self.special,
self.kind
)
self.interval_centers_, self.left_interval_limits_, self.right_interval_limits_ = self._cubical_lattice_cover_limits(self,
self.left_limits_,
self.right_limits_,
self.dim,
self.n_intervals,
self.overlap_frac,
self.special
)
return self
def fit(self, X, y=None):
"""Compute all cover interval limits according to `X`
and store them in :attr:`left_limits_`, :attr:`right_limits_`.
For the ``'triangular'`` cover we additionally compute the
ball centers and radii and store them in
:attr:`ball_centers_`, and :attr:`ball_radius_`.
For the ``'cubical'`` cover we additionally compute each hypercube
interval limits and store them in :attr:`left_interval_limits_` and
:attr:`right_interval_limits_`.
Then return the estimator.
This method is here to implement the usual scikit-learn API and hence
work in pipelines.
Parameters
----------
X : ndarray of shape (n_samples,) or (n_samples,1)
Input data.
y : None
There is no need for a target in a transformer, yet the pipeline
API requires this parameter.
Returns
----------
self : object
"""
X = check_array(X)
validate_params(self.get_params(), self._hyperparameters)
if self.overlap_frac <= 1e-8:
warmings.warn("`overlap_frac` is close to zero, "
"which might cause numerical issues and errors.",
RuntimeWarning)
is_triangular = self.kind == 'triangular'
fitter = self._fit_triangular if is_triangular else self._fit_cubical
return fitter(X)
def _triangular_transform_data(self, X):
data_bools = np.full((self.ball_centers_.shape[0],), False)
for i in range(X.shape[0]):
cover_check = np.linalg.norm(self.ball_centers_ - X[i], axis = 1) < self.ball_radius_
if np.any(cover_check):
data_bools = np.vstack([data_bools, cover_check])
else:
print('cover check failed at point')
print('point', i, X[i])
print('radius', self.ball_radius_)
print(np.linalg.norm(self.ball_centers_ - X[i], axis=1))
print(cover_check)
return data_bools[1:]
def _triangular_transform_centers(self, X):
cover_bools = np.full((X.shape[0],), False)
for i in range(self.ball_centers_.shape[0]):
cover_check = np.linalg.norm(X - self.ball_centers_[i], axis = 1) < self.ball_radius_
if np.any(cover_check):
cover_bools = np.vstack([cover_bools, cover_check])
data_bools = cover_bools[1:].T
return data_bools
def _triangular_transform(self, X):
if self.ball_centers_.shape[0] <= X.shape[0]:
data_bools = self._triangular_transform_data(X)
if X.shape[0] < self.ball_centers_.shape[0]:
data_bools = self._triangular_transform_centers(X)
return data_bools
def _cubical_transform_data(self, X):
data_bools = np.full((self.left_interval_limits_.shape[0],), False)
for i in range(X.shape[0]):
cover_check = np.all(np.logical_and(X[i] > self.left_interval_limits_,
X[i] < self.right_interval_limits_), axis = 1)
if np.any(cover_check):
data_bools = np.vstack([data_bools, cover_check])
else:
print('cover check failed at point')
print('point', i, X[i])
print('left limits', self.left_interval_limits_)
print('right limits', self.right_interval_limits_)
print(np.logical_and(X[i] > self.left_interval_limits_,
X[i] < self.right_interval_limits_))
print(cover_check)
return data_bools[1:]
def _cubical_transform(self, X):
return self._cubical_transform_data(X)
def transform(self, X, y=None):
"""Compute a cover of `X` according to the cover of the space
computed in :meth:`fit`, and return it as a (n_samples, num_coversets)
boolean array. Each column indicates the location of the entries in
`X` corresponding to a common cover set.
Parameters
-----------
X : ndarray of shape (n_samples, dim)
Input data.
y : None
There is no need for a target in a transformer, yet the pipeline
API require this parameter.
Returns
--------
Xt : ndarray of shape (n_samples, num_coversets)
Encoding the cover of `X` as a boolean array. In general,
`num_coversets` is less than or equal to `n_intervals`**dim
as empty or duplicated cover sets are removed.
"""
check_is_fitted(self)
Xt = check_array(X)
is_triangular = self.kind == 'triangular'
Xt = self._triangular_transform(Xt) if is_triangular else self._cubical_transform(Xt)
Xt = _remove_empty_and_duplicate_intervals(Xt)
return Xt
def _fit_transform(self, X):
is_triangular = self.kind == 'triangular'
if is_triangular:
if self.special:
Xt = self._fit_triangular(X)._triangular_transform(X)
else:
Xt = self._fit_triangular(X)._triangular_transform(self._hyperplane_embed(X))
else:
Xt = self._fit_cubical(X)._cubical_transform(X)
return Xt
def fit_transform(self, X, y=None, **fit_params):
"""Fit the data, then transform it.
Parameters
----------
X : ndarray of shape (n_samples, dim)
Input data.
y : None
There is no need for a target in a transformer, yet the pipeline
API requires this parameter.
Returns
--------
Xt : ndarray of shape (n_samples, num_coversets)
Encoding the cover of `X` as a boolean array. In general,
`num_coversets` is less than or equal to `n_intervals`**dim
as empty or duplicated cover sets are removed.
"""
Xt = check_array(X)
validate_params(self.get_params(), self._hyperparameters)
Xt = self._fit_transform(Xt)
Xt = _remove_empty_and_duplicate_intervals(Xt)
return Xt
def _check_dim(self, X):
if X.shape[1] > 5:
warnings.warn("Using an incredibly high dimensional (dim {X.shape[1]}) can be dangerous; Kernel destroying, in fact. Proceed with Caution")
return X.shape[1]
def _hyperplane_embed(self, X):
'''Embeds our data X\sub R^{dim} \righthookarrow R^{dim+1}.
Used when ``kind = 'triangular'`` and ``special = False``.
'''
embed = -np.sum(X,axis=1).T
return np.c_[X, embed]
def _find_bounding_box(self, X, dim, n_intervals, special, kind):
"""Finds bounds for each coordinate over data set X.
Outputs a ndarray of shape (dim+1,2) if ``special = True``
and ``kind = 'triangular'`` or a ndarray of shape (dim,2)
if ``kind = 'cubical`` or (``kind = 'triangular'`` and
``special = True``).
"""
is_cubical = kind == 'cubical'
if special and (dim not in {2,3}):
raise ValueError(f'We only have special lattice representations in dimensions 2 and 3.')
coord_array = np.zeros((dim+1,2)) # Embed image into R^{dim+1}
for i in range(dim+1):
if (special or is_cubical) and i == dim:
coord_array[i] = [0,0]
else:
coord_array[i,0] = np.min(X[:,i]) # Minimum value in i-th coord
coord_array[i,1] = np.max(X[:,i]) # Maximum value in i-th coord
only_one_pt = all( _ == coord_array.ravel()[0] for _ in coord_array.ravel())
if only_one_pt and n_intervals > 1:
raise ValueError(
f"Only one unique filter value found, cannot fit"
f"{n_intervals} > 1 intervals.")
if special or is_cubical: # We have special representations for A* in dimensions 2 and 3.
return coord_array[:dim, 0], coord_array[:dim, 1]
else:
return coord_array[:,0], coord_array[:,1]
@staticmethod
def _get_generator_matrix(self, dim, special):
"""Create generating matrix M for ``kind = 'triangular'`` covers.
M is has two special representations chosen when ``special = True``
and `X` has dimension 2 or 3. In general, M will be (dim, dim + 1)
matrix.
"""
if dim == 2 and special:
basis_vectors = np.array([1,0,-1/2,np.sqrt(3)/2]).reshape((2,2))
if dim == 3 and special:
basis_vectors = np.array([2,0,0,0,2,0,1,1,1]).reshape((3,3))
if dim not in {2,3} or not special:
basis_vectors = np.zeros((dim, dim+1))
basis_vectors[dim-1, 0] = -dim/(dim+1)
basis_vectors[dim-1, dim] = 1/(dim+1)
for i in range(dim-1):
basis_vectors[i,0] = 1
basis_vectors[i,i+1] = -1
basis_vectors[dim-1,i+1] = 1/(dim+1)
generator_matrix = np.asmatrix(basis_vectors)
return generator_matrix
@staticmethod
def _triangular_lattice_cover_limits(self, left_limits, right_limits, dim, n_intervals, overlap_frac, special):
"""Contruct the scaled :math:`A_n^*` lattice and output sphere centers
(lattice points) and common radius for each ball.
"""
generating_matrix = self._get_generator_matrix(self, dim, special)
ldim = generating_matrix.shape[1]
assert left_limits.shape[0] == right_limits.shape[0] == ldim, f'{left_limits.shape[0]} != {right_limits.shape[0]} != {ldim}'
bound_vector = np.abs(right_limits - left_limits)
scale = np.max([np.average(bound_vector/n_intervals), 1])
# Create bounds for lattice points
scaled_min_bound, scaled_max_bound = np.zeros(dim), np.zeros(dim)
if special and dim == 2:
cover_radius = 1 / np.sqrt(3)
scaled_min_bound[0] = np.floor((left_limits[0] + left_limits[1]/np.sqrt(3))/scale)
scaled_min_bound[1] = np.floor(2 * left_limits[1] / (scale *np.sqrt(3)))
scaled_max_bound[0] = np.ceil((right_limits[0] + (right_limits[1]/np.sqrt(3)))/ scale)
scaled_max_bound[1] = np.ceil(2 * right_limits[1] / (scale * np.sqrt(3)))
if special and dim == 3:
cover_radius = np.sqrt(5) / 2
scaled_min_bound[2] = left_limits[2] / scale
scaled_max_bound[2] = right_limits[2] / scale
for i in range(2):
scaled_min_bound[i] = (left_limits[i] - right_limits[2]) / (2 * scale)
scaled_max_bound[i] = (right_limits[i] - left_limits[2]) / (2 * scale)
if not special:
cover_radius = np.sqrt(dim * (dim + 2)/(12* (dim + 1)))
scaled_min_bound[:dim-1] = np.asarray([np.floor((left_limits[dim] - right_limits[j+1])/scale) for j in range(dim-1)])
scaled_min_bound[dim-1] = np.floor((ldim) * left_limits[dim] / scale)
scaled_max_bound[:dim-1] = np.asarray([np.ceil((right_limits[dim] - left_limits[j+1])/scale) for j in range(dim-1)])
scaled_max_bound[dim-1] = np.ceil((ldim) * right_limits[dim] / scale)
xi_coord_arrays = [np.arange(start = scaled_min_bound[k], stop = scaled_max_bound[k]+1, step = 1) for k in range(dim)]
xi_vectors = cartesian_product(xi_coord_arrays) # all possible integer vectors to generate lattice points
lattice_points = scale * xi_vectors @ generating_matrix
# remove unnecessary lattice points:
pt_list = np.asarray(lattice_points)
scaled_cover_radius = scale * cover_radius * (1 + overlap_frac) # radius for balls at each lattice point
mask = np.all(np.logical_and(pt_list > left_limits - scaled_cover_radius, pt_list < right_limits + scaled_cover_radius), axis = 1)
return pt_list[mask], scaled_cover_radius
@staticmethod
def _cubical_lattice_cover_limits(self, left_limits, right_limits, dim, n_intervals, overlap_frac, special):
"""Construct scaled integer lattice :math:`\mathbb{Z}^n` and return
uniform hypercube bounds for the cover sets.
"""
# we don't need generating matrix since it is the identity
# generating_matrix = np.ma.identity(dim)
cover_radius = np.sqrt(dim)/2
assert left_limits.shape[0] == right_limits.shape[0] == dim, f'dim = {dim}, left {left_limits.shape}, right {right_limits.shape}'
bound_vector = np.abs(right_limits - left_limits)
scale = np.max([np.average(bound_vector/n_intervals), 1])
# create bounds for lattice points
scaled_min_bound = np.floor(left_limits/scale)
scaled_max_bound = np.ceil(right_limits/scale)
xi_coord_arrays = [np.arange(start = scaled_min_bound[k], stop = scaled_max_bound[k]+1, step = 1) for k in range(dim)]
xi_vectors = cartesian_product(xi_coord_arrays) # all possible integer vectors to generate lattice points
lattice_points = scale * xi_vectors
pt_list = np.asarray(lattice_points)
mask = np.all(np.logical_and(pt_list > left_limits - 0.5 * scale / (1 - overlap_frac), pt_list < right_limits + 0.5 * scale / (1 - overlap_frac)), axis = 1)
left_interval_limits = pt_list[mask] - 0.5 * scale / (1 - overlap_frac)
right_interval_limits = pt_list[mask] + 0.5 * scale / (1 - overlap_frac)
assert left_interval_limits.shape == right_interval_limits.shape
return pt_list[mask], left_interval_limits, right_interval_limits