Skip to content

Commit d79e46a

Browse files
authored
Merge pull request #2357 from jerneju/sparse-impute
[FIX] Impute: sparse
2 parents 38f42d8 + cf584f5 commit d79e46a

5 files changed

Lines changed: 81 additions & 35 deletions

File tree

Orange/preprocess/impute.py

Lines changed: 28 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
import numpy
2-
from scipy.sparse import issparse
1+
import numpy as np
2+
import scipy.sparse as sp
33

44
import Orange.data
55
from Orange.statistics import distribution, basic_stats
@@ -26,11 +26,11 @@ def __init__(self, variable, value=0):
2626
self.value = value
2727

2828
def transform(self, c):
29-
if issparse(c):
30-
c.data = numpy.where(numpy.isnan(c.data), self.value, c.data)
29+
if sp.issparse(c):
30+
c.data = np.where(np.isnan(c.data), self.value, c.data)
3131
return c
3232
else:
33-
return numpy.where(numpy.isnan(c), self.value, c)
33+
return np.where(np.isnan(c), self.value, c)
3434

3535

3636
class BaseImputeMethod(Reprable):
@@ -83,7 +83,7 @@ class DropInstances(BaseImputeMethod):
8383

8484
def __call__(self, data, variable):
8585
index = data.domain.index(variable)
86-
return numpy.isnan(data[:, index]).reshape(-1)
86+
return np.isnan(data[:, index]).reshape(-1)
8787

8888

8989
class Average(BaseImputeMethod):
@@ -154,13 +154,13 @@ def __init__(self, variable, model):
154154

155155
def __call__(self, data):
156156
if isinstance(data, Orange.data.Instance):
157-
column = numpy.array([float(data[self.variable])])
157+
column = np.array([float(data[self.variable])])
158158
else:
159-
column = numpy.array(data.get_column_view(self.variable)[0],
159+
column = np.array(data.get_column_view(self.variable)[0],
160160
copy=True)
161161

162-
mask = numpy.isnan(column)
163-
if not numpy.any(mask):
162+
mask = np.isnan(column)
163+
if not np.any(mask):
164164
return column
165165

166166
if isinstance(data, Orange.data.Instance):
@@ -224,7 +224,9 @@ def domain_with_class_var(domain, class_var):
224224

225225
class IsDefined(Transformation):
226226
def transform(self, c):
227-
return ~numpy.isnan(c)
227+
if sp.issparse(c):
228+
c = c.toarray()
229+
return ~np.isnan(c)
228230

229231

230232
class AsValue(BaseImputeMethod):
@@ -243,7 +245,7 @@ def __call__(self, data, variable):
243245
base_value=variable.base_value,
244246
compute_value=Lookup(
245247
variable,
246-
numpy.arange(len(variable.values), dtype=int),
248+
np.arange(len(variable.values), dtype=int),
247249
unknown=len(variable.values))
248250
)
249251
return var
@@ -281,29 +283,32 @@ def __init__(self, variable, distribution):
281283
self.distribution = distribution
282284

283285
if variable.is_discrete:
284-
counts = numpy.array(distribution)
286+
counts = np.array(distribution)
285287
elif variable.is_continuous:
286-
counts = numpy.array(distribution)[1, :]
288+
counts = np.array(distribution)[1, :]
287289
else:
288290
raise TypeError("Only discrete and continuous "
289291
"variables are supported")
290-
csum = numpy.sum(counts)
292+
csum = np.sum(counts)
291293
if csum > 0:
292294
self.sample_prob = counts / csum
293295
else:
294-
self.sample_prob = numpy.ones_like(counts) / len(counts)
296+
self.sample_prob = np.ones_like(counts) / len(counts)
295297

296298
def transform(self, c):
297-
c = numpy.array(c, copy=True)
298-
nanindices = numpy.flatnonzero(numpy.isnan(c))
299+
if not sp.issparse(c):
300+
c = np.array(c, copy=True)
301+
else:
302+
c = c.toarray().ravel()
303+
nanindices = np.flatnonzero(np.isnan(c))
299304

300305
if self.variable.is_discrete:
301-
sample = numpy.random.choice(
306+
sample = np.random.choice(
302307
len(self.variable.values), size=len(nanindices),
303308
replace=True, p=self.sample_prob)
304309
else:
305-
sample = numpy.random.choice(
306-
numpy.asarray(self.distribution)[0, :], size=len(nanindices),
310+
sample = np.random.choice(
311+
np.asarray(self.distribution)[0, :], size=len(nanindices),
307312
replace=True, p=self.sample_prob)
308313

309314
c[nanindices] = sample
@@ -328,9 +333,9 @@ def __call__(self, data, variable):
328333
raise ValueError("'{}' has an unknown distribution"
329334
.format(variable))
330335

331-
if variable.is_discrete and numpy.sum(dist) == 0:
336+
if variable.is_discrete and np.sum(dist) == 0:
332337
dist += 1 / len(dist)
333-
elif variable.is_continuous and numpy.sum(dist[1, :]) == 0:
338+
elif variable.is_continuous and np.sum(dist[1, :]) == 0:
334339
dist[1, :] += 1 / dist.shape[1]
335340
return variable.copy(
336341
compute_value=ReplaceUnknownsRandom(variable, dist))

Orange/statistics/util.py

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -216,9 +216,9 @@ def weighted_mean():
216216
non_zero = np.bincount(X.nonzero()[1], minlength=X.shape[1])
217217
X = X.tocsc()
218218
return np.column_stack((
219-
X.min(axis=0).toarray().ravel(),
220-
X.max(axis=0).toarray().ravel(),
221-
np.asarray(X.mean(axis=0)).ravel() if not weighted else weighted_mean(),
219+
nanmin(X, axis=0),
220+
nanmax(X, axis=0),
221+
nanmean(X, axis=0) if not weighted else weighted_mean(),
222222
np.zeros(X.shape[1]), # variance not supported
223223
X.shape[0] - non_zero,
224224
non_zero))
@@ -280,15 +280,22 @@ def mean(x):
280280
n_values = np.prod(x.shape)
281281
return np.sum(x.data) / n_values
282282

283-
284-
def nanmean(x):
283+
def nanmean(x, axis=None):
285284
""" Equivalent of np.nanmean that supports sparse or dense matrices. """
286-
if not sp.issparse(x):
287-
return np.nanmean(x)
288-
289-
n_values = np.prod(x.shape) - np.sum(np.isnan(x.data))
290-
return np.nansum(x.data) / n_values
285+
def nanmean_sparse(x):
286+
n_values = np.prod(x.shape) - np.sum(np.isnan(x.data))
287+
return np.nansum(x.data) / n_values
291288

289+
if not sp.issparse(x):
290+
return np.nanmean(x, axis=axis)
291+
if axis is None:
292+
return nanmean_sparse(x)
293+
if axis in [0, 1]:
294+
arr = x if axis == 1 else x.T
295+
arr = arr.tocsr()
296+
return np.array([nanmean_sparse(row) for row in arr])
297+
else:
298+
raise NotImplementedError
292299

293300
def unique(x, return_counts=False):
294301
""" Equivalent of np.unique that supports sparse or dense matrices. """

Orange/tests/test_impute.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ def test_str(self):
158158

159159

160160
class TestAsValue(unittest.TestCase):
161-
def test_replacement(self):
161+
def _create_table(self):
162162
nan = np.nan
163163
X = [
164164
[1.0, nan, 0.0],
@@ -170,7 +170,11 @@ def test_replacement(self):
170170
data.ContinuousVariable("B"),
171171
data.ContinuousVariable("C"))
172172
)
173-
table = data.Table.from_numpy(domain, np.array(X))
173+
return data.Table.from_numpy(domain, np.array(X))
174+
175+
def test_replacement(self):
176+
table = self._create_table()
177+
domain = table.domain
174178

175179
v1 = impute.AsValue()(table, domain[0])
176180
self.assertTrue(np.all(np.isfinite(v1.compute_value(table))))
@@ -200,6 +204,20 @@ def test_replacement(self):
200204
[3, 1.0, 0, 1.5, 0]]
201205
)
202206

207+
def test_sparse(self):
208+
"""
209+
Impute: As a distinct value test. Sparse support.
210+
GH-2357
211+
"""
212+
table = self._create_table()
213+
domain = table.domain
214+
table.X = sp.csr_matrix(table.X)
215+
216+
v1, v2 = impute.AsValue()(table, domain[1])
217+
self.assertTrue(np.all(np.isfinite(v2.compute_value(table))))
218+
self.assertEqual([v2.str_val(v) for v in v2.compute_value(table)],
219+
["undef", "def", "undef"])
220+
203221

204222
class TestModel(unittest.TestCase):
205223
def test_replacement(self):

Orange/tests/test_util.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,9 @@
77

88
from Orange.util import export_globals, flatten, deprecated, try_, deepgetattr, \
99
OrangeDeprecationWarning
10+
from Orange.data import Table
1011
from Orange.data.util import vstack, hstack
12+
from Orange.statistics.util import stats
1113

1214
SOMETHING = 0xf00babe
1315

@@ -106,3 +108,12 @@ def assertCorrectArrayType(self, array, shape, sparsity):
106108
def test_raise_deprecations(self):
107109
with self.assertRaises(OrangeDeprecationWarning):
108110
warnings.warn('foo', OrangeDeprecationWarning)
111+
112+
def test_stats_sparse(self):
113+
"""
114+
Stats should not fail when trying to calculate mean on sparse data.
115+
GH-2357
116+
"""
117+
data = Table("iris")
118+
sparse_x = sp.csr_matrix(data.X)
119+
self.assertTrue(stats(data.X).all() == stats(sparse_x).all())

Orange/widgets/data/owimpute.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ class OWImpute(OWWidget):
6565

6666
class Error(OWWidget.Error):
6767
imputation_failed = Msg("Imputation failed for '{}'")
68+
model_based_imputer_sparse = Msg("Model based imputer does not work for sparse data")
6869

6970
DEFAULT_LEARNER = SimpleTreeLearner()
7071
METHODS = [AsDefault(), impute.DoNotImpute(), impute.Average(),
@@ -258,9 +259,13 @@ def commit(self):
258259

259260
self.warning()
260261
self.Error.imputation_failed.clear()
262+
self.Error.model_based_imputer_sparse.clear()
261263
with self.progressBar(len(self.varmodel)) as progress:
262264
for i, var in enumerate(self.varmodel):
263265
method = self.variable_methods.get(i, self.default_method)
266+
if isinstance(method, impute.Model) and data.is_sparse():
267+
self.Error.model_based_imputer_sparse()
268+
continue
264269

265270
try:
266271
if not method.supports_variable(var):

0 commit comments

Comments
 (0)