Skip to content

Commit 6d9a3ca

Browse files
owtsne: Fix crash on data containing NaNs, add sparse support
1 parent fc33050 commit 6d9a3ca

File tree

3 files changed

+137
-26
lines changed

3 files changed

+137
-26
lines changed

Orange/projection/manifold.py

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
from itertools import chain
77

88
import numpy as np
9-
import scipy.sparse as sp
109
from scipy.linalg import eigh as lapack_eigh
1110
from scipy.sparse.linalg import eigsh as arpack_eigh
1211
import sklearn.manifold as skl_manifold
@@ -233,11 +232,6 @@ def proj_variable(i):
233232
metas=table.domain.metas)
234233

235234
def transform(self, X: np.ndarray, learning_rate=1, **kwargs) -> openTSNE.PartialTSNEEmbedding:
236-
if sp.issparse(X):
237-
raise TypeError(
238-
"A sparse matrix was passed, but dense data is required. Use "
239-
"X.toarray() to convert to a dense numpy array."
240-
)
241235
if isinstance(self.embedding_.affinities, openTSNE.affinity.Multiscale):
242236
perplexity = kwargs.pop("perplexity", False)
243237
if perplexity:
@@ -415,12 +409,6 @@ def __init__(self, n_components=2, perplexity=30, learning_rate="auto",
415409
self.random_state = random_state
416410

417411
def compute_affinities(self, X):
418-
# Sparse data are not supported
419-
if sp.issparse(X):
420-
raise TypeError(
421-
"A sparse matrix was passed, but dense data is required. Use "
422-
"X.toarray() to convert to a dense numpy array."
423-
)
424412

425413
# Build up the affinity matrix, using multiscale if needed
426414
if self.multiscale:

Orange/widgets/unsupervised/owtsne.py

Lines changed: 31 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,10 @@ def error(msg):
9898
"distance matrix is provided"
9999
)
100100

101+
if self.data is not None and self.data.is_sparse():
102+
if self.normalize:
103+
error("Data normalization is not supported for sparse data")
104+
101105
return self
102106

103107

@@ -445,6 +449,23 @@ def __init__(self):
445449
self.tsne_embedding = None # type: Optional[manifold.TSNEModel]
446450
self.iterations_done = 0 # type: int
447451

452+
@property
453+
def normalize_(self):
454+
should_normalize = self.normalize
455+
if self.distance_matrix is not None:
456+
should_normalize = False
457+
if self.data is not None:
458+
if self.data.is_sparse():
459+
should_normalize = False
460+
return should_normalize
461+
462+
@property
463+
def use_pca_preprocessing_(self):
464+
should_use_pca_preprocessing = self.use_pca_preprocessing
465+
if self.distance_matrix is not None:
466+
should_use_pca_preprocessing = False
467+
return should_use_pca_preprocessing
468+
448469
@property
449470
def effective_data(self):
450471
return self.data.transform(Domain(self.effective_variables))
@@ -457,7 +478,7 @@ def _add_controls_start_box(self):
457478
self.preprocessing_box = gui.vBox(self.controlArea, box="Preprocessing")
458479
self.normalize_cbx = gui.checkBox(
459480
self.preprocessing_box, self, "normalize", "Normalize data",
460-
callback=self._invalidate_normalized_data, stateWhenDisabled=False,
481+
callback=self._normalize_data_changed, stateWhenDisabled=False,
461482
)
462483
self.pca_preprocessing_cbx = gui.checkBox(
463484
self.preprocessing_box, self, "use_pca_preprocessing", "Apply PCA preprocessing",
@@ -519,10 +540,10 @@ def _add_controls_start_box(self):
519540
# GUI control callbacks
520541
def _normalize_data_changed(self):
521542
# We only care about the normalization checkbox if there is no distance
522-
# matrix provided. This is not user-settable anyway, but is triggered
523-
# when we programmatically enable/disable the checkbox in
524-
# `enable_controls`
525-
if self.distance_matrix is None:
543+
# matrix provided and if the data are not sparse. This is not user-
544+
# settable anyway, but is triggered when we programmatically
545+
# enable/disable the checkbox in `enable_controls`
546+
if self.distance_matrix is None and not self.data.is_sparse():
526547
self._invalidate_normalized_data()
527548

528549
def _pca_preprocessing_changed(self):
@@ -856,7 +877,7 @@ def enable_controls(self):
856877
)
857878

858879
# Disable slider parent, because we want to disable the labels too
859-
self.pca_component_slider.parent().setEnabled(self.use_pca_preprocessing)
880+
self.pca_component_slider.parent().setEnabled(self.use_pca_preprocessing_)
860881

861882
# Disable the perplexity spin box if multiscale is turned on
862883
self.perplexity_spin.setDisabled(self.multiscale)
@@ -904,10 +925,10 @@ def run(self):
904925
# Preprocessed data
905926
preprocessed_data=self.preprocessed_data,
906927
# Normalization
907-
normalize=self.normalize,
928+
normalize=self.normalize_,
908929
normalized_data=self.normalized_data,
909930
# PCA preprocessing
910-
use_pca_preprocessing=self.use_pca_preprocessing,
931+
use_pca_preprocessing=self.use_pca_preprocessing_,
911932
pca_components=self.pca_components,
912933
pca_projection=self.pca_projection,
913934
# t-SNE parameters
@@ -931,14 +952,14 @@ def __ensure_task_same_for_preprocessing(self, task: Task):
931952
len(task.preprocessed_data) == len(self.data)
932953

933954
def __ensure_task_same_for_normalization(self, task: Task):
934-
assert task.normalize == self.normalize
955+
assert task.normalize == self.normalize_
935956
if task.normalize and task.distance_metric != "precomputed":
936957
assert task.data is self.data
937958
assert isinstance(task.normalized_data, Table) and \
938959
len(task.normalized_data) == len(self.data)
939960

940961
def __ensure_task_same_for_pca(self, task: Task):
941-
assert task.use_pca_preprocessing == self.use_pca_preprocessing
962+
assert task.use_pca_preprocessing == self.use_pca_preprocessing_
942963
if task.use_pca_preprocessing and task.distance_metric != "precomputed":
943964
assert task.data is self.data
944965
assert task.pca_components == self.pca_components

Orange/widgets/unsupervised/tests/test_owtsne.py

Lines changed: 106 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -350,14 +350,23 @@ def test_invalidation_flow(self):
350350
# set global structure "on" (after the embedding is computed)
351351
w.controls.multiscale.setChecked(False)
352352
self.send_signal(w.Inputs.data, self.data)
353+
354+
# By default, t-SNE is smart and disables PCA preprocessing if the
355+
# number of features is too low. Since we are testing with the iris
356+
# data set, we want to force t-SNE to use PCA preprocessing.
357+
w.controls.use_pca_preprocessing.setChecked(True)
358+
self.widget.run_button.click()
359+
353360
self.wait_until_finished()
354361
self.assertFalse(self.widget.Information.modified.is_shown())
355362
# All the embedding components should be computed
363+
self.assertIsNotNone(w.preprocessed_data)
356364
self.assertIsNotNone(w.normalized_data)
357365
self.assertIsNotNone(w.pca_projection)
358366
self.assertIsNotNone(w.affinities)
359367
self.assertIsNotNone(w.tsne_embedding)
360368
# All the invalidation flags should be set to false
369+
self.assertFalse(w._invalidated.preprocessed_data)
361370
self.assertFalse(w._invalidated.normalized_data)
362371
self.assertFalse(w._invalidated.pca_projection)
363372
self.assertFalse(w._invalidated.affinities)
@@ -368,13 +377,15 @@ def test_invalidation_flow(self):
368377
self.assertTrue(self.widget.Information.modified.is_shown())
369378
# Setting `multiscale` to true should set the invalidate flags for
370379
# the affinities and embedding, but not the pca_projection
380+
self.assertFalse(w._invalidated.preprocessed_data)
371381
self.assertFalse(w._invalidated.normalized_data)
372382
self.assertFalse(w._invalidated.pca_projection)
373383
self.assertTrue(w._invalidated.affinities)
374384
self.assertTrue(w._invalidated.tsne_embedding)
375385

376386
# The flags should now be set, but the embedding should still be
377387
# available when selecting a subset of data and such
388+
self.assertIsNotNone(w.preprocessed_data)
378389
self.assertIsNotNone(w.normalized_data)
379390
self.assertIsNotNone(w.pca_projection)
380391
self.assertIsNotNone(w.affinities)
@@ -472,6 +483,9 @@ def test_distance_matrix_not_symmetric(self):
472483
self.send_signal(w.Inputs.distances, DistMatrix([[1, 2, 3], [4, 5, 6]]))
473484
self.assertTrue(w.Error.distance_matrix_not_symmetric.is_shown())
474485

486+
self.send_signal(w.Inputs.distances, DistMatrix([[1, 2, 3], [4, 5, 6], [7, 8, 9]]))
487+
self.assertTrue(w.Error.distance_matrix_not_symmetric.is_shown())
488+
475489
self.send_signal(w.Inputs.distances, None)
476490
self.assertFalse(w.Error.distance_matrix_not_symmetric.is_shown())
477491

@@ -813,6 +827,64 @@ def test_controls_ignored_by_distance_matrix_retain_values_on_table_signal(self)
813827
self.assertTrue(w.perplexity_spin.isEnabled())
814828
self.assertEqual(w.perplexity_spin.value(), 42)
815829

830+
def test_controls_are_properly_disabled_with_sparse_matrix(self):
831+
w = self.widget
832+
833+
# Normalizing a sparse matrix is disabled, since this would require
834+
# centering
835+
disabled_fields = ["normalize"]
836+
# PCA preprocessing and supported distance metrics are enabled for sparse
837+
# matrices
838+
enabled_fields = [
839+
"use_pca_preprocessing", "distance_metric_idx", "initialization_method_idx"
840+
]
841+
842+
self.send_signal(w.Inputs.data, self.iris.to_sparse())
843+
self.wait_until_finished()
844+
845+
for field in disabled_fields:
846+
self.assertFalse(getattr(w.controls, field).isEnabled())
847+
for field in enabled_fields:
848+
self.assertTrue(getattr(w.controls, field).isEnabled())
849+
850+
# Send dense table, should enable disabled fields
851+
self.send_signal(w.Inputs.data, self.iris)
852+
self.wait_until_finished()
853+
854+
for field in disabled_fields:
855+
self.assertTrue(getattr(w.controls, field).isEnabled())
856+
for field in enabled_fields:
857+
self.assertTrue(getattr(w.controls, field).isEnabled())
858+
859+
def test_data_containing_nans(self):
860+
x = np.random.normal(0, 1, size=(150, 50))
861+
# Randomly sprinkle a few NaNs into the matrix
862+
num_nans = 20
863+
x[np.random.randint(0, 150, num_nans), np.random.randint(0, 50, num_nans)] = np.nan
864+
865+
nan_data = Table.from_numpy(Domain.from_numpy(x), x)
866+
867+
w = self.widget
868+
869+
self.send_signal(w.Inputs.data, nan_data)
870+
self.assertTrue(w.controls.normalize.isChecked())
871+
self.assertTrue(w.controls.use_pca_preprocessing.isChecked())
872+
self.widget.run_button.click(), self.wait_until_finished()
873+
874+
# Disable only normalization
875+
w.controls.normalize.setChecked(False)
876+
self.widget.run_button.click(), self.wait_until_finished()
877+
878+
# Disable only PCA preprocessing
879+
w.controls.normalize.setChecked(True)
880+
w.controls.use_pca_preprocessing.setChecked(False)
881+
self.widget.run_button.click(), self.wait_until_finished()
882+
883+
# Disable both normalization and PCA preprocessing
884+
w.controls.normalize.setChecked(False)
885+
w.controls.use_pca_preprocessing.setChecked(False)
886+
self.widget.run_button.click(), self.wait_until_finished()
887+
816888

817889
class TestTSNERunner(unittest.TestCase):
818890
@classmethod
@@ -834,8 +906,9 @@ def test_run_with_normalization_and_pca_preprocessing(self):
834906
)
835907
task = TSNERunner.run(task, state)
836908

837-
self.assertEqual(len(state.set_status.mock_calls), 5)
909+
self.assertEqual(len(state.set_status.mock_calls), 6)
838910
state.set_status.assert_has_calls([
911+
call("Preprocessing data..."),
839912
call("Normalizing data..."),
840913
call("Computing PCA..."),
841914
call("Finding nearest neighbors..."),
@@ -862,8 +935,9 @@ def test_run_with_normalization(self):
862935
)
863936
task = TSNERunner.run(task, state)
864937

865-
self.assertEqual(len(state.set_status.mock_calls), 4)
938+
self.assertEqual(len(state.set_status.mock_calls), 5)
866939
state.set_status.assert_has_calls([
940+
call("Preprocessing data..."),
867941
call("Normalizing data..."),
868942
call("Finding nearest neighbors..."),
869943
call("Preparing initialization..."),
@@ -890,8 +964,9 @@ def test_run_with_pca_preprocessing(self):
890964
)
891965
task = TSNERunner.run(task, state)
892966

893-
self.assertEqual(len(state.set_status.mock_calls), 4)
967+
self.assertEqual(len(state.set_status.mock_calls), 5)
894968
state.set_status.assert_has_calls([
969+
call("Preprocessing data..."),
895970
call("Computing PCA..."),
896971
call("Finding nearest neighbors..."),
897972
call("Preparing initialization..."),
@@ -949,7 +1024,6 @@ def test_run_with_distance_matrix(self):
9491024
task = Task(
9501025
normalize=False,
9511026
use_pca_preprocessing=False,
952-
# data=self.data,
9531027
distance_matrix=self.distances,
9541028
perplexity=30,
9551029
initialization_method="spectral",
@@ -1064,6 +1138,34 @@ def test_run_with_distance_matrix_ignores_preprocessing(self):
10641138
self.assertIsInstance(task.tsne, TSNE)
10651139
self.assertIsInstance(task.tsne_embedding, TSNEModel)
10661140

1141+
def test_run_with_sparse_matrix_ignores_normalization(self):
1142+
state = Mock()
1143+
state.is_interruption_requested = Mock(return_value=False)
1144+
1145+
task = Task(
1146+
normalize=False,
1147+
use_pca_preprocessing=True,
1148+
data=self.data.to_sparse(),
1149+
perplexity=30,
1150+
initialization_method="spectral",
1151+
distance_metric="cosine",
1152+
)
1153+
task = TSNERunner.run(task, state)
1154+
self.assertEqual(len(state.set_status.mock_calls), 5)
1155+
state.set_status.assert_has_calls([
1156+
call("Preprocessing data..."),
1157+
call("Computing PCA..."),
1158+
call("Finding nearest neighbors..."),
1159+
call("Preparing initialization..."),
1160+
call("Running optimization..."),
1161+
])
1162+
1163+
self.assertIsNone(task.normalized_data)
1164+
self.assertIsInstance(task.pca_projection, Table)
1165+
self.assertIsInstance(task.initialization, np.ndarray)
1166+
self.assertIsInstance(task.tsne, TSNE)
1167+
self.assertIsInstance(task.tsne_embedding, TSNEModel)
1168+
10671169

10681170
if __name__ == "__main__":
10691171
unittest.main()

0 commit comments

Comments
 (0)