Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 69 additions & 10 deletions Orange/classification/naive_bayes.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import numpy as np
import scipy.sparse as sp

from Orange.classification import Learner, Model
from Orange.data import Instance, Storage
from Orange.data import Instance, Storage, Table
from Orange.statistics import contingency
from Orange.preprocess import Discretize, RemoveNaNColumns

Expand Down Expand Up @@ -48,22 +49,80 @@ def __init__(self, log_cont_prob, class_prob, domain):

def predict_storage(self, data):
    """Predict classes and class probabilities for `data`.

    A single `Instance` is wrapped into a one-row `Table`. Data whose type
    is exactly `Table` is delegated to `predict` on its `X` matrix. Any
    other storage (including `Table` subclasses) is handled by the slower
    per-instance loop below.

    Returns:
        tuple: `(values, probs)`, where `probs` holds one row of normalized
        class probabilities per instance and `values` is the index of the
        most probable class in each row.
    """
    if isinstance(data, Instance):
        data = Table(np.atleast_2d(data.x))
    # Exact type check on purpose: subclasses must fall through to the
    # generic per-instance procedure.
    if type(data) is Table:  # pylint: disable=unidiomatic-typecheck
        return self.predict(data.X)

    if not len(data) or not len(data[0]):
        # No rows or no attributes: only the class priors are available.
        probs = np.tile(self.class_prob, (len(data), 1))
    else:
        isnan = np.isnan
        zeros = np.zeros_like(self.class_prob)
        probs = np.atleast_2d(np.exp(
            np.log(self.class_prob) +
            np.array([
                # An instance with all values missing adds nothing to the
                # prior; otherwise missing attributes are skipped.
                zeros if isnan(ins.x).all() else
                sum(attr_prob[:, int(attr_val)]
                    for attr_val, attr_prob in zip(ins, self.log_cont_prob)
                    if not isnan(attr_val))
                for ins in data])))
    probs /= probs.sum(axis=1)[:, None]
    values = probs.argmax(axis=1)
    return values, probs

def predict(self, X):
    """Predict classes and class probabilities for a data matrix.

    Picks the log-probability routine by input: priors only when the model
    has no per-attribute tables, the sparse routine when `X` is a scipy
    sparse matrix, and the dense routine otherwise.

    Returns:
        tuple: `(values, probs)`, where `probs` has one row per row of `X`
        with probabilities normalized to sum to 1, and `values` is the
        index of the largest probability in each row.
    """
    if not self.log_cont_prob:
        log_probs = self._priors(X)
    elif sp.issparse(X):
        log_probs = self._sparse_probs(X)
    else:
        log_probs = self._dense_probs(X)
    probs = np.exp(log_probs)
    # Normalize each row so the class probabilities sum to 1.
    probs /= probs.sum(axis=1)[:, None]
    values = probs.argmax(axis=1)
    return values, probs

def _priors(self, data):
    """Return the log class priors repeated once per row of `data`.

    The result has `data.shape[0]` rows, each equal to
    `np.log(self.class_prob)`.
    """
    n_rows = data.shape[0]
    return np.tile(np.log(self.class_prob), (n_rows, 1))

def _dense_probs(self, data):
    """Return unnormalized class log-probabilities for a dense matrix.

    Starts from the log priors and, for each column of `data`, adds the
    log-probability row selected by that column's value. Each table in
    `self.log_cont_prob` is indexed as `[class, value]`.

    Missing values (NaN) contribute nothing: they are mapped to an extra
    all-zero row appended after the last real value.
    """
    probs = self._priors(data)
    zeros = np.zeros((1, probs.shape[1]))
    for col, attr_prob in zip(data.T, self.log_cont_prob):
        # Rows 0..n_values-1 are the real values; row n_values is zeros.
        lookup = np.vstack((attr_prob.T, zeros))
        col = col.copy()
        # NaN must select the zeros row (index n_values). Using
        # n_values - 1 would score a missing value as the last category.
        col[np.isnan(col)] = attr_prob.shape[1]
        probs += lookup[col.astype(int)]
    return probs

def _sparse_probs(self, data):
    """Return unnormalized class log-probabilities for a sparse matrix.

    Entries not stored in the matrix are treated as value 0: the value-0
    log-probability of every attribute is first added to every row, and
    each stored entry then adds the difference between the log-probability
    of its own value and that of value 0.

    A stored NaN adds the negated value-0 term, which cancels the baseline
    so that a missing value contributes nothing.

    CSR input is processed row by row. Any other sparse format is converted
    to CSC and processed column by column.
    """
    probs = self._priors(data)

    # One slot more than the largest attribute; the last slot is for NaN.
    n_vals = max(p.shape[1] for p in self.log_cont_prob) + 1
    nan_code = n_vals - 1
    log_prob = np.zeros((len(self.log_cont_prob),
                         n_vals,
                         self.log_cont_prob[0].shape[0]))
    for i, p in enumerate(self.log_cont_prob):
        p0 = p.T[0].copy()
        probs[:] += p0
        log_prob[i, :p.shape[1]] = p.T - p0
        # Missing value: undo the value-0 baseline added above.
        log_prob[i, nan_code] = -p0

    # Take the value codes from the matrix that is actually iterated, so
    # they stay aligned with its indptr/indices after a format conversion.
    matrix = data if sp.isspmatrix_csr(data) else data.tocsc()
    codes = matrix.data.copy()
    codes[np.isnan(codes)] = nan_code
    codes = codes.astype(int)

    if sp.isspmatrix_csr(matrix):
        for row, start, end in zip(probs, matrix.indptr, matrix.indptr[1:]):
            # `row` is a view into `probs`, so this updates it in place.
            row += log_prob[matrix.indices[start:end],
                            codes[start:end]].sum(axis=0)
    else:
        for start, end, attr_prob in zip(matrix.indptr, matrix.indptr[1:],
                                         log_prob):
            probs[matrix.indices[start:end]] += attr_prob[codes[start:end]]

    return probs


NaiveBayesLearner.__returns__ = NaiveBayesModel
195 changes: 180 additions & 15 deletions Orange/tests/test_naive_bayes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,39 +2,42 @@
# pylint: disable=missing-docstring

import unittest
import warnings
from unittest.mock import Mock

import numpy as np
import scipy.sparse as sp

from Orange.classification import NaiveBayesLearner
from Orange.data import Table, Domain, DiscreteVariable, ContinuousVariable
from Orange.evaluation import CrossValidation, CA


# This class is used to force predict_storage to fall back to the slower
# procedure instead of calling `predict`
class NotATable(Table):  # pylint: disable=too-many-ancestors,abstract-method
    """A `Table` subclass with no behavior of its own.

    Tests use it for data whose type is not exactly `Table`.
    """


class TestNaiveBayesLearner(unittest.TestCase):
@classmethod
def setUpClass(cls):
    """Load the shared data set and learner once for the whole class.

    The model is not built here; it is fitted per test in `setUp`.
    """
    cls.data = data = Table('titanic')
    cls.learner = NaiveBayesLearner()
    # Every 20th row: a smaller table for the cross-validation test.
    cls.table = data[::20]

def setUp(self):
    """Fit a fresh model on the full data set before every test."""
    fitted = self.learner(self.data)
    self.model = fitted

def test_NaiveBayes(self):
    """Check cross-validated classification accuracy on two data sets."""
    # Subsampled titanic: accuracy must fall inside a plausible band.
    results = CrossValidation(self.table, [self.learner], k=10)
    ca = CA(results)
    self.assertGreater(ca, 0.7)
    self.assertLess(ca, 0.9)

    # Iris: only a lower bound is checked.
    results = CrossValidation(Table("iris"), [self.learner], k=10)
    ca = CA(results)
    self.assertGreater(ca, 0.7)

def test_degenerate(self):
d = Domain((ContinuousVariable(name="A"),
Expand All @@ -53,3 +56,165 @@ def test_allnan_cv(self):
data = Table('voting')
results = CrossValidation(data, [self.learner])
self.assertFalse(any(results.failed))

def test_prediction_routing(self):
    """Every entry point given Table-backed input must call `predict`."""
    data = self.data
    predict = self.model.predict = Mock(return_value=(data.Y, None))

    entry_points = (
        lambda: self.model(data),
        lambda: self.model(data.X),
        lambda: self.model.predict_storage(data),
        lambda: self.model.predict_storage(data[0]),
    )
    for call in entry_points:
        call()
        predict.assert_called()
        # Clear the call record so the next entry point is checked alone.
        predict.reset_mock()

def test_compare_results_of_predict_and_predict_storage(self):
    """The `predict` path and the per-instance path must agree.

    The same rows are predicted once from a `Table` (which must go through
    `predict`) and once from a `NotATable` (which must not).
    """
    data2 = NotATable("titanic")

    self.model = self.learner(self.data[:50])
    # Wrap the real `predict` so calls are recorded but results unchanged.
    predict = self.model.predict = Mock(side_effect=self.model.predict)
    values, probs = self.model.predict_storage(self.data[50:])
    predict.assert_called()
    predict.reset_mock()
    values2, probs2 = self.model.predict_storage(data2[50:])
    predict.assert_not_called()

    np.testing.assert_equal(values, values2)
    # The two paths add the same log-probability terms in a different
    # order, so the floats may differ in the last bits. Compare with a
    # tolerance rather than exactly.
    np.testing.assert_almost_equal(probs, probs2)

def test_predictions(self):
    """Run the shared prediction checks on dense (non-sparse) input."""
    dense = None
    self._test_predictions(sparse=dense)

def test_predictions_csr_matrix(self):
    """Run the shared prediction checks with CSR sparse matrices."""
    with warnings.catch_warnings():
        # Silence the pending deprecation notice about the matrix subclass.
        warnings.filterwarnings(
            "ignore",
            message=".*the matrix subclass.*",
            category=PendingDeprecationWarning)
        self._test_predictions(sparse=sp.csr_matrix)

def test_predictions_csc_matrix(self):
    """Run the shared prediction checks with CSC sparse matrices."""
    with warnings.catch_warnings():
        # Silence the pending deprecation notice about the matrix subclass.
        warnings.filterwarnings(
            "ignore",
            message=".*the matrix subclass.*",
            category=PendingDeprecationWarning)
        self._test_predictions(sparse=sp.csc_matrix)

def _test_predictions(self, sparse):
    """Fit a small hand-computed model and check every prediction path.

    Args:
        sparse: `None` for dense arrays, or a scipy sparse matrix
            constructor used to convert the training (and, except for
            CSC, the test) matrix.
    """
    train_x = np.array([
        [1, 0, 0],
        [0, np.nan, 0],
        [0, 1, 0],
        [0, 0, 0],
        [1, 2, 0],
        [1, 1, 0],
        [1, 2, 0],
        [0, 1, 0]])
    if sparse is not None:
        train_x = sparse(train_x)

    train_y = np.array([0, 0, 0, 1, 1, 1, 2, 2])
    domain = Domain(
        [DiscreteVariable("a", values="ab"),
         DiscreteVariable("b", values="abc"),
         DiscreteVariable("c", values="a")],
        DiscreteVariable("y", values="abc"))
    data = Table.from_numpy(domain, train_x, train_y)

    model = self.learner(data)
    np.testing.assert_almost_equal(
        model.class_prob,
        [4/11, 4/11, 3/11]
    )
    # Expected exp(log_cont_prob[i]) * class_prob[:, None], per attribute.
    expected_tables = (
        [[3/7, 2/7], [2/7, 3/7], [2/7, 2/7]],
        [[2/5, 1/3, 1/5], [2/5, 1/3, 2/5], [1/5, 1/3, 2/5]],
        [[4/11], [4/11], [3/11]],
    )
    for idx, expected in enumerate(expected_tables):
        np.testing.assert_almost_equal(
            np.exp(model.log_cont_prob[idx]) * model.class_prob[:, None],
            expected)

    test_x = np.array([[a, b, 0] for a in [0, 1] for b in [0, 1, 2]])
    # Classifiers reject csc matrices in the base class
    # Naive bayesian classifier supports them if predict_storage is
    # called directly, which we do below
    if sparse is not None and sparse is not sp.csc_matrix:
        test_x = sparse(test_x)
    test_y = np.full((6, ), np.nan)
    # The following was computed manually, too
    exp_probs = np.array([
        [0.47368421052632, 0.31578947368421, 0.21052631578947],
        [0.39130434782609, 0.26086956521739, 0.34782608695652],
        [0.24324324324324, 0.32432432432432, 0.43243243243243],
        [0.31578947368421, 0.47368421052632, 0.21052631578947],
        [0.26086956521739, 0.39130434782609, 0.34782608695652],
        [0.15000000000000, 0.45000000000000, 0.40000000000000]
    ])
    exp_values = np.argmax(exp_probs, axis=1)

    def check_all_return_modes(target):
        """Assert probabilities, values and both together for `target`."""
        probs = model(target, ret=model.Probs)
        np.testing.assert_almost_equal(exp_probs, probs)
        values = model(target)
        np.testing.assert_equal(values, exp_values)
        values, probs = model(target, ret=model.ValueProbs)
        np.testing.assert_almost_equal(exp_probs, probs)
        np.testing.assert_equal(values, exp_values)

    # Test the faster algorithm for Table (numpy matrices)
    check_all_return_modes(Table.from_numpy(domain, test_x, test_y))

    # Test the slower algorithm for non-Table data (iteration in Python)
    slow_data = NotATable.from_numpy(domain, test_x, test_y)
    check_all_return_modes(slow_data)

    # Test prediction directly on numpy
    check_all_return_modes(test_x)

    # Test prediction on instances
    for inst, exp_prob in zip(slow_data, exp_probs):
        best = np.argmax(exp_prob)
        np.testing.assert_almost_equal(
            model(inst, ret=model.Probs)[0],
            exp_prob)
        self.assertEqual(model(inst), best)
        value, prob = model(inst, ret=model.ValueProbs)
        np.testing.assert_almost_equal(prob[0], exp_prob)
        self.assertEqual(value, best)

    # Test prediction by directly calling predict. This is needed to test
    # csc_matrix, but doesn't hurt others
    if sparse is sp.csc_matrix:
        test_x = sparse(test_x)
    values, probs = model.predict(test_x)
    np.testing.assert_almost_equal(exp_probs, probs)
    np.testing.assert_equal(values, exp_values)

def test_no_attributes(self):
    """With no attributes, predictions must equal the class priors."""
    classes = np.array([0, 0, 0, 1, 1, 1, 2, 2])
    class_var = DiscreteVariable("y", values="abc")
    no_columns = np.zeros((len(classes), 0))
    data = Table.from_numpy(Domain([], class_var), no_columns, classes.T)
    model = self.learner(data)

    prediction = model.predict_storage(np.zeros((5, 0)))
    np.testing.assert_almost_equal(
        prediction[1],
        [[4/11, 4/11, 3/11]] * 5
    )


if __name__ == "__main__":
unittest.main()