|
2 | 2 | Base and utility classes for pandas objects.
|
3 | 3 | """
|
4 | 4 | from pandas import compat
|
| 5 | +from pandas.compat import builtins |
5 | 6 | import numpy as np
|
6 | 7 | from pandas.core import common as com
|
7 | 8 | import pandas.core.nanops as nanops
|
@@ -218,6 +219,266 @@ def __delete__(self, instance):
|
218 | 219 | raise AttributeError("can't delete attribute")
|
219 | 220 |
|
220 | 221 |
|
class GroupByError(Exception):
    """Base class for the groupby-related exceptions defined in this module."""
| 224 | + |
| 225 | + |
class DataError(GroupByError):
    """
    GroupByError subclass.

    Within this module it is caught, together with TypeError, by
    SelectionMixin._aggregate_multiple_funcs, which then skips the
    offending aggregation.

    NOTE(review): presumably signals data that is unsuitable for the
    requested aggregation -- confirm at the raise sites.
    """
| 228 | + |
| 229 | + |
class SpecificationError(GroupByError):
    """
    GroupByError subclass.

    NOTE(review): presumably signals an invalid aggregation specification
    -- confirm at the raise sites, which are not in this module section.
    """
| 232 | + |
| 233 | + |
class SelectionMixin(object):
    """
    Mixin implementing the selection & aggregation interface on a
    group-like object.

    Sub-classes need to define ``obj`` and ``exclusions``.  The aggregation
    helpers additionally read ``self.axis``, and ``__getitem__`` consults an
    optional ``as_index`` attribute (treated as False when absent).
    Sub-classes must also implement ``_gotitem`` and ``aggregate``.
    """
    # current selection (column name or list-like of names); None = nothing
    _selection = None
    _internal_names = ['_cache']
    _internal_names_set = set(_internal_names)

    # python builtins -> numpy equivalents (see _is_builtin_func)
    _builtin_table = {
        builtins.sum: np.sum,
        builtins.max: np.max,
        builtins.min: np.min,
    }

    # callables -> name of the method on self that implements them
    # (see _is_cython_func and its use in _aggregate)
    _cython_table = {
        builtins.sum: 'sum',
        builtins.max: 'max',
        builtins.min: 'min',
        np.sum: 'sum',
        np.mean: 'mean',
        np.prod: 'prod',
        np.std: 'std',
        np.var: 'var',
        np.median: 'median',
        np.max: 'max',
        np.min: 'min',
        np.cumprod: 'cumprod',
        np.cumsum: 'cumsum'
    }

    @property
    def name(self):
        """Return the current selection, or None if nothing is selected."""
        return self._selection

    @property
    def _selection_list(self):
        """Return the selection as a list-like, wrapping a scalar in a list."""
        if not isinstance(self._selection, (list, tuple, com.ABCSeries,
                                            com.ABCIndex, np.ndarray)):
            return [self._selection]
        return self._selection

    @cache_readonly
    def _selected_obj(self):
        """
        Return ``obj`` itself when nothing is selected or ``obj`` is a
        Series, otherwise ``obj`` indexed by the selection.
        """
        if self._selection is None or isinstance(self.obj, com.ABCSeries):
            return self.obj
        else:
            return self.obj[self._selection]

    @cache_readonly
    def _obj_with_exclusions(self):
        """
        Return the object the aggregations operate on.

        With a selection on a DataFrame this is the frame reindexed to the
        selected columns; otherwise it is ``obj`` with the ``exclusions``
        dropped along axis 1 (or ``obj`` unchanged if there are none).
        """
        if (self._selection is not None and
                isinstance(self.obj, com.ABCDataFrame)):
            return self.obj.reindex(columns=self._selection_list)

        if len(self.exclusions) > 0:
            return self.obj.drop(self.exclusions, axis=1)
        else:
            return self.obj

    def __getitem__(self, key):
        """
        Select one or several columns and return the sliced object.

        Parameters
        ----------
        key : label or list-like of labels
            list-likes (list, tuple, Series, Index, ndarray) request a
            2-dimensional result; a single label requests a 1-dimensional
            result, unless ``as_index`` is missing or falsy, in which case
            a 2-dimensional result is requested.

        Raises
        ------
        Exception
            if a selection has already been made
        KeyError
            if any requested column does not exist
        """
        if self._selection is not None:
            raise Exception('Column(s) %s already selected' % self._selection)

        if isinstance(key, (list, tuple, com.ABCSeries, com.ABCIndex,
                            np.ndarray)):
            if len(self.obj.columns.intersection(key)) != len(key):
                bad_keys = list(set(key).difference(self.obj.columns))
                # [1:-1] strips the brackets of the list repr
                raise KeyError("Columns not found: %s"
                               % str(bad_keys)[1:-1])
            return self._gotitem(list(key), ndim=2)

        elif not getattr(self, 'as_index', False):
            if key not in self.obj.columns:
                raise KeyError("Column not found: %s" % key)
            return self._gotitem(key, ndim=2)

        else:
            if key not in self.obj:
                raise KeyError("Column not found: %s" % key)
            return self._gotitem(key, ndim=1)

    def _gotitem(self, key, ndim, subset=None):
        """
        sub-classes to define
        return a sliced object

        Parameters
        ----------
        key : string / list of selections
        ndim : 1,2
            requested ndim of result
        subset : object, default None
            subset to act on

        """
        raise AbstractMethodError(self)

    _agg_doc = """Aggregate using input function or dict of {column -> function}

Parameters
----------
arg : function or dict
    Function to use for aggregating groups. If a function, must either
    work when passed a DataFrame or when passed to DataFrame.apply. If
    passed a dict, the keys must be DataFrame column names.

    Accepted Combinations are:
      - string cythonized function name
      - function
      - list of functions
      - dict of columns -> functions
      - nested dict of names -> dicts of functions

Notes
-----
Numpy functions mean/median/prod/sum/std/var are special cased so the
default behavior is applying the function along axis=0
(e.g., np.mean(arr_2d, axis=0)) as opposed to
mimicking the default Numpy behavior (e.g., np.mean(arr_2d)).

Returns
-------
aggregated : DataFrame
"""

    # NOTE: agg/aggregate deliberately carry no docstring of their own;
    # their documentation is exactly _agg_doc, attached by Appender.
    @Appender(_agg_doc)
    def agg(self, func, *args, **kwargs):
        return self.aggregate(func, *args, **kwargs)

    @Appender(_agg_doc)
    def aggregate(self, func, *args, **kwargs):
        raise AbstractMethodError(self)

    def _aggregate(self, arg, *args, **kwargs):
        """
        provide an implementation for the aggregators

        Parameters
        ----------
        arg : string, dict, iterable of aggregations, or callable
            - string: name of the method on self to call with
              ``*args, **kwargs``
            - dict: mapping whose values are the aggregations to apply
            - other iterable: handed to ``_aggregate_multiple_funcs``
            - anything else: looked up in ``_cython_table``

        Returns
        -------
        tuple of result, how

        Raises
        ------
        ValueError
            if a dict is passed and ``self.axis`` is not 0

        Notes
        -----
        how can be a string describing the required post-processing, or
        None if not required
        """
        if isinstance(arg, compat.string_types):
            return getattr(self, arg)(*args, **kwargs), None

        if isinstance(arg, dict):
            if self.axis != 0:  # pragma: no cover
                raise ValueError('Can only pass dict with axis=0')

            obj = self._selected_obj

            # as soon as one value is a container, normalize every value
            # so that bare aggregations become one-element lists
            if any(isinstance(x, (list, tuple, dict))
                   for x in arg.values()):
                new_arg = compat.OrderedDict()
                for k, v in compat.iteritems(arg):
                    if not isinstance(v, (tuple, list, dict)):
                        new_arg[k] = [v]
                    else:
                        new_arg[k] = v
                arg = new_arg

            result = compat.OrderedDict()
            keys = []
            if self._selection is not None:
                # a selection exists: the dict keys only name the results,
                # every aggregation runs on the selected object
                subset = obj

                for fname, agg_how in compat.iteritems(arg):
                    colg = self._gotitem(self._selection, ndim=1,
                                         subset=subset)
                    result[fname] = colg.aggregate(agg_how)
                    keys.append(fname)
            else:
                # no selection: the dict keys are passed to _gotitem
                for col, agg_how in compat.iteritems(arg):
                    colg = self._gotitem(col, ndim=1)
                    result[col] = colg.aggregate(agg_how)
                    keys.append(col)

            # NOTE(review): an empty dict leaves ``result`` empty and the
            # [0] lookup below raises IndexError -- left as-is, confirm
            # whether callers should get a clearer error.
            if isinstance(list(result.values())[0], com.ABCDataFrame):
                # imported lazily from the public top-level namespace,
                # like DataFrame below
                from pandas import concat
                result = concat([result[k] for k in keys], keys=keys,
                                axis=1)
            else:
                from pandas import DataFrame
                result = DataFrame(result)

            return result, True

        if hasattr(arg, '__iter__'):
            return self._aggregate_multiple_funcs(arg), None

        cy_func = self._is_cython_func(arg)
        if cy_func and not args and not kwargs:
            return getattr(self, cy_func)(), None

        # nothing computed here; caller can react
        return None, True

    def _aggregate_multiple_funcs(self, arg):
        """
        Apply several aggregations and concatenate the results along axis 1.

        Parameters
        ----------
        arg : iterable of aggregations

        Returns
        -------
        the concatenation (``axis=1``) of the individual results, keyed by
        aggregation (1-dimensional object) or by column name (otherwise)

        Raises
        ------
        NotImplementedError
            if ``self.axis`` is not 0
        ValueError
            if no aggregation produced a result

        Notes
        -----
        Aggregations raising TypeError or DataError are skipped silently;
        any other exception (SpecificationError included) propagates.
        """
        from pandas import concat

        if self.axis != 0:
            raise NotImplementedError("axis other than 0 is not supported")

        obj = self._obj_with_exclusions
        results = []
        keys = []

        # degenerate case: one aggregation at a time on the single object
        if obj.ndim == 1:
            for a in arg:
                try:
                    colg = self._gotitem(obj.name, ndim=1, subset=obj)
                    results.append(colg.aggregate(a))
                    keys.append(getattr(a, 'name', a))
                except (TypeError, DataError):
                    pass

        # multiples: all aggregations on each column in turn
        else:
            for col in obj:
                try:
                    colg = self._gotitem(col, ndim=1, subset=obj[col])
                    results.append(colg.aggregate(arg))
                    keys.append(col)
                except (TypeError, DataError):
                    pass

        # every aggregation was skipped (or nothing was requested): fail
        # here with a clear message instead of inside concat
        if not results:
            raise ValueError("no results")

        return concat(results, keys=keys, axis=1)

    def _is_cython_func(self, arg):
        """
        if we define an internal function for this argument, return its
        name, otherwise return None
        """
        return self._cython_table.get(arg)

    def _is_builtin_func(self, arg):
        """
        if we define a builtin function for this argument, return it,
        otherwise return the arg
        """
        return self._builtin_table.get(arg, arg)
| 481 | + |
221 | 482 | class FrozenList(PandasObject, list):
|
222 | 483 |
|
223 | 484 | """
|
|
0 commit comments