diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index adc17c7514832..6eee7e74a7df2 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -306,13 +306,16 @@ def validate_expanding_func(name, args, kwargs): raise UnsupportedFunctionCall(msg) -def validate_groupby_func(name, args, kwargs): +def validate_groupby_func(name, args, kwargs, allowed_kwargs=None): """ - 'args' and 'kwargs' should be empty because all of + 'args' should be empty because all of their necessary parameters are explicitly listed in the function signature """ - if len(args) + len(kwargs) > 0: + if allowed_kwargs: + kwargs = set(kwargs) - set(allowed_kwargs) + + if len(args) or len(kwargs): raise UnsupportedFunctionCall(( "numpy operations are not valid " "with groupby. Use .groupby(...)." diff --git a/pandas/core/base.py b/pandas/core/base.py index 49e43a60403ca..0cc3782d5db3f 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -910,12 +910,18 @@ def hasnans(self): return isnull(self).any() def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, - filter_type=None, **kwds): + weights=None, filter_type=None, **kwds): """ perform the reduction type operation if we can """ func = getattr(self, name, None) if func is None: raise TypeError("{klass} cannot perform the operation {op}".format( klass=self.__class__.__name__, op=name)) + + if weights is not None: + from pandas.tools import weightby + _, weights = weightby.weightby(self, weights=weights, axis=axis) + kwds['weights'] = weights + return func(**kwds) def value_counts(self, normalize=False, sort=True, ascending=False, diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 0562736038483..ec3d4510cda5b 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1747,7 +1747,7 @@ def _reverse_indexer(self): # reduction ops # def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, - filter_type=None, **kwds): + 
weights=None, filter_type=None, **kwds): """ perform the reduction type operation """ func = getattr(self, name, None) if func is None: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d96fb094f5d5c..5b9a8481c3a05 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4894,10 +4894,15 @@ def _count_level(self, level, axis=0, numeric_only=False): else: return result - def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, - filter_type=None, **kwds): + def _reduce(self, op, name, axis=0, skipna=True, weights=None, + numeric_only=None, filter_type=None, **kwds): axis = self._get_axis_number(axis) + if weights is not None: + from pandas.tools import weightby + self, weights = weightby.weightby(self, weights=weights, axis=axis) + kwds['weights'] = weights + def f(x): return op(x, axis=axis, skipna=skipna, **kwds) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 105f9f93f4ca8..46a3868654cd9 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4,6 +4,7 @@ import operator import weakref import gc +from textwrap import dedent import numpy as np import pandas.lib as lib @@ -50,6 +51,7 @@ from pandas.compat.numpy import function as nv from pandas.compat import (map, zip, lrange, string_types, isidentifier, set_function_name) +from pandas.tools import weightby import pandas.core.nanops as nanops from pandas.util.decorators import Appender, Substitution, deprecate_kwarg from pandas.core import config @@ -2541,146 +2543,91 @@ def tail(self, n=5): return self.iloc[0:0] return self.iloc[-n:] - def sample(self, n=None, frac=None, replace=False, weights=None, - random_state=None, axis=None): - """ - Returns a random sample of items from an axis of object. - - .. versionadded:: 0.16.1 - - Parameters - ---------- - n : int, optional - Number of items from axis to return. Cannot be used with `frac`. - Default = 1 if `frac` = None. - frac : float, optional - Fraction of axis items to return. Cannot be used with `n`. 
- replace : boolean, optional - Sample with or without replacement. Default = False. - weights : str or ndarray-like, optional - Default 'None' results in equal probability weighting. - If passed a Series, will align with target object on index. Index - values in weights not found in sampled object will be ignored and - index values in sampled object not in weights will be assigned - weights of zero. - If called on a DataFrame, will accept the name of a column - when axis = 0. - Unless weights are a Series, weights must be same length as axis - being sampled. - If weights do not sum to 1, they will be normalized to sum to 1. - Missing values in the weights column will be treated as zero. - inf and -inf values not allowed. - random_state : int or numpy.random.RandomState, optional - Seed for the random number generator (if int), or numpy RandomState - object. - axis : int or string, optional - Axis to sample. Accepts axis number or name. Default is stat axis - for given data type (0 for Series and DataFrames, 1 for Panels). + _shared_docs['sample'] = dedent("""Returns a random sample of items from an axis of object. - Returns - ------- - A new object of same type as caller. +.. versionadded:: 0.16.1 - Examples - -------- - - Generate an example ``Series`` and ``DataFrame``: - - >>> s = pd.Series(np.random.randn(50)) - >>> s.head() - 0 -0.038497 - 1 1.820773 - 2 -0.972766 - 3 -1.598270 - 4 -1.095526 - dtype: float64 - >>> df = pd.DataFrame(np.random.randn(50, 4), columns=list('ABCD')) - >>> df.head() - A B C D - 0 0.016443 -2.318952 -0.566372 -1.028078 - 1 -1.051921 0.438836 0.658280 -0.175797 - 2 -1.243569 -0.364626 -0.215065 0.057736 - 3 1.768216 0.404512 -0.385604 -1.457834 - 4 1.072446 -1.137172 0.314194 -0.046661 - - Next extract a random sample from both of these objects... 
- - 3 random elements from the ``Series``: - - >>> s.sample(n=3) - 27 -0.994689 - 55 -1.049016 - 67 -0.224565 - dtype: float64 +Parameters +---------- +n : int, optional + Number of items from axis to return. Cannot be used with `frac`. + Default = 1 if `frac` = None. +frac : float, optional + Fraction of axis items to return. Cannot be used with `n`. +replace : boolean, optional + Sample with or without replacement. Default = False. +%(weights)s +random_state : int or numpy.random.RandomState, optional + Seed for the random number generator (if int), or numpy RandomState + object. +axis : int or string, optional + Axis to sample. Accepts axis number or name. Default is stat axis + for given data type (0 for Series and DataFrames, 1 for Panels). - And a random 10% of the ``DataFrame`` with replacement: +Returns +------- +A new object of same type as caller. - >>> df.sample(frac=0.1, replace=True) - A B C D - 35 1.981780 0.142106 1.817165 -0.290805 - 49 -1.336199 -0.448634 -0.789640 0.217116 - 40 0.823173 -0.078816 1.009536 1.015108 - 15 1.421154 -0.055301 -1.922594 -0.019696 - 6 -0.148339 0.832938 1.787600 -1.383767 - """ +Examples +-------- +Generate an example ``Series`` and ``DataFrame``: + +>>> s = pd.Series(np.random.randn(50)) +>>> s.head() +0 -0.038497 +1 1.820773 +2 -0.972766 +3 -1.598270 +4 -1.095526 +dtype: float64 +>>> df = pd.DataFrame(np.random.randn(50, 4), columns=list('ABCD')) +>>> df.head() + A B C D +0 0.016443 -2.318952 -0.566372 -1.028078 +1 -1.051921 0.438836 0.658280 -0.175797 +2 -1.243569 -0.364626 -0.215065 0.057736 +3 1.768216 0.404512 -0.385604 -1.457834 +4 1.072446 -1.137172 0.314194 -0.046661 + +Next extract a random sample from both of these objects... 
+ +3 random elements from the ``Series``: + +>>> s.sample(n=3) +27 -0.994689 +55 -1.049016 +67 -0.224565 +dtype: float64 + +And a random 10%% of the ``DataFrame`` with replacement: + +>>> df.sample(frac=0.1, replace=True) + A B C D +35 1.981780 0.142106 1.817165 -0.290805 +49 -1.336199 -0.448634 -0.789640 0.217116 +40 0.823173 -0.078816 1.009536 1.015108 +15 1.421154 -0.055301 -1.922594 -0.019696 +6 -0.148339 0.832938 1.787600 -1.383767 +""") + + @Appender(_shared_docs['sample'] % dict( + weights=weightby._shared_docs['weights'], + **_shared_doc_kwargs)) + def sample(self, n=None, frac=None, replace=False, weights=None, + random_state=None, axis=None, **kwargs): if axis is None: axis = self._stat_axis_number axis = self._get_axis_number(axis) - axis_length = self.shape[axis] # Process random_state argument rs = com._random_state(random_state) - # Check weights for compliance if weights is not None: + self, weights = weightby.weightby(self, weights=weights, axis=axis) - # If a series, align with frame - if isinstance(weights, pd.Series): - weights = weights.reindex(self.axes[axis]) - - # Strings acceptable if a dataframe and axis = 0 - if isinstance(weights, string_types): - if isinstance(self, pd.DataFrame): - if axis == 0: - try: - weights = self[weights] - except KeyError: - raise KeyError("String passed to weights not a " - "valid column") - else: - raise ValueError("Strings can only be passed to " - "weights when sampling from rows on " - "a DataFrame") - else: - raise ValueError("Strings cannot be passed as weights " - "when sampling from a Series or Panel.") - - weights = pd.Series(weights, dtype='float64') - - if len(weights) != axis_length: - raise ValueError("Weights and axis to be sampled must be of " - "same length") - - if (weights == np.inf).any() or (weights == -np.inf).any(): - raise ValueError("weight vector may not include `inf` values") - - if (weights < 0).any(): - raise ValueError("weight vector many not include negative " - "values") - - # If 
has nan, set to zero. - weights = weights.fillna(0) - - # Renormalize if don't sum to 1 - if weights.sum() != 1: - if weights.sum() != 0: - weights = weights / weights.sum() - else: - raise ValueError("Invalid weights: weights sum to zero") - - weights = weights.values + axis_length = self.shape[axis] # If no frac or n, default to n=1. if n is None and frac is None: @@ -5567,14 +5514,20 @@ def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, np.putmask(rs.values, mask, np.nan) return rs - def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwargs): + def _agg_by_level(self, name, axis=0, level=0, skipna=True, + weights=None, **kwargs): grouped = self.groupby(level=level, axis=axis) if hasattr(grouped, name) and skipna: return getattr(grouped, name)(**kwargs) axis = self._get_axis_number(axis) method = getattr(type(self), name) - applyf = lambda x: method(x, axis=axis, skipna=skipna, **kwargs) - return grouped.aggregate(applyf) + + def f(x): + if weights is not None: + kwargs['weights'] = weights + return method(x, axis=axis, skipna=skipna, **kwargs) + + return grouped.aggregate(f) @classmethod def _add_numeric_operations(cls): @@ -5659,19 +5612,19 @@ def compound(self, axis=None, skipna=None, level=None): lambda y, axis: np.maximum.accumulate(y, axis), "max", -np.inf, np.nan) - cls.sum = _make_stat_function( + cls.sum = _make_stat_function_weighted( cls, 'sum', name, name2, axis_descr, 'Return the sum of the values for the requested axis', nanops.nansum) - cls.mean = _make_stat_function( + cls.mean = _make_stat_function_weighted( cls, 'mean', name, name2, axis_descr, 'Return the mean of the values for the requested axis', nanops.nanmean) - cls.skew = _make_stat_function( + cls.skew = _make_stat_function_weighted( cls, 'skew', name, name2, axis_descr, 'Return unbiased skew over requested axis\nNormalized by N-1', nanops.nanskew) - cls.kurt = _make_stat_function( + cls.kurt = _make_stat_function_weighted( cls, 'kurt', name, name2, 
axis_descr, "Return unbiased kurtosis over requested axis using Fisher's " "definition of\nkurtosis (kurtosis of normal == 0.0). Normalized " @@ -5790,6 +5743,28 @@ def _doc_parms(cls): ------- %(outname)s : %(name1)s or %(name2)s (if level specified)\n""" +_num_weighted_doc = """ + +%(desc)s + +Parameters +---------- +axis : %(axis_descr)s +skipna : boolean, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA +level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a + particular level, collapsing into a %(name1)s +%(weights)s +numeric_only : boolean, default None + Include only float, int, boolean columns. If None, will attempt to use + everything, then use only numeric data. Not implemented for Series. + +Returns +------- +%(outname)s : %(name1)s or %(name2)s (if level specified)\n""" + _num_ddof_doc = """ %(desc)s @@ -5805,6 +5780,7 @@ def _doc_parms(cls): particular level, collapsing into a %(name1)s ddof : int, default 1 degrees of freedom +%(weights)s numeric_only : boolean, default None Include only float, int, boolean columns. If None, will attempt to use everything, then use only numeric data. Not implemented for Series. 
@@ -5876,12 +5852,34 @@ def stat_func(self, axis=None, skipna=None, level=None, numeric_only=None, return set_function_name(stat_func, name, cls) +def _make_stat_function_weighted(cls, name, name1, name2, axis_descr, desc, f): + @Substitution(outname=name, desc=desc, name1=name1, name2=name2, + axis_descr=axis_descr, + weights=weightby._shared_docs['weights']) + @Appender(_num_weighted_doc) + def stat_func(self, axis=None, skipna=None, level=None, weights=None, + numeric_only=None, **kwargs): + nv.validate_stat_func(tuple(), kwargs, fname=name) + if skipna is None: + skipna = True + if axis is None: + axis = self._stat_axis_number + if level is not None: + return self._agg_by_level(name, axis=axis, level=level, + skipna=skipna, weights=weights) + return self._reduce(f, name, axis=axis, skipna=skipna, + weights=weights, numeric_only=numeric_only) + + return set_function_name(stat_func, name, cls) + + def _make_stat_function_ddof(cls, name, name1, name2, axis_descr, desc, f): @Substitution(outname=name, desc=desc, name1=name1, name2=name2, - axis_descr=axis_descr) + axis_descr=axis_descr, + weights=weightby._shared_docs['weights']) @Appender(_num_ddof_doc) def stat_func(self, axis=None, skipna=None, level=None, ddof=1, - numeric_only=None, **kwargs): + weights=None, numeric_only=None, **kwargs): nv.validate_stat_ddof_func(tuple(), kwargs, fname=name) if skipna is None: skipna = True @@ -5889,9 +5887,10 @@ def stat_func(self, axis=None, skipna=None, level=None, ddof=1, axis = self._stat_axis_number if level is not None: return self._agg_by_level(name, axis=axis, level=level, - skipna=skipna, ddof=ddof) + skipna=skipna, weights=weights, + ddof=ddof) return self._reduce(f, name, axis=axis, numeric_only=numeric_only, - skipna=skipna, ddof=ddof) + weights=weights, skipna=skipna, ddof=ddof) return set_function_name(stat_func, name, cls) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 7eba32b4932d0..362263a8b7ce9 100644 --- a/pandas/core/groupby.py +++ 
b/pandas/core/groupby.py @@ -49,6 +49,7 @@ from pandas.formats.printing import pprint_thing from pandas.util.validators import validate_kwargs +from pandas.tools import weightby import pandas.core.algorithms as algos import pandas.core.common as com from pandas.core.config import option_context @@ -340,10 +341,14 @@ class _GroupBy(PandasObject, SelectionMixin): def __init__(self, obj, keys=None, axis=0, level=None, grouper=None, exclusions=None, selection=None, as_index=True, - sort=True, group_keys=True, squeeze=False, **kwargs): + sort=True, group_keys=True, squeeze=False, ref_obj=None, **kwargs): self._selection = selection + if ref_obj is None: + ref_obj = obj + self.ref_obj = ref_obj + if isinstance(obj, NDFrame): obj._consolidate_inplace() @@ -791,15 +796,23 @@ def _cython_transform(self, how, numeric_only=True): return self._wrap_transformed_output(output, names) - def _cython_agg_general(self, how, numeric_only=True): + def _cython_agg_general(self, how, weights=None, numeric_only=True): + if weights is not None: + + # TODO, need to integrate this with the exclusions + _, weights = weightby.weightby(self.ref_obj, + weights=weights, + axis=self.axis) + output = {} for name, obj in self._iterate_slices(): is_numeric = is_numeric_dtype(obj.dtype) if numeric_only and not is_numeric: continue + values = weightby.weight(obj.values, weights) try: - result, names = self.grouper.aggregate(obj.values, how) + result, names = self.grouper.aggregate(values, how) except AssertionError as e: raise GroupByError(str(e)) output[name] = self._try_cast(result, obj) @@ -1006,6 +1019,26 @@ def count(self): # defined here for API doc raise NotImplementedError + @Substitution(name='groupby') + @Appender(_doc_template) + def sum(self, *args, **kwargs): + """ + Compute sum of groups, excluding missing values + + For multiple groupings, the result index will be a MultiIndex + """ + + # TODO: this is slightly different from other cythonized functions (e.g. 
mean) + # to accommodate np.sum functionality + nv.validate_groupby_func('sum', args, kwargs, ('weights', 'numeric_only')) + self._set_group_selection() + try: + return self._cython_agg_general('add', **kwargs) + except AssertionError as e: + raise SpecificationError(str(e)) + except Exception: # pragma: no cover + return self.aggregate(lambda x: np.sum(x, axis=self.axis)) + @Substitution(name='groupby') @Appender(_doc_template) def mean(self, *args, **kwargs): @@ -1014,14 +1047,15 @@ def mean(self, *args, **kwargs): For multiple groupings, the result index will be a MultiIndex """ - nv.validate_groupby_func('mean', args, kwargs) + nv.validate_groupby_func('mean', args, kwargs, ('weights', 'numeric_only')) try: - return self._cython_agg_general('mean') + return self._cython_agg_general('mean', **kwargs) except GroupByError: raise except Exception: # pragma: no cover self._set_group_selection() - f = lambda x: x.mean(axis=self.axis) + kwargs['axis'] = self.axis + f = lambda x: x.mean(**kwargs) return self._python_agg_general(f) @Substitution(name='groupby') @@ -1107,7 +1141,6 @@ def size(self): """Compute group sizes""" return self.grouper.size() - sum = _groupby_function('sum', 'add', np.sum) prod = _groupby_function('prod', 'prod', np.prod) min = _groupby_function('min', 'min', np.min, numeric_only=False) max = _groupby_function('max', 'max', np.max, numeric_only=False) @@ -3134,9 +3167,9 @@ def _iterate_slices(self): continue yield val, slicer(val) - def _cython_agg_general(self, how, numeric_only=True): + def _cython_agg_general(self, how, **kwargs): new_items, new_blocks = self._cython_agg_blocks( - how, numeric_only=numeric_only) + how, **kwargs) return self._wrap_agged_blocks(new_items, new_blocks) def _wrap_agged_blocks(self, items, blocks): @@ -3162,9 +3195,17 @@ def _wrap_agged_blocks(self, items, blocks): _block_agg_axis = 0 - def _cython_agg_blocks(self, how, numeric_only=True): + def _cython_agg_blocks(self, how, weights=None, numeric_only=True, + 
**kwargs): data, agg_axis = self._get_data_to_aggregate() + if weights is not None: + + # TODO, need to integrate this with the exclusions + _, weights = weightby.weightby(self.ref_obj, + weights=weights, + axis=self.axis) + new_blocks = [] if numeric_only: @@ -3172,8 +3213,9 @@ def _cython_agg_blocks(self, how, numeric_only=True): for block in data.blocks: + values = weightby.weight(block.values, weights) result, _ = self.grouper.aggregate( - block.values, how, axis=agg_axis) + values, how, axis=agg_axis) # see if we can cast the block back to the original dtype result = block._try_coerce_and_cast_result(result) @@ -3730,19 +3772,20 @@ def _gotitem(self, key, ndim, subset=None): subset : object, default None subset to act on """ - if ndim == 2: if subset is None: subset = self.obj return DataFrameGroupBy(subset, self.grouper, selection=key, grouper=self.grouper, exclusions=self.exclusions, - as_index=self.as_index) + as_index=self.as_index, + ref_obj=self.obj) elif ndim == 1: if subset is None: subset = self.obj[key] return SeriesGroupBy(subset, selection=key, - grouper=self.grouper) + grouper=self.grouper, + ref_obj=self.obj) raise AssertionError("invalid ndim for _gotitem") diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 1f76bc850cee9..d57cfbc9dd44f 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -23,7 +23,7 @@ from pandas.types.missing import isnull, notnull from pandas.core.common import _values_from_object - +from pandas.tools import weightby class disallow(object): def __init__(self, *dtypes): @@ -71,11 +71,14 @@ def __call__(self, alt): bn_func = None @functools.wraps(alt) - def f(values, axis=None, skipna=True, **kwds): + def f(values, axis=None, skipna=True, weights=None, **kwds): if len(self.kwargs) > 0: for k, v in compat.iteritems(self.kwargs): if k not in kwds: kwds[k] = v + + if weights is not None: + kwds['weights'] = weights try: if self.zero_value is not None and values.size == 0: if values.ndim == 1: @@ -91,7 
+94,7 @@ def f(values, axis=None, skipna=True, **kwds): result.fill(0) return result - if (_USE_BOTTLENECK and skipna and + if (_USE_BOTTLENECK and skipna and weights is None and _bn_ok_dtype(values.dtype, bn_name)): result = bn_func(values, axis=axis, **kwds) @@ -101,7 +104,8 @@ def f(values, axis=None, skipna=True, **kwds): result = alt(values, axis=axis, skipna=skipna, **kwds) else: result = alt(values, axis=axis, skipna=skipna, **kwds) - except Exception: + except Exception as e: + try: result = alt(values, axis=axis, skipna=skipna, **kwds) except ValueError as e: @@ -169,11 +173,29 @@ def _get_fill_value(dtype, fill_value=None, fill_value_typ=None): return tslib.iNaT -def _get_values(values, skipna, fill_value=None, fill_value_typ=None, - isfinite=False, copy=True): - """ utility to get the values view, mask, dtype +def _get_values(values, skipna, + fill_value=None, fill_value_typ=None, + isfinite=False, weights=None, axis=None, + copy=True): + """ + utility to get the values view, mask, dtype if necessary copy and mask using the specified fill_value - copy = True will force the copy + and adjust for weights + + Parameters + ---------- + values : ndarray + skipna : boolean + fill_value : value, default None + value to fillna + fill_value_typ : value, default None + dtype of the fillvalue + isfinite : boolean, default False + weights : ndarray, optional + normalized ndarray, same length as the axis + axis : axis to broadcast, default None + copy : boolean, default True + True will force the copy """ values = _values_from_object(values) if isfinite: @@ -181,6 +203,8 @@ def _get_values(values, skipna, fill_value=None, fill_value_typ=None, else: mask = isnull(values) + # weights + values = weightby.weight(values, weights) dtype = values.dtype dtype_ok = _na_ok_dtype(dtype) @@ -267,13 +291,16 @@ def nanall(values, axis=None, skipna=True): @disallow('M8') @bottleneck_switch(zero_value=0) -def nansum(values, axis=None, skipna=True): - values, mask, dtype, dtype_max = 
_get_values(values, skipna, 0) +def nansum(values, axis=None, skipna=True, weights=None): + values, mask, dtype, dtype_max = _get_values(values, skipna, + 0, weights=weights, + axis=axis) dtype_sum = dtype_max if is_float_dtype(dtype): dtype_sum = dtype elif is_timedelta64_dtype(dtype): dtype_sum = np.float64 + the_sum = values.sum(axis, dtype=dtype_sum) the_sum = _maybe_null_out(the_sum, axis, mask) @@ -282,8 +309,10 @@ def nansum(values, axis=None, skipna=True): @disallow('M8') @bottleneck_switch() -def nanmean(values, axis=None, skipna=True): - values, mask, dtype, dtype_max = _get_values(values, skipna, 0) +def nanmean(values, axis=None, skipna=True, weights=None): + values, mask, dtype, dtype_max = _get_values(values, skipna, + 0, weights=weights, + axis=axis) dtype_sum = dtype_max dtype_count = np.float64 @@ -368,14 +397,14 @@ def _get_counts_nanvar(mask, axis, ddof, dtype=float): @disallow('M8') @bottleneck_switch(ddof=1) -def nanstd(values, axis=None, skipna=True, ddof=1): +def nanstd(values, axis=None, skipna=True, ddof=1, weights=None): result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof)) return _wrap_results(result, values.dtype) @disallow('M8') @bottleneck_switch(ddof=1) -def nanvar(values, axis=None, skipna=True, ddof=1): +def nanvar(values, axis=None, skipna=True, ddof=1, weights=None): dtype = values.dtype mask = isnull(values) @@ -414,7 +443,7 @@ def nanvar(values, axis=None, skipna=True, ddof=1): @disallow('M8', 'm8') -def nansem(values, axis=None, skipna=True, ddof=1): +def nansem(values, axis=None, skipna=True, ddof=1, weights=None): var = nanvar(values, axis, skipna, ddof=ddof) mask = isnull(values) @@ -476,7 +505,7 @@ def nanargmin(values, axis=None, skipna=True): @disallow('M8', 'm8') -def nanskew(values, axis=None, skipna=True): +def nanskew(values, axis=None, skipna=True, weights=None): """ Compute the sample skewness. 
The statistic computed here is the adjusted Fisher-Pearson standardized @@ -531,7 +560,7 @@ def nanskew(values, axis=None, skipna=True): @disallow('M8', 'm8') -def nankurt(values, axis=None, skipna=True): +def nankurt(values, axis=None, skipna=True, weights=None): """ Compute the sample skewness. The statistic computed here is the adjusted Fisher-Pearson standardized diff --git a/pandas/core/panel.py b/pandas/core/panel.py index f708774dd84ff..8685b51083859 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -1101,10 +1101,13 @@ def _apply_2d(self, func, axis): return self._construct_return_type(dict(results)) def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, - filter_type=None, **kwds): + weights=None, filter_type=None, **kwds): if numeric_only: raise NotImplementedError('Panel.{0} does not implement ' 'numeric_only.'.format(name)) + if weights is not None: + raise NotImplementedError('Panel.{0} does not implement ' + 'weights.'.format(name)) axis_name = self._get_axis_name(axis) axis_number = self._get_axis_number(axis_name) diff --git a/pandas/core/series.py b/pandas/core/series.py index f656d72296e3a..2075270f79f61 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2299,8 +2299,8 @@ def apply(self, func, convert_dtype=True, args=(), **kwds): return self._constructor(mapped, index=self.index).__finalize__(self) - def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, - filter_type=None, **kwds): + def _reduce(self, op, name, axis=0, skipna=True, weights=None, + numeric_only=None, filter_type=None, **kwds): """ perform a reduction operation @@ -2308,6 +2308,11 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, otherwise delegate to the object """ + if weights is not None: + from pandas.tools import weightby + _, weights = weightby.weightby(self, weights=weights, axis=axis) + kwds['weights'] = weights + delegate = self._values if isinstance(delegate, np.ndarray): # Validate that 'axis' 
is consistent with Series's single axis. @@ -2315,6 +2320,7 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, if numeric_only: raise NotImplementedError('Series.{0} does not implement ' 'numeric_only.'.format(name)) + with np.errstate(all='ignore'): return op(delegate, skipna=skipna, **kwds) diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index d6bc892921c42..d2842ae80088a 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -325,7 +325,7 @@ def __array_finalize__(self, obj): self.fill_value = getattr(obj, 'fill_value', None) def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, - filter_type=None, **kwds): + weights=None, filter_type=None, **kwds): """ perform a reduction operation """ return op(self.get_values(), skipna=skipna, **kwds) diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py index 3500ce913462a..ba8f5bbfbaf9f 100644 --- a/pandas/tests/test_generic.py +++ b/pandas/tests/test_generic.py @@ -449,60 +449,6 @@ def test_sample(self): self.assertTrue(len(o.sample(frac=0.34) == 3)) self.assertTrue(len(o.sample(frac=0.36) == 4)) - ### - # Check weights - ### - - # Weight length must be right - with tm.assertRaises(ValueError): - o.sample(n=3, weights=[0, 1]) - - with tm.assertRaises(ValueError): - bad_weights = [0.5] * 11 - o.sample(n=3, weights=bad_weights) - - with tm.assertRaises(ValueError): - bad_weight_series = Series([0, 0, 0.2]) - o.sample(n=4, weights=bad_weight_series) - - # Check won't accept negative weights - with tm.assertRaises(ValueError): - bad_weights = [-0.1] * 10 - o.sample(n=3, weights=bad_weights) - - # Check inf and -inf throw errors: - with tm.assertRaises(ValueError): - weights_with_inf = [0.1] * 10 - weights_with_inf[0] = np.inf - o.sample(n=3, weights=weights_with_inf) - - with tm.assertRaises(ValueError): - weights_with_ninf = [0.1] * 10 - weights_with_ninf[0] = -np.inf - o.sample(n=3, weights=weights_with_ninf) - - # All zeros raises errors - 
zero_weights = [0] * 10 - with tm.assertRaises(ValueError): - o.sample(n=3, weights=zero_weights) - - # All missing weights - nan_weights = [np.nan] * 10 - with tm.assertRaises(ValueError): - o.sample(n=3, weights=nan_weights) - - # Check np.nan are replaced by zeros. - weights_with_nan = [np.nan] * 10 - weights_with_nan[5] = 0.5 - self._compare( - o.sample(n=1, axis=0, weights=weights_with_nan), o.iloc[5:6]) - - # Check None are also replaced by zeros. - weights_with_None = [None] * 10 - weights_with_None[5] = 0.5 - self._compare( - o.sample(n=1, axis=0, weights=weights_with_None), o.iloc[5:6]) - def test_size_compat(self): # GH8846 # size property should be defined @@ -1579,123 +1525,6 @@ def tester(self): class TestNDFrame(tm.TestCase): # tests that don't fit elsewhere - def test_sample(sel): - # Fixes issue: 2419 - # additional specific object based tests - - # A few dataframe test with degenerate weights. - easy_weight_list = [0] * 10 - easy_weight_list[5] = 1 - - df = pd.DataFrame({'col1': range(10, 20), - 'col2': range(20, 30), - 'colString': ['a'] * 10, - 'easyweights': easy_weight_list}) - sample1 = df.sample(n=1, weights='easyweights') - assert_frame_equal(sample1, df.iloc[5:6]) - - # Ensure proper error if string given as weight for Series, panel, or - # DataFrame with axis = 1. - s = Series(range(10)) - with tm.assertRaises(ValueError): - s.sample(n=3, weights='weight_column') - - panel = pd.Panel(items=[0, 1, 2], major_axis=[2, 3, 4], - minor_axis=[3, 4, 5]) - with tm.assertRaises(ValueError): - panel.sample(n=1, weights='weight_column') - - with tm.assertRaises(ValueError): - df.sample(n=1, weights='weight_column', axis=1) - - # Check weighting key error - with tm.assertRaises(KeyError): - df.sample(n=3, weights='not_a_real_column_name') - - # Check that re-normalizes weights that don't sum to one. 
- weights_less_than_1 = [0] * 10 - weights_less_than_1[0] = 0.5 - tm.assert_frame_equal( - df.sample(n=1, weights=weights_less_than_1), df.iloc[:1]) - - ### - # Test axis argument - ### - - # Test axis argument - df = pd.DataFrame({'col1': range(10), 'col2': ['a'] * 10}) - second_column_weight = [0, 1] - assert_frame_equal( - df.sample(n=1, axis=1, weights=second_column_weight), df[['col2']]) - - # Different axis arg types - assert_frame_equal(df.sample(n=1, axis='columns', - weights=second_column_weight), - df[['col2']]) - - weight = [0] * 10 - weight[5] = 0.5 - assert_frame_equal(df.sample(n=1, axis='rows', weights=weight), - df.iloc[5:6]) - assert_frame_equal(df.sample(n=1, axis='index', weights=weight), - df.iloc[5:6]) - - # Check out of range axis values - with tm.assertRaises(ValueError): - df.sample(n=1, axis=2) - - with tm.assertRaises(ValueError): - df.sample(n=1, axis='not_a_name') - - with tm.assertRaises(ValueError): - s = pd.Series(range(10)) - s.sample(n=1, axis=1) - - # Test weight length compared to correct axis - with tm.assertRaises(ValueError): - df.sample(n=1, axis=1, weights=[0.5] * 10) - - # Check weights with axis = 1 - easy_weight_list = [0] * 3 - easy_weight_list[2] = 1 - - df = pd.DataFrame({'col1': range(10, 20), - 'col2': range(20, 30), - 'colString': ['a'] * 10}) - sample1 = df.sample(n=1, axis=1, weights=easy_weight_list) - assert_frame_equal(sample1, df[['colString']]) - - # Test default axes - p = pd.Panel(items=['a', 'b', 'c'], major_axis=[2, 4, 6], - minor_axis=[1, 3, 5]) - assert_panel_equal( - p.sample(n=3, random_state=42), p.sample(n=3, axis=1, - random_state=42)) - assert_frame_equal( - df.sample(n=3, random_state=42), df.sample(n=3, axis=0, - random_state=42)) - - # Test that function aligns weights with frame - df = DataFrame( - {'col1': [5, 6, 7], - 'col2': ['a', 'b', 'c'], }, index=[9, 5, 3]) - s = Series([1, 0, 0], index=[3, 5, 9]) - assert_frame_equal(df.loc[[3]], df.sample(1, weights=s)) - - # Weights have index values 
to be dropped because not in - # sampled DataFrame - s2 = Series([0.001, 0, 10000], index=[3, 5, 10]) - assert_frame_equal(df.loc[[3]], df.sample(1, weights=s2)) - - # Weights have empty values to be filed with zeros - s3 = Series([0.01, 0], index=[3, 5]) - assert_frame_equal(df.loc[[3]], df.sample(1, weights=s3)) - - # No overlap in weight and sampled DataFrame indices - s4 = Series([1, 0], index=[1, 2]) - with tm.assertRaises(ValueError): - df.sample(1, weights=s4) - def test_squeeze(self): # noop for s in [tm.makeFloatSeries(), tm.makeStringSeries(), diff --git a/pandas/tools/tests/test_weightby.py b/pandas/tools/tests/test_weightby.py new file mode 100644 index 0000000000000..34aa6399e2cbc --- /dev/null +++ b/pandas/tools/tests/test_weightby.py @@ -0,0 +1,246 @@ +import numpy as np +import pandas as pd + +from pandas import DataFrame, Series +from pandas.util import testing as tm +from pandas.core import common as com + + +class TestWeightsby(tm.TestCase): + + def setUp(self): + self.df = DataFrame({'A': [0.25, 0.25, 0.25, 0.25], + 'B': [1, 2, 3, 4]}) + self.df2 = DataFrame({'A': [1, 2, 3, 4], + 'B': [1, 2, 3, 4]}) + self.df3 = DataFrame({'A': [1, 2, 3, 4], + 'B': [1, 2, 3, 4], + 'C': [1, 1, 2, 2]}) + + @property + def rs(self): + # always return the same starting random state object + return com._random_state(1234) + + def test_basic(self): + + for f in ['sum', 'mean']: + weights = (self.df[['A']] / self.df.A.sum()).values + result = getattr(self.df, f)(weights='A') + expected = getattr(self.df[['B']] * weights, f)() + tm.assert_series_equal(result, expected) + + weights2 = (self.df2[['A']] / self.df2.A.sum()).values + result = getattr(self.df2, f)(weights='A') + expected = getattr(self.df2[['B']] * weights2, f)() + tm.assert_series_equal(result, expected) + + for f in ['kurt', 'skew', 'sem']: + weights = (self.df[['A']] / self.df.A.sum()).values + result = getattr(self.df, f)(weights='A') + expected = getattr(self.df[['B']] * weights, f)() + # 
tm.assert_series_equal(result, expected) + + weights2 = (self.df2[['A']] / self.df2.A.sum()).values + result = getattr(self.df2, f)(weights='A') + expected = getattr(self.df2[['B']] * weights2, f)() + # tm.assert_series_equal(result, expected) + + for f in ['std', 'var']: + + weights = (self.df[['A']] / self.df.A.sum()).values + result = getattr(self.df, f)(weights='A', ddof=2) + expected = getattr(self.df[['B']] * weights, f)(ddof=2) + # tm.assert_series_equal(result, expected) + + weights2 = (self.df2[['A']] / self.df2.A.sum()).values + result = getattr(self.df2, f)(weights='A', ddof=2) + expected = getattr(self.df2[['B']] * weights2, f)(ddof=2) + # tm.assert_series_equal(result, expected) + + def test_groupby(self): + + for f in ['mean', 'sum']: + + weights = (self.df3['A'] / self.df3.A.sum()).values + result = getattr(self.df3.groupby('C'), f)(weights='A') + adj = self.df3.assign(A=self.df3.A * weights, + B=self.df3.B * weights) + expected = getattr(adj.groupby('C'), f)() + tm.assert_frame_equal(result, expected) + + weights = (self.df3['A'] / self.df3.A.sum()).values + result = getattr(self.df3.groupby('C').B, f)(weights='A') + adj = self.df3.assign(B=self.df3.B * weights) + expected = getattr(adj.groupby('C').B, f)() + tm.assert_series_equal(result, expected) + + def test_unsupported(self): + for f in ['first', 'median', 'min', 'max', 'prod']: + + def func(): + getattr(self.df, f)(weights='A') + self.assertRaises(TypeError, func) + + def test_panel_unsupported(self): + panel = pd.Panel(items=[0, 1, 2], major_axis=[2, 3, 4], + minor_axis=[3, 4, 5]) + with tm.assertRaises(NotImplementedError): + panel.sum(weights='weight_column') + + def test_weights_validation(self): + o = DataFrame(np.random.randn(10, 10)) + + # Weight length must be right + with tm.assertRaises(ValueError): + o.sample(n=3, random_state=self.rs, weights=[0, 1]) + + with tm.assertRaises(ValueError): + bad_weights = [0.5] * 11 + o.sample(n=3, random_state=self.rs, weights=bad_weights) + + # 
Check won't accept negative weights + with tm.assertRaises(ValueError): + bad_weights = [-0.1] * 10 + o.sample(n=3, random_state=self.rs, weights=bad_weights) + + # Check inf and -inf throw errors: + with tm.assertRaises(ValueError): + weights_with_inf = [0.1] * 10 + weights_with_inf[0] = np.inf + o.sample(n=3, random_state=self.rs, weights=weights_with_inf) + + with tm.assertRaises(ValueError): + weights_with_ninf = [0.1] * 10 + weights_with_ninf[0] = -np.inf + o.sample(n=3, random_state=self.rs, weights=weights_with_ninf) + + # All zeros raises errors + zero_weights = [0] * 10 + with tm.assertRaises(ValueError): + o.sample(n=3, random_state=self.rs, weights=zero_weights) + + # All missing weights + nan_weights = [np.nan] * 10 + with tm.assertRaises(ValueError): + o.sample(n=3, random_state=self.rs, weights=nan_weights) + + # Check np.nan are replaced by zeros. + weights_with_nan = [np.nan] * 10 + weights_with_nan[5] = 0.5 + result = o.sample(n=1, random_state=self.rs, weights=weights_with_nan) + expected = o.iloc[5:6] + tm.assert_frame_equal(result, expected) + + # Check None are also replaced by zeros. + weights_with_None = [None] * 10 + weights_with_None[5] = 0.5 + result = o.sample(n=1, random_state=self.rs, weights=weights_with_None) + expected = o.iloc[5:6] + tm.assert_frame_equal(result, expected) + + def test_weights_strings(self): + # Fixes issue: 2419 + # additional specific object based tests + + # A few dataframe test with degenerate weights. + easy_weight_list = [0] * 10 + easy_weight_list[5] = 1 + + df = pd.DataFrame({'col1': range(10, 20), + 'col2': range(20, 30), + 'colString': ['a'] * 10, + 'easyweights': easy_weight_list}) + result = df.sample(n=1, random_state=self.rs, weights='easyweights') + expected = df[['col1', 'col2', 'colString']].iloc[5:6] + tm.assert_frame_equal(result, expected) + + # Ensure proper error if string given as weight for Series, panel, or + # DataFrame with axis = 1. 
+ s = Series(range(10)) + with tm.assertRaises(ValueError): + s.sample(n=3, random_state=self.rs, weights='weight_column') + + with tm.assertRaises(ValueError): + df.sample(n=1, random_state=self.rs, + weights='weight_column', axis=1) + + # Check weighting key error + with tm.assertRaises(KeyError): + df.sample(n=3, random_state=self.rs, + weights='not_a_real_column_name') + + # Check that re-normalizes weights that don't sum to one. + weights_less_than_1 = [0] * 10 + weights_less_than_1[0] = 0.5 + result = df.sample(n=1, random_state=self.rs, + weights=weights_less_than_1) + expected = df.iloc[[0]] + tm.assert_frame_equal(result, expected) + + def test_weights_axis(self): + + # Test axis argument + df = pd.DataFrame({'col1': range(10), 'col2': ['a'] * 10}) + second_column_weight = [0, 1] + result = df.sample(n=1, random_state=self.rs, + weights=second_column_weight, axis=1) + tm.assert_frame_equal(result, df[['col2']]) + + # Different axis arg types + result = df.sample(n=1, random_state=self.rs, + weights=second_column_weight, axis='columns') + tm.assert_frame_equal(result, df[['col2']]) + + weight = [0] * 10 + weight[5] = 0.5 + result = df.sample(n=1, random_state=self.rs, + weights=weight, axis='index') + expected = df.iloc[5:6] + tm.assert_frame_equal(result, expected) + + # Test weight length compared to correct axis + with tm.assertRaises(ValueError): + df.sample(n=1, random_state=self.rs, weights=[0.5] * 10, axis=1) + + # Check weights with axis = 1 + easy_weight_list = [0] * 3 + easy_weight_list[2] = 1 + + df = pd.DataFrame({'col1': range(10, 20), + 'col2': range(20, 30), + 'colString': ['a'] * 10}) + result = df.sample(n=1, random_state=self.rs, + weights=easy_weight_list, axis=1) + expected = df[['colString']] + tm.assert_frame_equal(result, expected) + + # Test that function aligns weights with frame + df = DataFrame( + {'col1': [5, 6, 7], + 'col2': ['a', 'b', 'c'], }, index=[9, 5, 3]) + s = Series([1, 0, 0], index=[3, 5, 9]) + result = df.sample(1, 
random_state=self.rs, weights=s) + tm.assert_frame_equal(result, df.loc[[3]]) + + # Weights have index values to be dropped because not in + # sampled DataFrame + s2 = Series([0.001, 0, 10000], index=[3, 5, 10]) + result = df.sample(1, random_state=self.rs, weights=s2) + tm.assert_frame_equal(result, df.loc[[3]]) + + # Weights have empty values to be filled with zeros + s3 = Series([0.01, 0], index=[3, 5]) + result = df.sample(1, random_state=self.rs, weights=s3) + tm.assert_frame_equal(result, df.loc[[3]]) + + # No overlap in weight and sampled DataFrame indices + s4 = Series([1, 0], index=[1, 2]) + with tm.assertRaises(ValueError): + df.sample(1, random_state=self.rs, weights=s4) + + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tools/weightby.py b/pandas/tools/weightby.py new file mode 100644 index 0000000000000..3022d2af0ab15 --- /dev/null +++ b/pandas/tools/weightby.py @@ -0,0 +1,123 @@ +""" +functions to compute weighting Indexes +""" + +import numpy as np +from pandas.types.generic import ABCSeries, ABCDataFrame +from pandas.compat import string_types +from pandas.util.decorators import Substitution + +_shared_docs = {} +_shared_docs['weights'] = """weights : str or ndarray-like, optional + Default 'None' results in equal probability weighting. + + If passed a Series, will align with target object on index. + Index values in weights not found in the target object + will be ignored and index values in the target object + not in weights will be assigned weights of zero. + + If called on a DataFrame, will accept the name of a column + when axis = 0. + + Unless weights are a Series, weights must be same length + as axis of the target object. + + If weights do not sum to 1, they will be normalized to sum to 1. + + Missing values in the weights column will be treated as zero. 
+ inf and -inf values not allowed.""" + + +@Substitution(weights=_shared_docs['weights']) +def weightby(obj, weights=None, axis=0): + """returns a weights Series for the specified weights + +Parameters +---------- +obj : Series/DataFrame +%(weights)s +axis : {0 (index), 1 (columns)} + axis to compute weights on obj + +Returns +------- +tuple of (obj, ndarray of weights, like indexed to obj)""" + + # If a series, align with frame + if isinstance(weights, ABCSeries): + weights = weights.reindex(obj.axes[axis]) + + # Strings acceptable if a dataframe and axis = 0 + if isinstance(weights, string_types): + + # we use self.obj as we may have a selection here + if isinstance(obj, ABCDataFrame): + if axis == 0: + try: + w, weights = weights, obj[weights] + + # remove the weights column from obj + obj = obj.drop([w], axis=1) + except KeyError: + raise KeyError("String passed to weights is not a " + "valid column") + else: + raise ValueError("Strings can only be passed to " + "weights when weighting by the rows on " + "a DataFrame") + else: + raise ValueError("Strings cannot be passed as weights " + "when weighting from a Series or Panel.") + + from pandas import Series + weights = Series(weights, dtype='float64') + + if len(weights) != len(obj.axes[axis]): + raise ValueError("Weights and axis to be must be of " + "same length") + + if (weights == np.inf).any() or (weights == -np.inf).any(): + raise ValueError("weight vector may not include `inf` values") + + if (weights < 0).any(): + raise ValueError("weight vector many not include negative " + "values") + + # If has nan, set to zero. 
+ weights = weights.fillna(0) + + # Renormalize if don't sum to 1 + if weights.sum() != 1: + if weights.sum() != 0: + weights = weights / weights.sum() + else: + raise ValueError("Invalid weights: weights sum to zero") + + return obj, weights.values + + +def weight(values, weights): + """ + Return the values * weights, broadcasting if needed + + Parameters + ---------- + values : ndarray + weights : 1d-ndarray + + Returns + ------- + values shaped ndarray + """ + + if weights is None: + return values + + if values.ndim == 1: + return values * weights + + elif values.ndim == 2: + + return values * weights + + raise NotImplementedError