Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions doc/source/whatsnew/v3.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ Other enhancements
^^^^^^^^^^^^^^^^^^
- :meth:`.DataFrameGroupBy.agg` now allows for the provided ``func`` to return a NumPy array (:issue:`63957`)
- Added :meth:`ExtensionArray.count` (:issue:`64450`)
- Added ``observed`` parameter to :func:`crosstab` (:issue:`53521`)
- Display formatting for float sequences in DataFrame cells now respects the ``display.precision`` option (:issue:`60503`)
- Improved the precision of float parsing in :func:`read_csv` (:issue:`64395`)
- Improved the string ``repr`` of :class:`pd.core.arrays.SparseArray` (:issue:`64547`)
Expand Down Expand Up @@ -101,6 +102,7 @@ Deprecations
- Deprecated automatic dtype promotion when reindexing with a ``fill_value`` that cannot be held by the original dtype. Explicitly cast to a common dtype instead (:issue:`53910`)
- Deprecated passing unnecessary ``*args`` and ``**kwargs`` to :meth:`.GroupBy.cumsum`, :meth:`.GroupBy.cumprod`, :meth:`.GroupBy.cummin`, :meth:`.GroupBy.cummax`, :meth:`.SeriesGroupBy.skew`, :meth:`.DataFrameGroupBy.skew`, :meth:`.SeriesGroupBy.take`, and :meth:`.DataFrameGroupBy.take`. The ``skipna`` parameter for the cum* methods is now an explicit keyword argument (:issue:`50407`)
- Deprecated the ``.name`` property of offset objects (e.g., :class:`~pandas.tseries.offsets.Day`, :class:`~pandas.tseries.offsets.Hour`). Use ``.rule_code`` instead (:issue:`64207`)
- Deprecated the ``dropna`` keyword in :func:`pivot_table` and :func:`crosstab`. Manually handle NA values before and after calling these functions instead (:issue:`53521`)
- Deprecated the ``xlrd`` and ``pyxlsb`` engines in :func:`read_excel`. Use ``engine="calamine"`` instead (:issue:`56542`)
- Deprecated the default value of ``exact`` in :func:`assert_index_equal`; in a future version this will default to ``True`` instead of ``"equiv"`` (:issue:`57436`)
-
Expand Down Expand Up @@ -223,7 +225,9 @@ Groupby/resample/rolling

Reshaping
^^^^^^^^^
- Bug in :func:`crosstab` where ``observed`` could not be specified separately and was instead always set to the value of the ``dropna`` keyword (:issue:`53521`)
- Bug in :func:`merge` where merging on a :class:`MultiIndex` containing ``NaN`` values mapped ``NaN`` keys to the last level value instead of ``NaN`` (:issue:`64492`)
- Bug in :func:`pivot_table` where the ``observed`` parameter was ignored during margins computation (:issue:`53521`)
- In :func:`pivot_table`, when ``values`` is empty, the aggregation will be computed on a Series of all NA values (:issue:`46475`)
-

Expand Down
26 changes: 11 additions & 15 deletions pandas/_libs/hashtable_class_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -757,11 +757,10 @@ cdef class {{name}}HashTable(HashTable):
append_data_uint8(rmd, 1)
continue

k = kh_get_{{dtype}}(self.table, val)
k = kh_put_{{dtype}}(self.table, val, &ret)

if k == self.table.n_buckets:
if ret != 0:
# k hasn't been seen yet
k = kh_put_{{dtype}}(self.table, val, &ret)

if needs_resize(ud.size, ud.capacity):
with gil:
Expand Down Expand Up @@ -899,12 +898,8 @@ cdef class {{name}}HashTable(HashTable):
labels[i] = -1
continue

k = kh_get_{{dtype}}(self.table, val)
if k != self.table.n_buckets:
idx = self.table.vals[k]
labels[i] = idx
else:
k = kh_put_{{dtype}}(self.table, val, &ret)
k = kh_put_{{dtype}}(self.table, val, &ret)
if ret != 0:
self.table.vals[k] = count

if needs_resize(ud.size, ud.capacity):
Expand All @@ -913,6 +908,9 @@ cdef class {{name}}HashTable(HashTable):
append_data_{{dtype}}(ud, val)
labels[i] = count
count += 1
else:
idx = self.table.vals[k]
labels[i] = idx

arr_uniques = uniques.to_array()

Expand Down Expand Up @@ -1221,10 +1219,9 @@ cdef class StringHashTable(HashTable):
continue

v = vecs[i]
k = kh_get_str(self.table, v)
if k == self.table.n_buckets:
k = kh_put_str(self.table, v, &ret)
if ret != 0:
# k hasn't been seen yet
k = kh_put_str(self.table, v, &ret)
uindexer[count] = i
if return_inverse:
self.table.vals[k] = count
Expand Down Expand Up @@ -1494,10 +1491,9 @@ cdef class PyObjectHashTable(HashTable):
labels[i] = na_sentinel
continue

k = kh_get_pymap(self.table, <PyObject*>val)
if k == self.table.n_buckets:
k = kh_put_pymap(self.table, <PyObject*>val, &ret)
if ret != 0:
# k hasn't been seen yet
k = kh_put_pymap(self.table, <PyObject*>val, &ret)
uniques.append(val)
if return_inverse:
self.table.vals[k] = count
Expand Down
31 changes: 14 additions & 17 deletions pandas/_libs/hashtable_func_helper.pxi.in
Original file line number Diff line number Diff line change
Expand Up @@ -61,18 +61,17 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8
if uses_mask:
raise NotImplementedError("uses_mask not implemented with object dtype")

kh_resize_{{ttype}}(table, n // 10)
kh_resize_{{ttype}}(table, n)

for i in range(n):
val = values[i]
if not dropna or not checknull(val):
k = kh_get_{{ttype}}(table, {{to_c_type}}val)
if k != table.n_buckets:
table.vals[k] += 1
else:
k = kh_put_{{ttype}}(table, {{to_c_type}}val, &ret)
k = kh_put_{{ttype}}(table, {{to_c_type}}val, &ret)
if ret != 0:
table.vals[k] = 1
result_keys.append(val)
else:
table.vals[k] += 1
{{else}}
kh_resize_{{ttype}}(table, n)

Expand All @@ -90,13 +89,12 @@ cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8
if uses_mask and isna_entry:
na_counter += 1
else:
k = kh_get_{{ttype}}(table, val)
if k != table.n_buckets:
table.vals[k] += 1
else:
k = kh_put_{{ttype}}(table, val, &ret)
k = kh_put_{{ttype}}(table, val, &ret)
if ret != 0:
table.vals[k] = 1
result_keys.append(val)
else:
table.vals[k] += 1
{{endif}}

# collect counts in the order corresponding to result_keys:
Expand Down Expand Up @@ -193,14 +191,13 @@ cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', cons

else:
value = {{to_c_type}}(values[i])
k = kh_get_{{ttype}}(table, value)
if k != table.n_buckets:
out[table.vals[k]] = 1
out[i] = 1
else:
k = kh_put_{{ttype}}(table, value, &ret)
k = kh_put_{{ttype}}(table, value, &ret)
if ret != 0:
table.vals[k] = i
out[i] = 0
else:
out[table.vals[k]] = 1
out[i] = 1

kh_destroy_{{ttype}}(table)
return out
Expand Down
6 changes: 5 additions & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -12806,7 +12806,7 @@ def pivot_table(
aggfunc: AggFuncType = "mean",
fill_value=None,
margins: bool = False,
dropna: bool = True,
dropna: bool | lib.NoDefault = lib.no_default,
margins_name: Level = "All",
observed: bool = True,
sort: bool = True,
Expand Down Expand Up @@ -12854,6 +12854,10 @@ def pivot_table(
* index/column keys containing NA values will be dropped (see ``dropna``
parameter in :meth:`DataFrame.groupby`).

.. deprecated:: 3.1.0
The dropna keyword is deprecated. Manually handle NA values
before and after calling pivot_table.

margins_name : str, default 'All'
Name of the row / column that will contain the totals
when margins is True.
Expand Down
48 changes: 44 additions & 4 deletions pandas/core/reshape/pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,14 @@
Literal,
cast,
)
import warnings

import numpy as np

from pandas._libs import lib
from pandas.errors import Pandas4Warning
from pandas.util._decorators import set_module
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.cast import maybe_downcast_to_dtype
from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -64,7 +67,7 @@ def pivot_table(
aggfunc: AggFuncType = "mean",
fill_value=None,
margins: bool = False,
dropna: bool = True,
dropna: bool | lib.NoDefault = lib.no_default,
margins_name: Hashable = "All",
observed: bool = True,
sort: bool = True,
Expand Down Expand Up @@ -113,6 +116,10 @@ def pivot_table(
* index/column keys containing NA values will be dropped (see ``dropna``
parameter in :meth:`DataFrame.groupby`).

.. deprecated:: 3.1.0
The dropna keyword is deprecated. Manually handle NA values
before and after calling pivot_table.

margins_name : str, default 'All'
Name of the row / column that will contain the totals
when margins is True.
Expand Down Expand Up @@ -246,6 +253,17 @@ def pivot_table(
foo large 2.000000 5 4.500000 4
small 2.333333 6 4.333333 2
"""
if dropna is not lib.no_default:
warnings.warn(
"The dropna keyword in pivot_table is deprecated and will be "
"removed in a future version. Manually handle NA values before "
"and after calling pivot_table.",
Pandas4Warning,
stacklevel=find_stack_level(),
)
else:
dropna = True

index = _convert_by(index)
columns = _convert_by(columns)

Expand Down Expand Up @@ -407,7 +425,7 @@ def __internal_pivot_table(
cols=columns,
aggfunc=aggfunc,
kwargs=kwargs,
observed=dropna,
observed=observed,
margins_name=margins_name,
fill_value=fill_value,
dropna=dropna,
Expand Down Expand Up @@ -960,8 +978,9 @@ def crosstab(
aggfunc=None,
margins: bool = False,
margins_name: Hashable = "All",
dropna: bool = True,
dropna: bool | lib.NoDefault = lib.no_default,
normalize: bool | Literal[0, 1, "all", "index", "columns"] = False,
observed: bool = True,
) -> DataFrame:
"""
Compute a simple cross tabulation of two (or more) factors.
Expand Down Expand Up @@ -991,6 +1010,11 @@ def crosstab(
when margins is True.
dropna : bool, default True
Do not include columns whose entries are all NaN.

.. deprecated:: 3.1.0
The dropna keyword is deprecated. Manually handle NA values
before and after calling crosstab.

normalize : bool, {'all', 'index', 'columns'}, or {0,1}, default False
Normalize by dividing all values by the sum of values.

Expand All @@ -999,6 +1023,13 @@ def crosstab(
- If passed 'columns' will normalize over each column.
- If margins is `True`, will also normalize margin values.

observed : bool, default True
This only applies if any of the groupers are Categoricals.
If True: only show observed values for categorical groupers.
If False: show all values for categorical groupers.

.. versionadded:: 3.1.0

Returns
-------
DataFrame
Expand Down Expand Up @@ -1098,6 +1129,15 @@ def crosstab(
b 0 1 0
c 0 0 0
"""
if dropna is not lib.no_default:
warnings.warn(
"The dropna keyword in crosstab is deprecated and will be "
"removed in a future version. Manually handle NA values before "
"and after calling crosstab.",
Pandas4Warning,
stacklevel=find_stack_level(),
)

if values is None and aggfunc is not None:
raise ValueError("aggfunc cannot be used without values.")

Expand Down Expand Up @@ -1149,7 +1189,7 @@ def crosstab(
margins=margins,
margins_name=margins_name,
dropna=dropna,
observed=dropna,
observed=observed,
**kwargs, # type: ignore[arg-type]
)

Expand Down
Loading
Loading