diff --git a/.gitignore b/.gitignore index 92a7e4d3edbf6..d1567afef699b 100644 --- a/.gitignore +++ b/.gitignore @@ -88,3 +88,5 @@ doc/source/vbench doc/source/vbench.rst doc/source/index.rst doc/build/html/index.html +# Windows specific leftover: +doc/tmp.sv diff --git a/doc/source/api.rst b/doc/source/api.rst index c3cccca3251e4..081b4ff0d3686 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -429,7 +429,7 @@ Time series-related Series.tz_localize String handling -~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~ ``Series.str`` can be used to access the values of the series as strings and apply several methods to it. Due to implementation details the methods show up here as methods of the @@ -468,6 +468,54 @@ details the methods show up here as methods of the StringMethods.upper StringMethods.get_dummies +.. _api.categorical: + +Categorical +~~~~~~~~~~~ + +.. currentmodule:: pandas.core.categorical + +If the Series is of dtype ``category``, ``Series.cat`` can be used to access the underlying +``Categorical``. This data type is similar to the otherwise underlying numpy array +and has the following usable methods and properties (all available as +``Series.cat.<method/property>``). + + +.. autosummary:: + :toctree: generated/ + + Categorical + Categorical.levels + Categorical.ordered + Categorical.reorder_levels + Categorical.remove_unused_levels + Categorical.min + Categorical.max + Categorical.mode + +To create compatibility with `pandas.Series` and `numpy` arrays, the following (non-API) methods +are also introduced. Apart from these methods, ``np.asarray(categorical)`` works by implementing the +array interface (`Categorical.__array__()`). Be aware, that this converts the +Categorical back to a numpy array, so levels and order information is not preserved! + +.. 
autosummary:: + :toctree: generated/ + + Categorical.from_array + Categorical.get_values + Categorical.copy + Categorical.dtype + Categorical.ndim + Categorical.sort + Categorical.describe + Categorical.equals + Categorical.unique + Categorical.order + Categorical.argsort + Categorical.fillna + Categorical.__array__ + + Plotting ~~~~~~~~ .. currentmodule:: pandas diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst new file mode 100644 index 0000000000000..900568eb08f25 --- /dev/null +++ b/doc/source/categorical.rst @@ -0,0 +1,706 @@ +.. _categorical: + +.. currentmodule:: pandas + +.. ipython:: python + :suppress: + + import numpy as np + import random + import os + np.random.seed(123456) + from pandas import options + import pandas as pd + np.set_printoptions(precision=4, suppress=True) + options.display.mpl_style='default' + options.display.max_rows=15 + + +*********** +Categorical +*********** + +.. versionadded:: 0.15 + +.. note:: + While there was a `pandas.Categorical` in earlier versions, the ability to use + `Categorical` data in `Series` and `DataFrame` is new. + + +This is a short introduction to pandas `Categorical` type, including a short comparison with R's +`factor`. + +`Categoricals` are a pandas data type, which correspond to categorical variables in +statistics: a variable, which can take on only a limited, and usually fixed, +number of possible values (commonly called `levels`). Examples are gender, social class, +blood types, country affiliations, observation time or ratings via Likert scales. + +In contrast to statistical categorical variables, a `Categorical` might have an order (e.g. +'strongly agree' vs 'agree' or 'first observation' vs. 'second observation'), but numerical +operations (additions, divisions, ...) are not possible. + +All values of the `Categorical` are either in `levels` or `np.nan`. Order is defined by +the order of the `levels`, not lexical order of the values. 
Internally, the data structure +consists of a levels array and an integer array of level_codes which point to the real value in the +levels array. + +`Categoricals` are useful in the following cases: + +* A string variable consisting of only a few different values. Converting such a string + variable to a categorical variable will save some memory. +* The lexical order of a variable is not the same as the logical order ("one", "two", "three"). + By converting to a categorical and specifying an order on the levels, sorting and + min/max will use the logical order instead of the lexical order. +* As a signal to other python libraries that this column should be treated as a categorical + variable (e.g. to use suitable statistical methods or plot types) + +See also the :ref:`API docs on Categoricals <api.categorical>`. + +Object Creation +--------------- + +Categorical `Series` or columns in a `DataFrame` can be created in several ways: + +By passing a `Categorical` object to a `Series` or assigning it to a `DataFrame`: + +.. ipython:: python + + raw_cat = pd.Categorical(["a","b","c","a"]) + s = pd.Series(raw_cat) + s + df = pd.DataFrame({"A":["a","b","c","a"]}) + df["B"] = raw_cat + df + +By converting an existing `Series` or column to a ``category`` type: + +.. ipython:: python + + df = pd.DataFrame({"A":["a","b","c","a"]}) + df["B"] = df["A"].astype('category') + df + +By using some special functions: + +.. ipython:: python + + df = pd.DataFrame({'value': np.random.randint(0, 100, 20)}) + labels = [ "{0} - {1}".format(i, i + 9) for i in range(0, 100, 10) ] + + df['group'] = pd.cut(df.value, range(0, 105, 10), right=False, labels=labels) + df.head(10) + + +`Categoricals` have a specific ``category`` :ref:`dtype <basics.dtypes>`: + +.. ipython:: python + + df.dtypes + +.. note:: + + In contrast to R's `factor` function, a `Categorical` is not converting input values to + string and levels will end up the same data type as the original values. + +.. 
note:: + + In contrast to R's `factor` function, there is currently no way to assign/change labels at + creation time. Use `levels` to change the levels after creation time. + +To get back to the original Series or `numpy` array, use ``Series.astype(original_dtype)`` or +``Categorical.get_values()``: + +.. ipython:: python + + s = pd.Series(["a","b","c","a"]) + s + s2 = s.astype('category') + s2 + s3 = s2.astype('string') + s3 + s2.cat.get_values() + +Working with levels +------------------- + +`Categoricals` have a `levels` property, which lists their possible values. If you don't +manually specify levels, they are inferred from the passed in values. `Series` of type +``category`` expose the same interface via their `cat` property. + +.. ipython:: python + + raw_cat = pd.Categorical(["a","b","c","a"]) + raw_cat.levels + raw_cat.ordered + # Series of type "category" also expose this interface via the .cat property: + s = pd.Series(raw_cat) + s.cat.levels + s.cat.ordered + +.. note:: + New `Categorical` are automatically ordered if the passed in values are sortable or a + `levels` argument is supplied. This is a difference to R's `factors`, which are unordered + unless explicitly told to be ordered (``ordered=TRUE``). + +It's also possible to pass in the levels in a specific order: + +.. ipython:: python + + raw_cat = pd.Categorical(["a","b","c","a"], levels=["c","b","a"]) + s = pd.Series(raw_cat) + s.cat.levels + s.cat.ordered + +.. note:: + + Passing in a `levels` argument implies ``ordered=True``. + +Any value omitted in the levels argument will be replaced by `np.nan`: + +.. ipython:: python + + raw_cat = pd.Categorical(["a","b","c","a"], levels=["a","b"]) + s = pd.Series(raw_cat) + s.cat.levels + s + +Renaming levels is done by assigning new values to the ``Categorical.levels`` or +``Series.cat.levels`` property: + +.. 
ipython:: python + + s = pd.Series(pd.Categorical(["a","b","c","a"])) + s + s.cat.levels = ["Group %s" % g for g in s.cat.levels] + s + s.cat.levels = [1,2,3] + s + +.. note:: + + In contrast to R's `factor` function, a `Categorical` can have levels of other types than + string. + +Levels must be unique or a `ValueError` is raised: + +.. ipython:: python + + try: + s.cat.levels = [1,1,1] + except ValueError as e: + print("ValueError: " + str(e)) + +Appending a level can be done by assigning a levels list longer than the current levels: + +.. ipython:: python + + s.cat.levels = [1,2,3,4] + s.cat.levels + s + + +Removing a level is also possible, but only the last level(s) can be removed by assigning a +shorter list than current levels. Values which are omitted are replaced by `np.nan`. + +.. ipython:: python + + s.cat.levels = [1,2] + s + +.. note:: + + It's only possible to remove or add a level at the last position. If that's not where you want + to remove an old or add a new level, use ``Categorical.reorder_levels(new_order)`` or + ``Series.cat.reorder_levels(new_order)`` methods before or after. + +Removing unused levels can also be done: + +.. ipython:: python + + raw = pd.Categorical(["a","b","a"], levels=["a","b","c","d"]) + c = pd.Series(raw) + raw + raw.remove_unused_levels() + raw + c.cat.remove_unused_levels() + c + +.. note:: + + In contrast to R's `factor` function, passing a `Categorical` as the sole input to the + `Categorical` constructor will *not* remove unused levels but create a new `Categorical` + which is equal to the passed in one! + + +Ordered or not... +----------------- + +If a `Categorical` is ordered (``cat.ordered == True``), then the order of the levels has a +meaning and certain operations are possible. If the categorical is unordered, +a `TypeError` is raised. + +.. 
ipython:: python + + s = pd.Series(pd.Categorical(["a","b","c","a"], ordered=False)) + try: + s.sort() + except TypeError as e: + print("TypeError: " + str(e)) + s = pd.Series(pd.Categorical(["a","b","c","a"], ordered=True)) + s.sort() + s + print(s.min(), s.max()) + +.. note:: + ``ordered=True`` is not strictly needed in the second case, as lists of strings are sortable + and so the resulting `Categorical` is ordered. + +Sorting will use the order defined by levels, not any lexical order present on the data type. +This is even true for strings and numeric data: + +.. ipython:: python + + s = pd.Series(pd.Categorical([1,2,3,1])) + s.cat.levels = [2,3,1] + s + s.sort() + s + print(s.min(), s.max()) + +Reordering the levels is possible via the ``Categorical.reorder_levels(new_levels)`` or +``Series.cat.reorder_levels(new_levels)`` methods: + +.. ipython:: python + + s2 = pd.Series(pd.Categorical([1,2,3,1])) + s2.cat.reorder_levels([2,3,1]) + s2 + s2.sort() + s2 + print(s2.min(), s2.max()) + + +.. note:: + Note the difference between assigning new level names and reordering the levels: the first + renames levels and therefore the individual values in the `Series`, but if the first + position was sorted last, the renamed value will still be sorted last. Reordering means that the + way values are sorted is different afterwards, but not that individual values in the + `Series` are changed. + + +Operations +---------- + +The following operations are possible with categorical data: + +Getting the minimum and maximum, if the categorical is ordered: + +.. ipython:: python + + s = pd.Series(pd.Categorical(["a","b","c","a"], levels=["c","a","b","d"])) + print(s.min(), s.max()) + +.. note:: + + If the `Categorical` is not ordered, ``Categorical.min()`` and ``Categorical.max()`` and the + corresponding operations on `Series` will raise `TypeError`. + +The mode: + +.. 
ipython:: python + + raw_cat = pd.Categorical(["a","b","c","c"], levels=["c","a","b","d"]) + s = pd.Series(raw_cat) + raw_cat.mode() + s.mode() + +.. note:: + + Numeric operations like ``+``, ``-``, ``*``, ``/`` and operations based on them (e.g. + ``Categorical.median()``, which would need to compute the mean between two values if the + length of an array is even) do not work and raise a `TypeError`. + +`Series` methods like `Series.value_counts()` will use all levels, even if some levels are not +present in the data: + +.. ipython:: python + + s = pd.Series(pd.Categorical(["a","b","c","c"], levels=["c","a","b","d"])) + s.value_counts() + +Groupby will also show "unused" levels: + +.. ipython:: python + + cats = pd.Categorical(["a","b","b","b","c","c","c"], levels=["a","b","c","d"]) + df = pd.DataFrame({"cats":cats,"values":[1,2,2,2,3,4,5]}) + df.groupby("cats").mean() + + cats2 = pd.Categorical(["a","a","b","b"], levels=["a","b","c"]) + df2 = pd.DataFrame({"cats":cats2,"B":["c","d","c","d"], "values":[1,2,3,4]}) + # This doesn't work yet with two columns -> see failing unittests + df2.groupby(["cats","B"]).mean() + + +Pivot tables: + +.. ipython:: python + + raw_cat = pd.Categorical(["a","a","b","b"], levels=["a","b","c"]) + df = pd.DataFrame({"A":raw_cat,"B":["c","d","c","d"], "values":[1,2,3,4]}) + pd.pivot_table(df, values='values', index=['A', 'B']) + +Data munging +------------ + +The optimized pandas data access methods ``.loc``, ``.iloc``, ``.ix``, ``.at``, and ``.iat`` +work as normal; the only difference is the return type (for getting) and +that only values already in the levels can be assigned. + +Getting +~~~~~~~ + +If the slicing operation returns either a `DataFrame` or a column of type `Series`, +the ``category`` dtype is preserved. + +.. 
ipython:: python + + cats = pd.Categorical(["a","b","b","b","c","c","c"], levels=["a","b","c"]) + idx = pd.Index(["h","i","j","k","l","m","n",]) + values= [1,2,2,2,3,4,5] + df = pd.DataFrame({"cats":cats,"values":values}, index=idx) + df.iloc[2:4,:] + df.iloc[2:4,:].dtypes + df.loc["h":"j","cats"] + df.ix["h":"j",0:1] + df[df["cats"] == "b"] + +An example where the `Categorical` is not preserved is if you take one single row: the +resulting `Series` is of dtype ``object``: + +.. ipython:: python + + # get the complete "h" row as a Series + df.loc["h", :] + +Returning a single item from a `Categorical` will also return the value, not a `Categorical` +of length "1". + +.. ipython:: python + + df.iat[0,0] + df["cats"].cat.levels = ["x","y","z"] + df.at["h","cats"] # returns a string + +.. note:: + Note that this is a difference to R's `factor` function, where ``factor(c(1,2,3))[1]`` + returns a single value `factor`. + +To get a single value `Series` of type ``category`` pass in a single value list: + +.. ipython:: python + + df.loc[["h"],"cats"] + +Setting +~~~~~~~ + +Setting values in a categorical column (or `Series`) works as long as the value is included in the +`levels`: + +.. ipython:: python + + cats = pd.Categorical(["a","a","a","a","a","a","a"], levels=["a","b"]) + idx = pd.Index(["h","i","j","k","l","m","n"]) + values = [1,1,1,1,1,1,1] + df = pd.DataFrame({"cats":cats,"values":values}, index=idx) + + df.iloc[2:4,:] = [["b",2],["b",2]] + df + try: + df.iloc[2:4,:] = [["c",3],["c",3]] + except ValueError as e: + print("ValueError: " + str(e)) + +Setting values by assigning a `Categorical` will also check that the `levels` match: + +.. ipython:: python + + df.loc["j":"k","cats"] = pd.Categorical(["a","a"], levels=["a","b"]) + df + try: + df.loc["j":"k","cats"] = pd.Categorical(["b","b"], levels=["a","b","c"]) + except ValueError as e: + print("ValueError: " + str(e)) + +Assigning a `Categorical` to parts of a column of other types will use the values: + +.. 
ipython:: python + + df = pd.DataFrame({"a":[1,1,1,1,1], "b":["a","a","a","a","a"]}) + df.loc[1:2,"a"] = pd.Categorical(["b","b"], levels=["a","b"]) + df.loc[2:3,"b"] = pd.Categorical(["b","b"], levels=["a","b"]) + df + df.dtypes + + +Merging +~~~~~~~ + +You can concat two `DataFrames` containing categorical data together, +but the levels of these `Categoricals` need to be the same: + +.. ipython:: python + + cat = pd.Categorical(["a","b"], levels=["a","b"]) + vals = [1,2] + df = pd.DataFrame({"cats":cat, "vals":vals}) + pd.concat([df,df]) + + df_different = df.copy() + df_different["cats"].cat.levels = ["a","b","c"] + + try: + pd.concat([df,df_different]) + except ValueError as e: + print("ValueError: " + str(e)) + +The same applies to ``df.append(df_different)``. + +Getting Data In/Out +------------------- + +Writing data (`Series`, `Frames`) to an HDF store and reading it in its entirety works. Querying the hdf +store does not yet work. + +.. ipython:: python + :suppress: + + hdf_file = "test.h5" + +.. ipython:: python + + hdf_file = "test.h5" + s = pd.Series(pd.Categorical(['a', 'b', 'b', 'a', 'a', 'c'], levels=['a','b','c','d'])) + df = pd.DataFrame({"s":s, "vals":[1,2,3,4,5,6]}) + df.to_hdf(hdf_file, "frame") + df2 = pd.read_hdf(hdf_file, "frame") + df2 + try: + pd.read_hdf(hdf_file, "frame", where = ['index>2']) + except TypeError as e: + print("TypeError: " + str(e)) + +.. ipython:: python + :suppress: + + try: + os.remove(hdf_file) + except: + pass + + +Writing to a csv file will convert the data, effectively removing any information about the +`Categorical` (`levels` and ordering). So if you read back the csv file you have to convert the +relevant columns back to `category` and assign the right `levels` and level ordering. + +.. ipython:: python + :suppress: + + from pandas.compat import StringIO + csv_file = StringIO() + +.. 
ipython:: python + + s = pd.Series(pd.Categorical(['a', 'b', 'b', 'a', 'a', 'c'], levels=['a','b','c','d'])) + df = pd.DataFrame({"s":s, "vals":[1,2,3,4,5,6]}) + df.to_csv(csv_file) + df2 = pd.read_csv(csv_file) + df2.dtypes + df2["s"] + # Redo the category + df2["s"] = df2["s"].astype("category") + df2["s"].cat.levels = ['a','b','c','d'] + df2.dtypes + df2["s"] + + +Missing Data +------------ + +pandas primarily uses the value `np.nan` to represent missing data. It is by +default not included in computations. See the :ref:`Missing Data section +<missing_data>`. + +There are two ways a `np.nan` can be represented in `Categorical`: either the value is not +available or `np.nan` is a valid level. + +.. ipython:: python + + s = pd.Series(pd.Categorical(["a","b",np.nan,"a"])) + s + # only two levels + s.cat.levels + s2 = pd.Series(pd.Categorical(["a","b","c","a"])) + s2.cat.levels = [1,2,np.nan] + s2 + # three levels, np.nan included + # Note: as int arrays can't hold NaN the levels were converted to float + s2.cat.levels + +Gotchas +------- + +`Categorical` is not a `numpy` array +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Currently, `Categorical` and the corresponding ``category`` `Series` are implemented as a python +object and not as a low level `numpy` array dtype. This leads to some problems. + +`numpy` itself doesn't know about the new `dtype`: + +.. ipython:: python + + try: + np.dtype("category") + except TypeError as e: + print("TypeError: " + str(e)) + + dtype = pd.Categorical(["a"]).dtype + try: + np.dtype(dtype) + except TypeError as e: + print("TypeError: " + str(e)) + + # dtype comparisons work: + dtype == np.str_ + np.str_ == dtype + +Using ``numpy`` functions on a `Series` of type ``category`` should not work as `Categoricals` +are not numeric data (even in the case that the levels are numeric). + +.. ipython:: python + + s = pd.Series(pd.Categorical([1,2,3,4])) + try: + np.sum(s) + #same with np.log(s),.. 
+ except TypeError as e: + print("TypeError: " + str(e)) + +.. note:: + If such a function works, please file a bug at https://github.com/pydata/pandas! + + +Side effects +~~~~~~~~~~~~ + +Constructing a `Series` from a `Categorical` will not copy the input `Categorical`. This +means that changes to the `Series` will in most cases change the original `Categorical`: + +.. ipython:: python + + cat = pd.Categorical([1,2,3,10], levels=[1,2,3,4,10]) + s = pd.Series(cat, name="cat") + cat + s.iloc[0:2] = 10 + cat + df = pd.DataFrame(s) + df["cat"].cat.levels = [1,2,3,4,5] + cat + +Use ``copy=True`` to prevent such a behaviour: + +.. ipython:: python + cat = pd.Categorical([1,2,3,10], levels=[1,2,3,4,10]) + s = pd.Series(cat, name="cat", copy=True) + cat + s.iloc[0:2] = 10 + cat + +.. note:: + This also happens in some cases when you supply a `numpy` array: using an int array + (e.g. ``np.array([1,2,3,4])``) will exhibit the same behaviour, but using a string + array (e.g. ``np.array(["a","b","c","a"])``) will not. + + +Danger of confusion +~~~~~~~~~~~~~~~~~~~ + +Both `Series` and `Categorical` have a method ``.reorder_levels()``. For Series of type +``category`` this means that there is some danger of confusing the two methods. + +.. ipython:: python + + s = pd.Series(pd.Categorical([1,2,3,4])) + # wrong and raises an error: + try: + s.reorder_levels([4,3,2,1]) + except Exception as e: + print("Exception: " + str(e)) + # right + print(s.cat.levels) + print([4,3,2,1]) + s.cat.reorder_levels([4,3,2,1]) + +Old style constructor usage +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In earlier versions, a `Categorical` could be constructed by passing in precomputed `level_codes` +(then called `labels`) instead of values with levels. The `level_codes` are interpreted as pointers +to the levels with `-1` as `NaN`. This usage is now deprecated and not available unless +``compat=True`` is passed to the constructor of `Categorical`. + +.. 
ipython:: python + + cat = pd.Categorical([1,2], levels=[1,2,3], compat=True) + cat.get_values() + +In the default case (``compat=False``) the first argument is interpreted as values. + +.. ipython:: python + + cat = pd.Categorical([1,2], levels=[1,2,3], compat=False) + cat.get_values() + +.. warning:: + Using Categorical with precomputed level_codes and levels is deprecated and a `FutureWarning` + is raised. Please change your code to use one of the proper constructor modes instead of + adding ``compat=True``. + +No categorical index +~~~~~~~~~~~~~~~~~~~~ + +There is currently no index of type ``category``, so setting the index to a `Categorical` will +convert the `Categorical` to a normal `numpy` array first and therefore remove any custom +ordering of the levels: + +.. ipython:: python + + cats = pd.Categorical([1,2,3,4], levels=[4,2,3,1]) + strings = ["a","b","c","d"] + values = [4,2,3,1] + df = pd.DataFrame({"strings":strings, "values":values}, index=cats) + df.index + # This should sort by levels but doesn't! + df.sort_index() + +dtype in apply +~~~~~~~~~~~~~~ + +Pandas currently does not preserve the dtype in apply functions: If you apply along rows you get +a `Series` of ``object`` `dtype` (same as getting a row -> getting one element will return a +basic type) and applying along columns will also convert to object. + +.. ipython:: python + + df = pd.DataFrame({"a":[1,2,3,4], "b":["a","b","c","d"], "cats":pd.Categorical([1,2,3,2])}) + df.apply(lambda row: type(row["cats"]), axis=1) + df.apply(lambda col: col.dtype, axis=0) + + +Future compatibility +~~~~~~~~~~~~~~~~~~~~ + +As `Categorical` is not a native `numpy` dtype, the implementation details of +`Series.cat` can change if such a `numpy` dtype is implemented. 
\ No newline at end of file diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template index f5352bc1031bc..4e1d2b471d1c0 100644 --- a/doc/source/index.rst.template +++ b/doc/source/index.rst.template @@ -130,6 +130,7 @@ See the package overview for more detail about what's in the library. merging reshaping timeseries + categorical visualization rplot io diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c45256c482e8f..cb6f200b259db 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -93,6 +93,8 @@ def _unique_generic(values, table_type, type_caster): return type_caster(uniques) + + def factorize(values, sort=False, order=None, na_sentinel=-1): """ Encode input values as an enumerated type or categorical variable @@ -160,7 +162,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1): if is_datetime: uniques = uniques.astype('M8[ns]') if isinstance(values, Index): - uniques = values._simple_new(uniques, None, freq=getattr(values, 'freq', None), + uniques = values._simple_new(uniques, None, freq=getattr(values, 'freq', None), tz=getattr(values, 'tz', None)) elif isinstance(values, Series): uniques = Index(uniques) @@ -196,13 +198,18 @@ def value_counts(values, sort=True, ascending=False, normalize=False, from pandas.tools.tile import cut values = Series(values).values + is_category = com.is_categorical_dtype(values.dtype) if bins is not None: try: cat, bins = cut(values, bins, retbins=True) except TypeError: raise TypeError("bins argument only works with numeric data.") - values = cat.labels + values = cat.codes + elif is_category: + bins = values.levels + cat = values + values = cat.codes dtype = values.dtype if com.is_integer_dtype(dtype): @@ -232,7 +239,10 @@ def value_counts(values, sort=True, ascending=False, normalize=False, if bins is not None: # TODO: This next line should be more efficient result = result.reindex(np.arange(len(cat.levels)), fill_value=0) - result.index = bins[:-1] + if 
not is_category: + result.index = bins[:-1] + else: + result.index = cat.levels if sort: result.sort() @@ -258,7 +268,7 @@ def mode(values): constructor = Series dtype = values.dtype - if com.is_integer_dtype(values.dtype): + if com.is_integer_dtype(values): values = com._ensure_int64(values) result = constructor(sorted(htable.mode_int64(values)), dtype=dtype) @@ -267,6 +277,8 @@ def mode(values): values = values.view(np.int64) result = constructor(sorted(htable.mode_int64(values)), dtype=dtype) + elif com.is_categorical_dtype(values): + result = constructor(values.mode()) else: mask = com.isnull(values) values = com._ensure_object(values) diff --git a/pandas/core/base.py b/pandas/core/base.py index ce078eb91735d..81e13687441de 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -203,7 +203,6 @@ def __unicode__(self): quote_strings=True) return "%s(%s, dtype='%s')" % (type(self).__name__, prepr, self.dtype) - class IndexOpsMixin(object): """ common ops mixin to support a unified inteface / docs for Series / Index """ @@ -287,7 +286,11 @@ def unique(self): uniques : ndarray """ from pandas.core.nanops import unique1d - return unique1d(self.values) + values = self.values + if hasattr(values,'unique'): + return values.unique() + + return unique1d(values) def nunique(self, dropna=True): """ diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index dfadd34e2d205..ab1b7bf431e27 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1,13 +1,16 @@ # pylint: disable=E1101,W0232 import numpy as np +from warnings import warn from pandas import compat from pandas.compat import u -from pandas.core.algorithms import factorize +from pandas.core.algorithms import factorize, unique from pandas.core.base import PandasObject -from pandas.core.index import Index +from pandas.core.index import Index, _ensure_index +from pandas.core.indexing import _is_null_slice +from pandas.tseries.period import PeriodIndex import pandas.core.common as 
com from pandas.util.terminal import get_terminal_size from pandas.core.config import get_option @@ -23,7 +26,7 @@ def f(self, other): else: if other in self.levels: i = self.levels.get_loc(other) - return getattr(self.labels, op)(i) + return getattr(self._codes, op)(i) else: return np.repeat(False, len(self)) @@ -31,64 +34,245 @@ def f(self, other): return f +def _is_categorical(array): + """ return if we are a categorical possibility """ + return isinstance(array, Categorical) or isinstance(array.dtype, com.CategoricalDtype) +def _maybe_to_categorical(array): + """ coerce to a categorical if a series is given """ + if isinstance(array, com.ABCSeries): + return array.values + return array + + +def _get_codes_for_values(values, levels): + from pandas.core.algorithms import _get_data_algo, _hashtables + if values.dtype != levels.dtype: + values = com._ensure_object(values) + levels = com._ensure_object(levels) + (hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables) + t = hash_klass(len(levels)) + t.map_locations(levels) + return com._ensure_platform_int(t.lookup(values)) + +_codes_doc = """The level codes of this categorical. + +Level codes are an array if integer which are the positions of the real +values in the levels array. + +There is not setter, used the other categorical methods and the item setter on +Categorical to change values in the categorical. +""" + +_levels_doc = """The levels of this categorical. + +Setting assigns new values to each level (effectively a rename of +each individual level). + +The assigned value has to be a list-like object. If the number of +level-items is less than number of level-items in the current level, +all level-items at a higher position are set to NaN. If the number of +level-items is more that the current number of level-items, new +(unused) levels are added at the end. + +To add level-items in between, you need to assign them to the end and +then reorder the levels. 
+ +Raises +------ +ValueError + If the new levels do not validate as levels + +See also +-------- +Categorical.reorder_levels +Categorical.remove_unused_levels +""" class Categorical(PandasObject): """ Represents a categorical variable in classic R / S-plus fashion + `Categoricals` can only take on only a limited, and usually fixed, number + of possible values (`levels`). In contrast to statistical categorical + variables, a `Categorical` might have an order, but numerical operations + (additions, divisions, ...) are not possible. + + All values of the `Categorical` are either in `levels` or `np.nan`. + Assigning values outside of `levels` will raise a `ValueError`. Order is + defined by the order of the `levels`, not lexical order of the values. + Parameters ---------- - labels : ndarray of integers - If levels is given, the integer at label `i` is the index of the level - for that label. I.e., the level at labels[i] is levels[labels[i]]. - Otherwise, if levels is None, these are just the labels and the levels - are assumed to be the unique labels. See from_array. + values : list-like + The values of the categorical. If levels are given, values not in levels will + be replaced with NaN. levels : Index-like (unique), optional - The unique levels for each label. If not given, the levels are assumed - to be the unique values of labels. + The unique levels for this categorical. If not given, the levels are assumed + to be the unique values of values. + ordered : boolean, optional + Whether or not this categorical is treated as a ordered categorical. If not given, + the resulting categorical will be ordered if values can be sorted. name : str, optional - Name for the Categorical variable. If levels is None, will attempt - to infer from labels. + Name for the Categorical variable. If name is None, will attempt + to infer from values. 
+ compat : boolean, default=False + Whether to treat values as codes to the levels (old API, deprecated) + + Attributes + ---------- + levels : ndarray + The levels of this categorical + ordered : boolean + Whether or not this Categorical is ordered + name : string + The name of this Categorical + + Raises + ------ + ValueError + If the levels do not validate + TypeError + If an explicit ``ordered=True`` is given but no `levels` and the `values` are not sortable - Returns - ------- - **Attributes** - * labels : ndarray - * levels : ndarray Examples -------- >>> from pandas import Categorical - >>> Categorical([0, 1, 2, 0, 1, 2], [1, 2, 3]) - Categorical: - array([1, 2, 3, 1, 2, 3]) - Levels (3): Int64Index([1, 2, 3]) - - >>> Categorical([0,1,2,0,1,2], ['a', 'b', 'c']) - Categorical: - array(['a', 'b', 'c', 'a', 'b', 'c'], dtype=object) - Levels (3): Index(['a', 'b', 'c'], dtype=object) + >>> Categorical([1, 2, 3, 1, 2, 3]) + 1 + 2 + 3 + 1 + 2 + 3 + Levels (3): Int64Index([1, 2, 3], dtype=int64), ordered >>> Categorical(['a', 'b', 'c', 'a', 'b', 'c']) - Categorical: - array(['a', 'b', 'c', 'a', 'b', 'c'], dtype=object) - Levels (3): Index(['a', 'b', 'c'], dtype=object) + a + b + c + a + b + c + Levels (3): Index(['a', 'b', 'c'], dtype=object), ordered + + >>> a = Categorical(['a','b','c','a','b','c'], ['c', 'b', 'a']) + >>> a.min() + 'c' """ + ndim = 1 + """Number of dimensions (always 1!)""" + + dtype = com.CategoricalDtype() + """The dtype (always "category")""" + + ordered = None + """Whether or not this Categorical is ordered. + + Only ordered `Categoricals` can be sorted (according to the order + of the levels) and have a min and max value. 
+ + See also + -------- + Categorical.sort + Categorical.order + Categorical.min + Categorical.max + """ + + def __init__(self, values, levels=None, ordered=None, name=None, fastpath=False, compat=False): + + if fastpath: + # fast path + self._codes = values + self.name = name + self.levels = levels + self.ordered = ordered + return + + if name is None: + name = getattr(values, 'name', None) + + # sanitize input + if com.is_categorical_dtype(values): + + # we are either a Series or a Categorical + cat = values + if isinstance(values, com.ABCSeries): + cat = values.values + if levels is None: + levels = cat.levels + if ordered is None: + ordered = cat.ordered + values = values.__array__() + + elif isinstance(values, Index): + pass + + else: + + # on numpy < 1.6 datetimelike get inferred to all i8 by _sanitize_array + # which is fine, but since factorize does this correctly no need here + # this is an issue because _sanitize_array also coerces np.nan to a string + # under certain versions of numpy as well + inferred = com._possibly_infer_to_datetimelike(values) + if not isinstance(inferred, np.ndarray): + from pandas.core.series import _sanitize_array + values = _sanitize_array(values, None) - def __init__(self, labels, levels=None, name=None): if levels is None: - if name is None: - name = getattr(labels, 'name', None) try: - labels, levels = factorize(labels, sort=True) + codes, levels = factorize(values, sort=True) + # If the underlying data structure was sortable, and the user doesn't want to + # "forget" this order, the categorical also is sorted/ordered + if ordered is None: + ordered = True except TypeError: - labels, levels = factorize(labels, sort=False) + codes, levels = factorize(values, sort=False) + if ordered: + # raise, as we don't have a sortable data structure and so the usershould + # give us one by specifying levels + raise TypeError("'values' is not ordered, please explicitly specify the level " + "order by passing in a level argument.") + else: + 
# there are two ways if levels are present + # the old one, where each value is a int pointer to the levels array + # the new one, where each value is also in the level array (or np.nan) + + # make sure that we always have the same type here, no matter what we get passed in + levels = self._validate_levels(levels) + + # There can be two ways: the old which passed in codes and levels directly + # and values have to be inferred and the new one, which passes in values and levels + # and _codes have to be inferred. + + # min and max can be higher and lower if not all levels are in the values + if compat and (com.is_integer_dtype(values) and + (np.min(values) >= -1) and (np.max(values) < len(levels))): + warn("Using 'values' as codes is deprecated.\n" + "'Categorical(... , compat=True)' is only there for historical reasons and " + "should not be used in new code!\n" + "See https://github.com/pydata/pandas/pull/7217", FutureWarning) + codes = values + else: + codes = _get_codes_for_values(values, levels) + + # if we got levels, we can assume that the order is intended + # if ordered is unspecified + if ordered is None: + ordered = True - self.labels = labels + self.ordered = False if ordered is None else ordered + self._codes = codes self.levels = levels self.name = name + def copy(self): + """ Copy constructor. """ + return Categorical(values=self._codes.copy(),levels=self.levels, + name=self.name, ordered=self.ordered, fastpath=True) + @classmethod def from_array(cls, data): """ @@ -102,20 +286,81 @@ def from_array(cls, data): """ return Categorical(data) - _levels = None + _codes = None - def _set_levels(self, levels): - from pandas.core.index import _ensure_index + def _get_codes(self): + """ Get the level codes. """ + # TODO: return a copy so that no manipulation is possible? 
+ return self._codes + + codes = property(fget=_get_codes, doc=_codes_doc) + _levels = None + + def _validate_levels(self, levels): + """" Validates that we have good levels """ levels = _ensure_index(levels) if not levels.is_unique: raise ValueError('Categorical levels must be unique') + return levels + + def _set_levels(self, levels): + """ Sets new levels """ + levels = self._validate_levels(levels) + + if not self._levels is None and len(levels) < len(self._levels): + # remove all _codes which are larger + self._codes[self._codes >= len(levels)] = -1 self._levels = levels def _get_levels(self): + """ Gets the levels """ + # levels is an Index, which is immutable -> no need to copy return self._levels - levels = property(fget=_get_levels, fset=_set_levels) + levels = property(fget=_get_levels, fset=_set_levels, doc=_levels_doc) + + def reorder_levels(self, new_levels, ordered=None): + """ Reorders levels as specified in new_levels. + + The level reordering is done inplace. + + Raises + ------ + ValueError + If the new levels do not contain the same level items as before + + Parameters + ---------- + new_levels : Index-like + The levels in new order. must be of same length as the old levels + ordered : boolean, optional + Whether or not the categorical is treated as a ordered categorical. If not given, + do not change the ordered information. + """ + new_levels = self._validate_levels(new_levels) + + if len(new_levels) != len(self._levels): + raise ValueError('Reordered levels must be of same length as old levels') + if len(new_levels-self._levels): + raise ValueError('Reordered levels be the same as the original levels') + values = self.__array__() + self._codes = _get_codes_for_values(values, new_levels) + self._levels = new_levels + if not ordered is None: + self.ordered = ordered + + def remove_unused_levels(self): + """ Removes levels which are not used. + + The level removal is done inplace. 
+ """ + _used = sorted(np.unique(self._codes)) + new_levels = self.levels.take(_used) + new_levels = _ensure_index(new_levels) + self._codes = _get_codes_for_values(self.__array__(), new_levels) + self._levels = new_levels + __eq__ = _cat_compare_op('__eq__') __ne__ = _cat_compare_op('__ne__') @@ -124,11 +369,240 @@ def _get_levels(self): __le__ = _cat_compare_op('__le__') __ge__ = _cat_compare_op('__ge__') + # for Series/ndarray like compat + @property + def shape(self): + """ Shape of the Categorical. + + For internal compatibility with numpy arrays. + + Returns + ------- + shape : tuple + """ + + return tuple([len(self._codes)]) + def __array__(self, dtype=None): - return com.take_1d(self.levels.values, self.labels) + """ The numpy array interface. + + Returns + ------- + values : numpy array + A numpy array of the same dtype as categorical.levels.dtype + """ + return com.take_1d(self.levels.values, self._codes) + + @property + def T(self): + return self + + def get_values(self): + """ Return the values. + + For internal compatibility with pandas formatting. + + Returns + ------- + values : numpy array + A numpy array of the same dtype as categorical.levels.dtype or dtype string if periods + """ + + # if we are a period index, return a string repr + if isinstance(self.levels, PeriodIndex): + return com.take_1d(np.array(self.levels.to_native_types(), dtype=object), + self._codes) + + return np.array(self) + + def argsort(self, ascending=True, **kwargs): + """ Implements ndarray.argsort. + + For internal compatibility with numpy arrays. + + Only ordered Categoricals can be argsorted! + + Returns + ------- + argsorted : numpy array + """ + if not self.ordered: + raise TypeError("Categorical not ordered") + result = np.argsort(self._codes.copy(), **kwargs) + if not ascending: + result = result[::-1] + return result + + def order(self, inplace=False, ascending=True, **kwargs): + """ Sorts the Category by level value returning a new Categorical by default. 
+ + Only ordered Categoricals can be sorted! + + Categorical.sort is the equivalent but sorts the Categorical inplace. + + Parameters + ---------- + ascending : boolean, default True + Sort ascending. Passing False sorts descending + na_position : {'first', 'last'} (optional, default='last') + 'first' puts NaNs at the beginning + 'last' puts NaNs at the end + inplace : boolean, default False + Do operation in place. + + Returns + ------- + y : Category or None + + See Also + -------- + Category.sort + """ + if not self.ordered: + raise TypeError("Categorical not ordered") + _sorted = np.sort(self._codes.copy()) + if not ascending: + _sorted = _sorted[::-1] + if inplace: + self._codes = _sorted + return + else: + return Categorical(values=_sorted,levels=self.levels, ordered=self.ordered, + name=self.name, fastpath=True) + + + def sort(self, inplace=True, ascending=True, **kwargs): + """ Sorts the Category inplace by level value. + + Only ordered Categoricals can be sorted! + + Catgorical.order is the equivalent but returns a new Categorical. + + Parameters + ---------- + ascending : boolean, default True + Sort ascending. Passing False sorts descending + na_position : {'first', 'last'} (optional, default='last') + 'first' puts NaNs at the beginning + 'last' puts NaNs at the end + inplace : boolean, default False + Do operation in place. + + Returns + ------- + y : Category or None + + See Also + -------- + Category.order + """ + return self.order(inplace=inplace, ascending=ascending, **kwargs) + + def ravel(self, order='C'): + """ Return a flattened (numpy) array. + + For internal compatibility with numpy arrays. + + Returns + ------- + raveled : numpy array + """ + return np.array(self) + + def view(self): + """Return a view of myself. + + For internal compatibility with numpy arrays. + + Returns + ------- + view : Categorical + Returns `self`! 
+ """ + return self + + def to_dense(self): + """ Return my 'dense' repr """ + return np.asarray(self) + + def fillna(self, fill_value=None, method=None, limit=None, **kwargs): + """ Fill NA/NaN values using the specified method. + + Parameters + ---------- + method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None + Method to use for filling holes in reindexed Series + pad / ffill: propagate last valid observation forward to next valid + backfill / bfill: use NEXT valid observation to fill gap + value : scalar + Value to use to fill holes (e.g. 0) + limit : int, default None + Maximum size gap to forward or backward fill (not implemented yet!) + + Returns + ------- + filled : Categorical with NA/NaN filled + """ + + if fill_value is None: + fill_value = np.nan + if limit is not None: + raise NotImplementedError + + values = self._codes + + # pad / bfill + if method is not None: + + values = self.to_dense().reshape(-1,len(self)) + values = com.interpolate_2d( + values, method, 0, None, fill_value).astype(self.levels.dtype)[0] + values = _get_codes_for_values(values, self.levels) + + else: + + if not com.isnull(fill_value) and fill_value not in self.levels: + raise ValueError("fill value must be in levels") + + mask = self._codes==-1 + if mask.any(): + values = self._codes.copy() + values[mask] = self.levels.get_loc(fill_value) + + return Categorical(values, levels=self.levels, ordered=self.ordered, + name=self.name, fastpath=True) + + def take_nd(self, indexer, allow_fill=True, fill_value=None): + """ Take the values by the indexer, fill with the fill_value. """ + if allow_fill and fill_value is None: + fill_value = np.nan + + values = com.take_1d(self._codes, indexer, allow_fill=allow_fill, fill_value=fill_value) + result = Categorical(values=values, levels=self.levels, ordered=self.ordered, + name=self.name, fastpath=True) + return result + + take = take_nd + + def _slice(self, slicer): + """ Return a slice of myself. 
""" + + # only allow 1 dimensional slicing, but can + # in a 2-d case be passd (slice(None),....) + if isinstance(slicer, tuple) and len(slicer) == 2: + if not _is_null_slice(slicer[0]): + raise AssertionError("invalid slicing for a 1-ndim categorical") + slicer = slicer[1] + + _codes = self._codes[slicer] + return Categorical(values=_codes,levels=self.levels, ordered=self.ordered, + name=self.name, fastpath=True) def __len__(self): - return len(self.labels) + return len(self._codes) + + def __iter__(self): + return iter(np.array(self)) def _tidy_repr(self, max_vals=20): num = max_vals // 2 @@ -151,9 +625,12 @@ def _repr_footer(self): lines = levstring.split('\n') levstring = '\n'.join([lines[0]] + [indent + x.lstrip() for x in lines[1:]]) - + if self.ordered: + order = ", ordered" + else: + order = ", unordered" namestr = "Name: %s, " % self.name if self.name is not None else "" - return u('%s\n%sLength: %d' % (levheader + levstring, namestr, + return u('%s\n%sLength: %d' % (levheader + levstring + order, namestr, len(self))) def _get_repr(self, name=False, length=True, na_rep='NaN', footer=True): @@ -164,12 +641,14 @@ def _get_repr(self, name=False, length=True, na_rep='NaN', footer=True): return compat.text_type(result) def __unicode__(self): + """ Unicode representation. """ width, height = get_terminal_size() max_rows = (height if get_option("display.max_rows") == 0 else get_option("display.max_rows")) - if len(self.labels) > (max_rows or 1000): + + if len(self._codes) > (max_rows or 1000): result = self._tidy_repr(min(30, max_rows) - 4) - elif len(self.labels) > 0: + elif len(self._codes) > 0: result = self._get_repr(length=len(self) > 50, name=True) else: @@ -181,22 +660,166 @@ def __unicode__(self): return result def __getitem__(self, key): + """ Return an item. 
""" if isinstance(key, (int, np.integer)): - i = self.labels[key] + i = self._codes[key] if i == -1: return np.nan else: return self.levels[i] else: - return Categorical(self.labels[key], self.levels) + return Categorical(values=self._codes[key], levels=self.levels, + ordered=self.ordered, fastpath=True) + + def __setitem__(self, key, value): + """ Item assignment. + + + Raises + ------ + ValueError + If (one or more) Value is not in levels or if a assigned `Categorical` has not the + same levels + + """ + + # require identical level set + if isinstance(value, Categorical): + if not value.levels.equals(self.levels): + raise ValueError("cannot set a Categorical with another, without identical levels") + + rvalue = value if com.is_list_like(value) else [value] + to_add = Index(rvalue)-self.levels + if len(to_add): + raise ValueError("cannot setitem on a Categorical with a new level," + " set the levels first") + + # set by position + if isinstance(key, (int, np.integer)): + pass + + # tuple of indexers + elif isinstance(key, tuple): + + # only allow 1 dimensional slicing, but can + # in a 2-d case be passd (slice(None),....) + if len(key) == 2: + if not _is_null_slice(key[0]): + raise AssertionError("invalid slicing for a 1-ndim categorical") + key = key[1] + elif len(key) == 1: + key = key[0] + else: + raise AssertionError("invalid slicing for a 1-ndim categorical") + + else: + key = self._codes[key] + + lindexer = self.levels.get_indexer(rvalue) + self._codes[key] = lindexer + + #### reduction ops #### + def _reduce(self, op, axis=0, skipna=True, numeric_only=None, + filter_type=None, name=None, **kwds): + """ perform the reduction type operation """ + func = getattr(self,name,None) + if func is None: + raise TypeError("Categorical cannot perform the operation {op}".format(op=name)) + return func(numeric_only=numeric_only, **kwds) + + def min(self, numeric_only=None, **kwargs): + """ The minimum value of the object. + + Only ordered `Categoricals` have a minimum! 
+ + Raises + ------ + TypeError + If the `Categorical` is not `ordered`. + + Returns + ------- + min : the minimum of this `Categorical` + """ + if not self.ordered: + raise TypeError("Categorical not ordered") + if numeric_only: + good = self._codes != -1 + pointer = self._codes[good].min(**kwargs) + else: + pointer = self._codes.min(**kwargs) + if pointer == -1: + return np.nan + else: + return self.levels[pointer] + + + def max(self, numeric_only=None, **kwargs): + """ The maximum value of the object. + + Only ordered `Categoricals` have a maximum! + + Raises + ------ + TypeError + If the `Categorical` is not `ordered`. + + Returns + ------- + max : the maximum of this `Categorical` + """ + if not self.ordered: + raise TypeError("Categorical not ordered") + if numeric_only: + good = self._codes != -1 + pointer = self._codes[good].max(**kwargs) + else: + pointer = self._codes.max(**kwargs) + if pointer == -1: + return np.nan + else: + return self.levels[pointer] + + def mode(self): + """ + Returns the mode(s) of the Categorical. + + Empty if nothing occurs at least 2 times. Always returns `Categorical` even + if only one value. + + Returns + ------- + modes : `Categorical` (sorted) + """ + + import pandas.hashtable as htable + good = self._codes != -1 + result = Categorical(sorted(htable.mode_int64(com._ensure_int64(self._codes[good]))), + levels=self.levels,ordered=self.ordered, name=self.name, + fastpath=True) + return result + + def unique(self): + """ + Return the unique values. + + This includes all levels, even if one or more is unused. + + Returns + ------- + unique values : array + """ + return self.levels def equals(self, other): """ - Returns True if categorical arrays are equal + Returns True if categorical arrays are equal. + + The name of the `Categorical` is not compared! 
Parameters ---------- - other : Categorical + other : `Categorical` Returns ------- @@ -204,23 +827,31 @@ def equals(self, other): """ if not isinstance(other, Categorical): return False - - return (self.levels.equals(other.levels) and - np.array_equal(self.labels, other.labels)) + # TODO: should this also test if name is equal? + return (self.levels.equals(other.levels) and self.ordered == other.ordered and + np.array_equal(self._codes, other._codes)) def describe(self): - """ - Returns a dataframe with frequency and counts by level. + """ Describes this Categorical + + Returns + ------- + description: `DataFrame` + A dataframe with frequency and counts by level. """ # Hack? from pandas.core.frame import DataFrame counts = DataFrame({ - 'labels' : self.labels, - 'values' : self.labels } - ).groupby('labels').count().squeeze().values + 'codes' : self._codes, + 'values' : self._codes } + ).groupby('codes').count() + + counts.index = self.levels.take(counts.index) + counts = counts.reindex(self.levels) freqs = counts / float(counts.sum()) - return DataFrame({ - 'counts': counts, - 'freqs': freqs, - 'levels': self.levels - }).set_index('levels') + + from pandas.tools.merge import concat + result = concat([counts,freqs],axis=1) + result.index.name = 'levels' + result.columns = ['counts','freqs'] + return result diff --git a/pandas/core/common.py b/pandas/core/common.py index bb7f43511e905..a6e1c51595b15 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -23,7 +23,6 @@ from pandas.core.config import get_option from pandas.core import array as pa - class PandasError(Exception): pass @@ -107,6 +106,37 @@ class to receive bound method else: setattr(cls, name, func) +class CategoricalDtype(object): + """ + A np.dtype duck-typed class, suitable for holding a custom categorical dtype. 
+ + THIS IS NOT A REAL NUMPY DTYPE, but essentially a sub-class of np.object + """ + name = 'category' + names = None + type = np.object_ + subdtype = None + kind = 'O' + str = '|O08' + num = 100 + shape = tuple() + itemsize = 8 + base = np.dtype('O') + isbuiltin = 0 + isnative = 0 + + def __unicode__(self): + return self.name + + def __hash__(self): + # make myself hashable + return hash(str(self)) + + def __eq__(self, other): + if isinstance(other, compat.string_types): + return other == self.name + + return isinstance(other, CategoricalDtype) def isnull(obj): """Detect missing values (NaN in numeric arrays, None/NaN in object arrays) @@ -2318,6 +2348,15 @@ def is_bool_dtype(arr_or_dtype): tipo = _get_dtype_type(arr_or_dtype) return issubclass(tipo, np.bool_) +def is_categorical_dtype(arr_or_dtype): + if hasattr(arr_or_dtype,'dtype'): + arr_or_dtype = arr_or_dtype.dtype + if isinstance(arr_or_dtype, CategoricalDtype): + return True + try: + return arr_or_dtype == 'category' + except: + return False def is_complex_dtype(arr_or_dtype): tipo = _get_dtype_type(arr_or_dtype) diff --git a/pandas/core/format.py b/pandas/core/format.py index b11b2e7270271..bae67352f4f09 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -96,7 +96,10 @@ def _get_footer(self): levheader = 'Levels (%d): ' % len(self.categorical.levels) # TODO: should max_line_width respect a setting? 
- levstring = np.array_repr(self.categorical.levels, max_line_width=60) + try: + levstring = np.array_repr(self.categorical.levels, max_line_width=60) + except: + levstring = str(self.categorical.levels) indent = ' ' * (levstring.find('[') + len(levheader) + 1) lines = levstring.split('\n') levstring = '\n'.join([lines[0]] + @@ -104,11 +107,15 @@ def _get_footer(self): if footer: footer += ', ' footer += levheader + levstring + if self.categorical.ordered: + footer += ", ordered" + else: + footer += ", unordered" return compat.text_type(footer) def _get_formatted_values(self): - return format_array(np.asarray(self.categorical), None, + return format_array(self.categorical.get_values(), None, float_format=None, na_rep=self.na_rep) @@ -191,7 +198,7 @@ def _get_formatted_index(self): return fmt_index, have_header def _get_formatted_values(self): - return format_array(self.series.values, None, + return format_array(self.series.get_values(), None, float_format=self.float_format, na_rep=self.na_rep) @@ -829,7 +836,7 @@ def _column_header(): ins_col = self.fmt.tr_col_num if self.fmt.sparsify: recs_new = {} - # Increment tags after ... col. + # Increment tags after ... col. for tag,span in list(records.items()): if tag >= ins_col: recs_new[tag + 1] = span @@ -844,7 +851,7 @@ def _column_header(): else: recs_new[tag] = span # if ins_col lies between tags, all col headers get ... - if tag + span == ins_col: + if tag + span == ins_col: recs_new[ins_col] = 1 values = values[:ins_col] + (u('...'),) + \ values[ins_col:] @@ -895,7 +902,7 @@ def _column_header(): ] + [''] * min(len(self.columns), self.max_cols) if truncate_h: ins_col = row_levels + self.fmt.tr_col_num - row.insert(ins_col, '') + row.insert(ins_col, '') self.write_tr(row, indent, self.indent_delta, header=True) indent -= self.indent_delta @@ -981,7 +988,7 @@ def _write_hierarchical_rows(self, fmt_values, indent): inner_lvl = len(level_lengths) - 1 if truncate_v: # Insert ... 
row and adjust idx_values and - # level_lengths to take this into account. + # level_lengths to take this into account. ins_row = self.fmt.tr_row_num for lnum,records in enumerate(level_lengths): rec_new = {} @@ -999,7 +1006,7 @@ def _write_hierarchical_rows(self, fmt_values, indent): if tag + span == ins_row: rec_new[ins_row] = 1 if lnum == 0: - idx_values.insert(ins_row,tuple([u('...')]*len(level_lengths))) + idx_values.insert(ins_row,tuple([u('...')]*len(level_lengths))) level_lengths[lnum] = rec_new level_lengths[inner_lvl][ins_row] = 1 diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b97cb11906e2f..f07ce11a78761 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -36,6 +36,7 @@ create_block_manager_from_arrays, create_block_manager_from_blocks) from pandas.core.series import Series +from pandas.core.categorical import Categorical import pandas.computation.expressions as expressions from pandas.computation.eval import eval as _eval from numpy import percentile as _quantile @@ -1539,7 +1540,7 @@ def get_value(self, index, col, takeable=False): series = self._get_item_cache(col) engine = self.index._engine - return engine.get_value(series.values, index) + return engine.get_value(series.get_values(), index) def set_value(self, index, col, value, takeable=False): """ @@ -1567,7 +1568,7 @@ def set_value(self, index, col, value, takeable=False): engine = self.index._engine engine.set_value(series.values, index, value) return self - except KeyError: + except (KeyError, TypeError): # set using a non-recursive method & reset the cache self.loc[index, col] = value @@ -2114,10 +2115,10 @@ def _sanitize_column(self, key, value): # Need to make sure new columns (which go into the BlockManager as new # blocks) are always copied - if isinstance(value, (Series, DataFrame)): - is_frame = isinstance(value, DataFrame) + def reindexer(value): + # reindex if necessary + if value.index.equals(self.index) or not len(self.index): - # copy the values value = 
value.values.copy() else: @@ -2133,10 +2134,18 @@ def _sanitize_column(self, key, value): # other raise TypeError('incompatible index of inserted column ' 'with frame index') + return value + + if isinstance(value, Series): + value = reindexer(value) + + elif isinstance(value, DataFrame): + value = reindexer(value).T - if is_frame: - value = value.T - elif isinstance(value, Index) or _is_sequence(value): + elif isinstance(value, Categorical): + value = value.copy() + + elif (isinstance(value, Index) or _is_sequence(value)): if len(value) != len(self.index): raise ValueError('Length of values does not match length of ' 'index') @@ -2160,6 +2169,10 @@ def _sanitize_column(self, key, value): value = np.repeat(value, len(self.index)).astype(dtype) value = com._possibly_cast_to_datetime(value, dtype) + # return categoricals directly + if isinstance(value, Categorical): + return value + # broadcast across multiple columns if necessary if key in self.columns and value.ndim == 1: if not self.columns.is_unique or isinstance(self.columns, @@ -2757,6 +2770,7 @@ def trans(v): % str(by)) if isinstance(ascending, (tuple, list)): ascending = ascending[0] + indexer = _nargsort(k, kind=kind, ascending=ascending, na_position=na_position) @@ -4069,7 +4083,7 @@ def all(self, axis=None, bool_only=None, skipna=True, level=None, numeric_only=bool_only, filter_type='bool') def _reduce(self, op, axis=0, skipna=True, numeric_only=None, - filter_type=None, **kwds): + filter_type=None, name=None, **kwds): axis = self._get_axis_number(axis) f = lambda x: op(x, axis=axis, skipna=skipna, **kwds) labels = self._get_agg_axis(axis) @@ -4600,10 +4614,10 @@ def factor_agg(factor, vec, func): -------- pandas.Categorical """ - indexer = np.argsort(factor.labels) + indexer = np.argsort(factor.codes) unique_labels = np.arange(len(factor.levels)) - ordered_labels = factor.labels.take(indexer) + ordered_labels = factor.codes.take(indexer) ordered_vec = np.asarray(vec).take(indexer) bounds = 
ordered_labels.searchsorted(unique_labels) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 59a457229d512..57cd2eb50d01a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3626,7 +3626,7 @@ def describe_numeric_1d(series, percentiles): def describe_categorical_1d(data): names = ['count', 'unique'] objcounts = data.value_counts() - result = [data.count(), len(objcounts)] + result = [data.count(), len(objcounts[objcounts!=0])] if result[1] > 0: top, freq = objcounts.index[0], objcounts.iloc[0] @@ -3782,7 +3782,8 @@ def stat_func(self, axis=None, skipna=None, level=None, return self._agg_by_level(name, axis=axis, level=level, skipna=skipna) return self._reduce(f, axis=axis, - skipna=skipna, numeric_only=numeric_only) + skipna=skipna, numeric_only=numeric_only, + name=name) stat_func.__name__ = name return stat_func diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 249aa0afdfd64..7c7b1871b6770 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -23,7 +23,8 @@ import pandas.core.common as com from pandas.core.common import(_possibly_downcast_to_dtype, isnull, notnull, _DATELIKE_DTYPES, is_numeric_dtype, - is_timedelta64_dtype, is_datetime64_dtype) + is_timedelta64_dtype, is_datetime64_dtype, + is_categorical_dtype) from pandas import _np_version_under1p7 import pandas.lib as lib @@ -147,8 +148,10 @@ def _last(x): def _count_compat(x, axis=0): - return x.size - + try: + return x.size + except: + return x.count() class Grouper(object): """ @@ -1866,7 +1869,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, # Is there any way to avoid this? self.grouper = np.asarray(factor) - self._labels = factor.labels + self._labels = factor.codes self._group_index = factor.levels if self.name is None: self.name = factor.name @@ -3419,6 +3422,11 @@ def _nargsort(items, kind='quicksort', ascending=True, na_position='last'): It adds ascending and na_position parameters. 
GH #6399, #5231 """ + + # specially handle Categorical + if is_categorical_dtype(items): + return items.argsort(ascending=ascending) + items = np.asanyarray(items) idx = np.arange(len(items)) mask = isnull(items) diff --git a/pandas/core/index.py b/pandas/core/index.py index 262305a335d46..6927d5a732440 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -2796,7 +2796,7 @@ def from_arrays(cls, arrays, sortorder=None, names=None): cats = [Categorical.from_array(arr) for arr in arrays] levels = [c.levels for c in cats] - labels = [c.labels for c in cats] + labels = [c.codes for c in cats] if names is None: names = [c.name for c in cats] @@ -2888,7 +2888,7 @@ def from_product(cls, iterables, sortorder=None, names=None): from pandas.tools.util import cartesian_product categoricals = [Categorical.from_array(it) for it in iterables] - labels = cartesian_product([c.labels for c in categoricals]) + labels = cartesian_product([c.codes for c in categoricals]) return MultiIndex(levels=[c.levels for c in categoricals], labels=labels, sortorder=sortorder, names=names) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index d387cb647d8c2..367a283958051 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -405,7 +405,9 @@ def can_do_equal_len(): return False - if _is_list_like(value): + # we need an interable, with a ndim of at least 1 + # eg. 
don't pass thru np.array(0) + if _is_list_like(value) and getattr(value,'ndim',1) > 0: # we have an equal len Frame if isinstance(value, ABCDataFrame) and value.ndim > 1: @@ -1675,7 +1677,7 @@ def _is_label_like(key): def _is_list_like(obj): # Consider namedtuples to be not list like as they are useful as indices - return (np.iterable(obj) + return (hasattr(obj, '__iter__') and not isinstance(obj, compat.string_types) and not (isinstance(obj, tuple) and type(obj) is not tuple)) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 4f7f36dd4a14d..c6a5b048a8ba7 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -16,6 +16,7 @@ _possibly_infer_to_datetimelike) from pandas.core.index import Index, MultiIndex, _ensure_index from pandas.core.indexing import (_maybe_convert_indices, _length_of_indexer) +from pandas.core.categorical import Categorical, _maybe_to_categorical, _is_categorical import pandas.core.common as com from pandas.sparse.array import _maybe_to_sparse, SparseArray import pandas.lib as lib @@ -49,11 +50,13 @@ class Block(PandasObject): is_timedelta = False is_bool = False is_object = False + is_categorical = False is_sparse = False _can_hold_na = False _downcast_dtype = None _can_consolidate = True _verify_integrity = True + _validate_ndim = True _ftype = 'dense' def __init__(self, values, placement, ndim=None, fastpath=False): @@ -84,6 +87,9 @@ def is_datelike(self): """ return True if I am a non-datelike """ return self.is_datetime or self.is_timedelta + def to_dense(self): + return self.values.view() + @property def fill_value(self): return np.nan @@ -92,7 +98,12 @@ def fill_value(self): def mgr_locs(self): return self._mgr_locs - def make_block_same_class(self, values, placement, copy=False, + @property + def array_dtype(self): + """ the dtype to return if I want to construct this block as an array """ + return self.dtype + + def make_block_same_class(self, values, placement, copy=False, fastpath=True, **kwargs): 
""" Wrap given values in a block of same type as self. @@ -103,7 +114,7 @@ def make_block_same_class(self, values, placement, copy=False, if copy: values = values.copy() return make_block(values, placement, klass=self.__class__, - fastpath=True) + fastpath=fastpath, **kwargs) @mgr_locs.setter def mgr_locs(self, new_mgr_locs): @@ -161,7 +172,7 @@ def getitem_block(self, slicer, new_mgr_locs=None): new_values = self._slice(slicer) - if new_values.ndim != self.ndim: + if self._validate_ndim and new_values.ndim != self.ndim: raise ValueError("Only same dim slicing is allowed") return self.make_block_same_class(new_values, new_mgr_locs) @@ -326,6 +337,15 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None, Coerce to the new type (if copy=True, return a new copy) raise on an except if raise == True """ + + # may need to convert to categorical + # this is only called for non-categoricals + if com.is_categorical_dtype(dtype): + return make_block(Categorical(self.values), + ndim=self.ndim, + placement=self.mgr_locs) + + # astype processing dtype = np.dtype(dtype) if self.dtype == dtype: if copy: @@ -431,6 +451,10 @@ def to_native_types(self, slicer=None, na_rep='', **kwargs): values[mask] = na_rep return values.tolist() + def _validate_merge(self, blocks): + """ validate that we can merge these blocks """ + return True + # block actions #### def copy(self, deep=True): values = self.values @@ -1014,6 +1038,72 @@ def equals(self, other): return np.array_equal(self.values, other.values) +class NonConsolidatableMixIn(object): + """ hold methods for the nonconsolidatable blocks """ + _can_consolidate = False + _verify_integrity = False + _validate_ndim = False + _holder = None + + def __init__(self, values, placement, + ndim=None, fastpath=False,): + + # kludgetastic + if ndim is None: + if len(placement) != 1: + ndim = 1 + else: + ndim = 2 + self.ndim = ndim + + self.mgr_locs = placement + + if not isinstance(values, self._holder): + raise TypeError("values 
must be {0}".format(self._holder.__name__)) + + self.values = values + + def get_values(self, dtype=None): + """ need to to_dense myself (and always return a ndim sized object) """ + values = self.values.to_dense() + if values.ndim == self.ndim - 1: + values = values.reshape((1,) + values.shape) + return values + + def iget(self, col): + + if self.ndim == 2 and isinstance(col, tuple): + col, loc = col + if col != 0: + raise IndexError("{0} only contains one item".format(self)) + return self.values[loc] + else: + if col != 0: + raise IndexError("{0} only contains one item".format(self)) + return self.values + + def should_store(self, value): + return isinstance(value, self._holder) + + def set(self, locs, values, check=False): + assert locs.tolist() == [0] + self.values = values + + def get(self, item): + if self.ndim == 1: + loc = self.items.get_loc(item) + return self.values[loc] + else: + return self.values + + def _slice(self, slicer): + """ return a slice of my values (but densify first) """ + return self.get_values()[slicer] + + def _try_cast_result(self, result, dtype=None): + return result + + class NumericBlock(Block): __slots__ = () is_numeric = True @@ -1444,6 +1534,110 @@ def re_replacer(s): make_block(new_values, fastpath=True, placement=self.mgr_locs)] +class CategoricalBlock(NonConsolidatableMixIn, ObjectBlock): + __slots__ = () + is_categorical = True + _can_hold_na = True + _holder = Categorical + + def __init__(self, values, placement, + fastpath=False, **kwargs): + + # coerce to categorical if we can + super(CategoricalBlock, self).__init__(_maybe_to_categorical(values), + fastpath=True, placement=placement, + **kwargs) + + def to_dense(self): + return self.values.to_dense().view() + + @property + def shape(self): + return (len(self.mgr_locs), len(self.values)) + + @property + def array_dtype(self): + """ the dtype to return if I want to construct this block as an array """ + return np.object_ + + def _slice(self, slicer): + """ return a slice of 
my values """ + + # slice the category + # return same dims as we currently have + return self.values._slice(slicer) + + def fillna(self, value, limit=None, inplace=False, downcast=None): + # we may need to upcast our fill to match our dtype + if limit is not None: + raise NotImplementedError + + values = self.values if inplace else self.values.copy() + return [self.make_block_same_class(values=values.fillna(fill_value=value, + limit=limit), + placement=self.mgr_locs)] + + def interpolate(self, method='pad', axis=0, inplace=False, + limit=None, fill_value=None, **kwargs): + + values = self.values if inplace else self.values.copy() + return self.make_block_same_class(values=values.fillna(fill_value=fill_value, + method=method, + limit=limit), + placement=self.mgr_locs) + + def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): + """ + Take values according to indexer and return them as a block. + + """ + if fill_tuple is None: + fill_value = None + else: + fill_value = fill_tuple[0] + + # axis doesn't matter; we are really a single-dim object + # but are passed the axis depending on the calling routine + # if it's REALLY axis 0, then this will be a reindex and not a take + new_values = self.values.take_nd(indexer, fill_value=fill_value) + + # if we are a 1-dim object, then always place at 0 + if self.ndim == 1: + new_mgr_locs = [0] + else: + if new_mgr_locs is None: + new_mgr_locs = self.mgr_locs + + return self.make_block_same_class(new_values, new_mgr_locs) + + def _astype(self, dtype, copy=False, raise_on_error=True, values=None, + klass=None): + """ + Coerce to the new type (if copy=True, return a new copy) + raise on an except if raise == True + """ + + if dtype == com.CategoricalDtype(): + values = self.values + else: + values = np.array(self.values).astype(dtype) + + if copy: + values = values.copy() + + return make_block(values, + ndim=self.ndim, + placement=self.mgr_locs) + + def _validate_merge(self, blocks): + """ validate that we can 
merge these blocks """ + + levels = self.values.levels + for b in blocks: + if not levels.equals(b.values.levels): + raise ValueError("incompatible levels in categorical block merge") + + return True class DatetimeBlock(Block): __slots__ = () @@ -1589,16 +1783,14 @@ def get_values(self, dtype=None): .reshape(self.values.shape) return self.values - -class SparseBlock(Block): +class SparseBlock(NonConsolidatableMixIn, Block): """ implement as a list of sparse arrays of the same dtype """ __slots__ = () is_sparse = True is_numeric = True _can_hold_na = True - _can_consolidate = False - _verify_integrity = False _ftype = 'sparse' + _holder = SparseArray def __init__(self, values, placement, ndim=None, fastpath=False,): @@ -1653,11 +1845,6 @@ def sp_values(self, v): fill_value=self.values.fill_value, copy=False) - def iget(self, col): - if col != 0: - raise IndexError("SparseBlock only contains one item") - return self.values - @property def sp_index(self): return self.values.sp_index @@ -1672,31 +1859,6 @@ def __len__(self): except: return 0 - def should_store(self, value): - return isinstance(value, SparseArray) - - def set(self, locs, values, check=False): - assert locs.tolist() == [0] - self.values = values - - def get(self, item): - if self.ndim == 1: - loc = self.items.get_loc(item) - return self.values[loc] - else: - return self.values - - def _slice(self, slicer): - """ return a slice of my values (but densify first) """ - return self.get_values()[slicer] - - def get_values(self, dtype=None): - """ need to to_dense myself (and always return a ndim sized object) """ - values = self.values.to_dense() - if values.ndim == self.ndim - 1: - values = values.reshape((1,) + values.shape) - return values - def copy(self, deep=True): return self.make_block_same_class(values=self.values, sparse_index=self.sp_index, @@ -1797,9 +1959,6 @@ def sparse_reindex(self, new_index): return self.make_block_same_class(values, sparse_index=new_index, placement=self.mgr_locs) - def 
_try_cast_result(self, result, dtype=None): - return result - def make_block(values, placement, klass=None, ndim=None, dtype=None, fastpath=False): @@ -1823,6 +1982,8 @@ def make_block(values, placement, klass=None, ndim=None, klass = DatetimeBlock elif issubclass(vtype, np.complexfloating): klass = ComplexBlock + elif _is_categorical(values): + klass = CategoricalBlock else: @@ -1936,7 +2097,7 @@ def make_empty(self, axes=None): # preserve dtype if possible if self.ndim == 1: - blocks = np.array([], dtype=self.dtype) + blocks = np.array([], dtype=self.array_dtype) else: blocks = [] return self.__class__(blocks, axes) @@ -2599,6 +2760,7 @@ def iget(self, i, fastpath=True): # fastpath shortcut for select a single-dim from a 2-dim BM return SingleBlockManager([ block.make_block_same_class(values, placement=slice(0, len(values)), + ndim=1, fastpath=True) ], self.axes[1]) @@ -2660,11 +2822,20 @@ def set(self, item, value, check=False): if check, then validate that we are not setting the same data in-place """ # FIXME: refactor, clearly separate broadcasting & zip-like assignment + # can prob also fix the various if tests for sparse/categorical + value_is_sparse = isinstance(value, SparseArray) + value_is_cat = _is_categorical(value) + value_is_nonconsolidatable = value_is_sparse or value_is_cat if value_is_sparse: + # sparse assert self.ndim == 2 + def value_getitem(placement): + return value + elif value_is_cat: + # categorical def value_getitem(placement): return value else: @@ -2733,7 +2904,7 @@ def value_getitem(placement): unfit_count = len(unfit_mgr_locs) new_blocks = [] - if value_is_sparse: + if value_is_nonconsolidatable: # This code (ab-)uses the fact that sparse blocks contain only # one item. new_blocks.extend( @@ -2930,8 +3101,8 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None): blk = self.blocks[blkno] # Otherwise, slicing along items axis is necessary. 
- if blk.is_sparse: - # A sparse block, it's easy, because there's only one item + if not blk._can_consolidate: + # A non-consolidatable block, it's easy, because there's only one item # and each mgr loc is a copy of that single item. for mgr_loc in mgr_locs: newblk = blk.copy(deep=True) @@ -3146,6 +3317,10 @@ def convert(self, **kwargs): def dtype(self): return self._values.dtype + @property + def array_dtype(self): + return self._block.array_dtype + @property def ftype(self): return self._block.ftype @@ -3166,6 +3341,10 @@ def get_ftypes(self): def values(self): return self._values.view() + def get_values(self): + """ return a dense type view """ + return np.array(self._block.to_dense(),copy=False) + @property def itemsize(self): return self._values.itemsize @@ -3250,6 +3429,7 @@ def form_blocks(arrays, names, axes): object_items = [] sparse_items = [] datetime_items = [] + cat_items = [] extra_locs = [] names_idx = Index(names) @@ -3290,6 +3470,8 @@ def form_blocks(arrays, names, axes): int_items.append((i, k, v)) elif v.dtype == np.bool_: bool_items.append((i, k, v)) + elif _is_categorical(v): + cat_items.append((i, k, v)) else: object_items.append((i, k, v)) @@ -3326,6 +3508,14 @@ def form_blocks(arrays, names, axes): sparse_blocks = _sparse_blockify(sparse_items) blocks.extend(sparse_blocks) + if len(cat_items) > 0: + cat_blocks = [ make_block(array, + klass=CategoricalBlock, + fastpath=True, + placement=[i] + ) for i, names, array in cat_items ] + blocks.extend(cat_blocks) + if len(extra_locs): shape = (len(extra_locs),) + tuple(len(x) for x in axes[1:]) @@ -3437,12 +3627,16 @@ def _lcd_dtype(l): have_complex = len(counts[ComplexBlock]) > 0 have_dt64 = len(counts[DatetimeBlock]) > 0 have_td64 = len(counts[TimeDeltaBlock]) > 0 + have_cat = len(counts[CategoricalBlock]) > 0 have_sparse = len(counts[SparseBlock]) > 0 have_numeric = have_float or have_complex or have_int + has_non_numeric = have_dt64 or have_td64 or have_cat + if (have_object or (have_bool and 
have_numeric) or - (have_numeric and (have_dt64 or have_td64))): + (have_numeric and has_non_numeric) or + have_cat): return np.dtype(object) elif have_bool: return np.dtype(bool) @@ -3731,7 +3925,9 @@ def get_empty_dtype_and_na(join_units): if dtype is None: continue - if issubclass(dtype.type, (np.object_, np.bool_)): + if com.is_categorical_dtype(dtype): + upcast_cls = 'category' + elif issubclass(dtype.type, (np.object_, np.bool_)): upcast_cls = 'object' elif is_datetime64_dtype(dtype): upcast_cls = 'datetime' @@ -3754,6 +3950,8 @@ def get_empty_dtype_and_na(join_units): # create the result if 'object' in upcast_classes: return np.dtype(np.object_), np.nan + elif 'category' in upcast_classes: + return com.CategoricalDtype(), np.nan elif 'float' in upcast_classes: return np.dtype(np.float64), np.nan elif 'datetime' in upcast_classes: @@ -3788,10 +3986,15 @@ def concatenate_join_units(join_units, concat_axis, copy): # FIXME: optimization potential: if len(join_units) == 1, single join unit # is densified and sparsified back. - if any(unit.is_sparse for unit in join_units): - # If one of the units was sparse, concat_values are 2d and there's only - # one item. 
- return SparseArray(concat_values[0]) + if any(unit.needs_block_conversion for unit in join_units): + + # need to ask the join unit block to convert to the underlying repr for us + blocks = [ unit.block for unit in join_units if unit.block is not None ] + + # may need to validate this combination + blocks[0]._validate_merge(blocks) + + return blocks[0]._holder(concat_values[0]) else: return concat_values @@ -4017,8 +4220,10 @@ def is_null(self): return True @cache_readonly - def is_sparse(self): - return self.block is not None and self.block.is_sparse + def needs_block_conversion(self): + """ we might need to convert the joined values to a suitable block repr """ + block = self.block + return block is not None and (block.is_sparse or block.is_categorical) def get_reindexed_values(self, empty_dtype, upcasted_na): if upcasted_na is None: diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 780edec6ea25b..abe1974705243 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -456,10 +456,12 @@ def na_op(x, y): result = np.empty(x.size, dtype=dtype) mask = notnull(x) & notnull(y) result[mask] = op(x[mask], y[mask]) - else: + elif isinstance(x, pa.Array): result = pa.empty(len(x), dtype=x.dtype) mask = notnull(x) result[mask] = op(x[mask], y) + else: + raise TypeError("{typ} cannot perform the operation {op}".format(typ=type(x).__name__,op=str_rep)) result, changed = com._maybe_upcast_putmask(result, ~mask, pa.NA) @@ -562,7 +564,7 @@ def wrapper(self, other): mask = isnull(self) - values = self.values + values = self.get_values() other = _index.convert_scalar(values, other) if issubclass(values.dtype.type, np.datetime64): @@ -749,12 +751,15 @@ def na_op(x, y): yrav = yrav[mask] if np.prod(xrav.shape) and np.prod(yrav.shape): result[mask] = op(xrav, yrav) - else: + elif hasattr(x,'size'): result = np.empty(x.size, dtype=x.dtype) mask = notnull(xrav) xrav = xrav[mask] if np.prod(xrav.shape): result[mask] = op(xrav, y) + else: + raise TypeError("cannot perform 
operation {op} between objects " + "of type {x} and {y}".format(op=name,x=type(x),y=type(y))) result, changed = com._maybe_upcast_putmask(result, ~mask, np.nan) result = result.reshape(x.shape) diff --git a/pandas/core/panel.py b/pandas/core/panel.py index e9f8893355f2d..1e6ed56386f63 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -97,7 +97,7 @@ def panel_index(time, panels, names=['time', 'panel']): time_factor = Categorical.from_array(time) panel_factor = Categorical.from_array(panels) - labels = [time_factor.labels, panel_factor.labels] + labels = [time_factor.codes, panel_factor.codes] levels = [time_factor.levels, panel_factor.levels] return MultiIndex(levels, labels, sortorder=None, names=names, verify_integrity=False) @@ -1045,7 +1045,7 @@ def _apply_2d(self, func, axis): return self._construct_return_type(dict(results)) def _reduce(self, op, axis=0, skipna=True, numeric_only=None, - filter_type=None, **kwds): + filter_type=None, name=None, **kwds): axis_name = self._get_axis_name(axis) axis_number = self._get_axis_number(axis_name) f = lambda x: op(x, axis=axis_number, skipna=skipna, **kwds) diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index e1712be7b5a5f..43784e15ab163 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -1013,13 +1013,13 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False): if dummy_na: number_of_cols += 1 - dummy_mat = np.eye(number_of_cols).take(cat.labels, axis=0) + dummy_mat = np.eye(number_of_cols).take(cat.codes, axis=0) if dummy_na: levels = np.append(cat.levels, np.nan) else: # reset NaN GH4446 - dummy_mat[cat.labels == -1] = 0 + dummy_mat[cat.codes == -1] = 0 if prefix is not None: dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v) @@ -1067,7 +1067,7 @@ def make_axis_dummies(frame, axis='minor', transform=None): if transform is not None: mapped_items = items.map(transform) cat = Categorical.from_array(mapped_items.take(labels)) - labels = cat.labels + labels = 
cat.codes items = cat.levels values = np.eye(len(items), dtype=float) diff --git a/pandas/core/series.py b/pandas/core/series.py index a484efe75e284..24092a8e9e978 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -199,9 +199,10 @@ def __init__(self, data=None, index=None, dtype=None, name=None, else: data = data.reindex(index, copy=copy) elif isinstance(data, Categorical): + if dtype is not None: + raise ValueError("cannot specify a dtype with a Categorical") if name is None: name = data.name - data = np.asarray(data) elif isinstance(data, types.GeneratorType): data = list(data) elif isinstance(data, (set, frozenset)): @@ -369,7 +370,7 @@ def __array__(self, result=None): """ the array interface, return my values """ - return self.values + return self.get_values() def __array_wrap__(self, result, context=None): """ @@ -382,6 +383,14 @@ def __array_prepare__(self, result, context=None): """ Gets called prior to a ufunc """ + + # nice error message for non-ufunc types + if context is not None and not isinstance(self.values, np.ndarray): + obj = context[1][0] + raise TypeError("{obj} with dtype {dtype} cannot perform " + "the numpy op {op}".format(obj=type(obj).__name__, + dtype=getattr(obj,'dtype',None), + op=context[0].__name__)) return result # complex @@ -664,7 +673,10 @@ def _set_with(self, key, value): pass if not isinstance(key, (list, Series, pa.Array, Series)): - key = list(key) + try: + key = list(key) + except: + key = [ key ] if isinstance(key, Index): key_type = key.inferred_type @@ -994,7 +1006,7 @@ def values(self): def get_values(self): """ same as values (but handles sparseness conversions); is a view """ - return self._data.values + return self._data.get_values() def tolist(self): """ Convert Series to a nested list """ @@ -1387,8 +1399,8 @@ def dot(self, other): else: # pragma: no cover raise TypeError('unsupported type: %s' % type(other)) -#------------------------------------------------------------------------------ -# 
Combination + #------------------------------------------------------------------------------ + # Combination def append(self, to_append, verify_integrity=False): """ @@ -2004,9 +2016,19 @@ def apply(self, func, convert_dtype=True, args=(), **kwds): index=self.index).__finalize__(self) def _reduce(self, op, axis=0, skipna=True, numeric_only=None, - filter_type=None, **kwds): - """ perform a reduction operation """ - return op(_values_from_object(self), skipna=skipna, **kwds) + filter_type=None, name=None, **kwds): + """ + perform a reduction operation + + if we have an ndarray as a value, then simply perform the operation, + otherwise delegate to the object + + """ + delegate = self.values + if isinstance(delegate, np.ndarray): + return op(delegate, skipna=skipna, **kwds) + return delegate._reduce(op=op, axis=axis, skipna=skipna, numeric_only=numeric_only, + filter_type=filter_type, name=name, **kwds) def _reindex_indexer(self, new_index, indexer, copy): if indexer is None: @@ -2377,6 +2399,14 @@ def to_period(self, freq=None, copy=True): new_index = self.index.to_period(freq=freq) return self._constructor(new_values, index=new_index).__finalize__(self) + #------------------------------------------------------------------------------ + # Categorical methods + + @property + def cat(self): + if not com.is_categorical_dtype(self.dtype): + raise TypeError("Can only use .cat accessor with a 'category' dtype") + return self.values Series._setup_axes(['index'], info_axis=0, stat_axis=0, aliases={'rows': 0}) @@ -2454,6 +2484,13 @@ def _try_cast(arr, take_fast_path): if copy: subarr = data.copy() + elif isinstance(data, Categorical): + subarr = data + + if copy: + subarr = data.copy() + return subarr + elif isinstance(data, list) and len(data) > 0: if dtype is not None: try: diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index cee1867e73179..6d765fb314876 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -3494,7 +3494,7 @@ def read(self, 
where=None, columns=None, **kwargs): factors = [Categorical.from_array(a.values) for a in self.index_axes] levels = [f.levels for f in factors] N = [len(f.levels) for f in factors] - labels = [f.labels for f in factors] + labels = [f.codes for f in factors] # compute the key key = factor_indexer(N[1:], labels) diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index b045867b06263..1a2673342df45 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -198,6 +198,9 @@ def test_read_dta4(self): columns=['fully_labeled', 'fully_labeled2', 'incompletely_labeled', 'labeled_with_missings', 'float_labelled']) + # these are all categoricals + expected = pd.concat([ Series(pd.Categorical(value)) for col, value in expected.iteritems() ],axis=1) + tm.assert_frame_equal(parsed_113, expected) tm.assert_frame_equal(parsed_114, expected) tm.assert_frame_equal(parsed_115, expected) diff --git a/pandas/sparse/series.py b/pandas/sparse/series.py index 48576266c3b5f..bb428b7e4c6bb 100644 --- a/pandas/sparse/series.py +++ b/pandas/sparse/series.py @@ -213,6 +213,10 @@ def values(self): """ return the array """ return self._data._values + def __array__(self, result=None): + """ the array interface, return my values """ + return self._data._values + def get_values(self): """ same as values """ return self._data._values.to_dense().view() @@ -299,6 +303,11 @@ def __array_finalize__(self, obj): self.name = getattr(obj, 'name', None) self.fill_value = getattr(obj, 'fill_value', None) + def _reduce(self, op, axis=0, skipna=True, numeric_only=None, + filter_type=None, name=None, **kwds): + """ perform a reduction operation """ + return op(self.get_values(), skipna=skipna, **kwds) + def __getstate__(self): # pickling return dict(_typ=self._typ, diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index ec2c64242f146..6353ad53a88ef 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -203,6 +203,7 @@ class 
TestValueCounts(tm.TestCase): _multiprocess_can_split_ = True def test_value_counts(self): + np.random.seed(1234) from pandas.tools.tile import cut arr = np.random.randn(4) @@ -212,7 +213,7 @@ def test_value_counts(self): result = algos.value_counts(factor) expected = algos.value_counts(np.asarray(factor)) - tm.assert_series_equal(result, expected) + tm.assert_series_equal(result.sort_index(), expected.sort_index()) def test_value_counts_bins(self): s = [1, 2, 3, 4] diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index a195b57382b95..24e65ce9c2f73 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -4,20 +4,19 @@ from pandas.compat import range, lrange, u import nose import re +import os import numpy as np +import pandas as pd + +from pandas import (Categorical, Index, Int64Index, MultiIndex, + Series, DataFrame, PeriodIndex, Timestamp) -from pandas.core.categorical import Categorical -from pandas.core.index import Index, Int64Index, MultiIndex -from pandas.core.frame import DataFrame -from pandas.tseries.period import PeriodIndex from pandas.util.testing import assert_almost_equal import pandas.core.common as com -from pandas.tseries.period import PeriodIndex - +import pandas.compat as compat import pandas.util.testing as tm - class TestCategorical(tm.TestCase): _multiprocess_can_split_ = True @@ -30,18 +29,90 @@ def test_getitem(self): self.assertEqual(self.factor[-1], 'c') subf = self.factor[[0, 1, 2]] - tm.assert_almost_equal(subf.labels, [0, 1, 1]) + tm.assert_almost_equal(subf._codes, [0, 1, 1]) subf = self.factor[np.asarray(self.factor) == 'c'] - tm.assert_almost_equal(subf.labels, [2, 2, 2]) + tm.assert_almost_equal(subf._codes, [2, 2, 2]) def test_constructor_unsortable(self): - raise nose.SkipTest('skipping for now') - - arr = np.array([1, 2, 3, datetime.now()], dtype='O') # it works! 
+ arr = np.array([1, 2, 3, datetime.now()], dtype='O') factor = Categorical.from_array(arr) + self.assertFalse(factor.ordered) + + def test_constructor(self): + # There are multiple ways to call a constructor + + # old style: two arrays, one a pointer to the labels + # old style is now only available with compat=True + exp_arr = np.array(["a", "b", "c", "a", "b", "c"]) + with tm.assert_produces_warning(FutureWarning): + c_old = Categorical([0,1,2,0,1,2], levels=["a","b","c"], compat=True) + self.assert_numpy_array_equal(c_old.__array__(), exp_arr) + # the next one are from the old docs + with tm.assert_produces_warning(FutureWarning): + c_old2 = Categorical([0, 1, 2, 0, 1, 2], [1, 2, 3], compat=True) + self.assert_numpy_array_equal(c_old2.__array__(), np.array([1, 2, 3, 1, 2, 3])) + with tm.assert_produces_warning(FutureWarning): + c_old3 = Categorical([0,1,2,0,1,2], ['a', 'b', 'c'], compat=True) + self.assert_numpy_array_equal(c_old3.__array__(), np.array(['a', 'b', 'c', 'a', 'b', 'c'])) + + with tm.assert_produces_warning(FutureWarning): + cat = pd.Categorical([1,2], levels=[1,2,3], compat=True) + self.assert_numpy_array_equal(cat.__array__(), np.array([2,3])) + + with tm.assert_produces_warning(None): + cat = pd.Categorical([1,2], levels=[1,2,3], compat=False) + self.assert_numpy_array_equal(cat.__array__(), np.array([1,2])) + + # new style + c1 = Categorical(exp_arr) + self.assert_numpy_array_equal(c1.__array__(), exp_arr) + c2 = Categorical(exp_arr, levels=["a","b","c"]) + self.assert_numpy_array_equal(c2.__array__(), exp_arr) + c2 = Categorical(exp_arr, levels=["c","b","a"]) + self.assert_numpy_array_equal(c2.__array__(), exp_arr) + + # Categorical as input + c1 = Categorical(["a", "b", "c", "a"]) + c2 = Categorical(c1) + self.assertTrue(c1.equals(c2)) + + c1 = Categorical(["a", "b", "c", "a"], levels=["a","b","c","d"]) + c2 = Categorical(c1) + self.assertTrue(c1.equals(c2)) + + c1 = Categorical(["a", "b", "c", "a"], levels=["a","c","b"]) + c2 = 
Categorical(c1) + self.assertTrue(c1.equals(c2)) + + c1 = Categorical(["a", "b", "c", "a"], levels=["a","c","b"]) + c2 = Categorical(c1, levels=["a","b","c"]) + self.assert_numpy_array_equal(c1.__array__(), c2.__array__()) + self.assert_numpy_array_equal(c2.levels, np.array(["a","b","c"])) + + # Series of dtype category + c1 = Categorical(["a", "b", "c", "a"], levels=["a","b","c","d"]) + c2 = Categorical(Series(c1)) + self.assertTrue(c1.equals(c2)) + + c1 = Categorical(["a", "b", "c", "a"], levels=["a","c","b"]) + c2 = Categorical(Series(c1)) + self.assertTrue(c1.equals(c2)) + + # Series + c1 = Categorical(["a", "b", "c", "a"]) + c2 = Categorical(Series(["a", "b", "c", "a"])) + self.assertTrue(c1.equals(c2)) + + c1 = Categorical(["a", "b", "c", "a"], levels=["a","b","c","d"]) + c2 = Categorical(Series(["a", "b", "c", "a"]), levels=["a","b","c","d"]) + self.assertTrue(c1.equals(c2)) + + # This should result in integer levels, not float! + cat = pd.Categorical([1,2,3,np.nan], levels=[1,2,3]) + self.assertTrue(com.is_integer_dtype(cat.levels)) def test_factor_agg(self): import pandas.core.frame as frame @@ -50,9 +121,9 @@ def test_factor_agg(self): f = np.sum agged = frame.factor_agg(self.factor, arr, f) - labels = self.factor.labels + pointers = self.factor._codes for i, idx in enumerate(self.factor.levels): - self.assertEqual(f(arr[labels == i]), agged[i]) + self.assertEqual(f(arr[pointers == i]), agged[i]) def test_comparisons(self): result = self.factor[self.factor == 'a'] @@ -97,7 +168,7 @@ def test_na_flags_int_levels(self): labels = np.random.randint(0, 10, 20) labels[::5] = -1 - cat = Categorical(labels, levels) + cat = Categorical(labels, levels, fastpath=True) repr(cat) self.assert_numpy_array_equal(com.isnull(cat), labels == -1) @@ -127,7 +198,7 @@ def test_describe(self): def test_print(self): expected = [" a", " b", " b", " a", " a", " c", " c", " c", - "Levels (3): Index([a, b, c], dtype=object)"] + "Levels (3): Index([a, b, c], dtype=object), ordered"] 
expected = "\n".join(expected) # hack because array_repr changed in numpy > 1.6.x actual = repr(self.factor) @@ -138,11 +209,11 @@ def test_print(self): self.assertEqual(actual, expected) def test_big_print(self): - factor = Categorical([0,1,2,0,1,2]*100, ['a', 'b', 'c'], name='cat') + factor = Categorical([0,1,2,0,1,2]*100, ['a', 'b', 'c'], name='cat', fastpath=True) expected = [" a", " b", " c", " a", " b", " c", " a", " b", " c", " a", " b", " c", " a", "...", " c", " a", " b", " c", " a", " b", " c", " a", " b", " c", " a", " b", " c", - "Levels (3): Index([a, b, c], dtype=object)", + "Levels (3): Index([a, b, c], dtype=object), unordered", "Name: cat, Length: 600" ] expected = "\n".join(expected) @@ -157,7 +228,7 @@ def test_big_print(self): def test_empty_print(self): factor = Categorical([], ["a","b","c"], name="cat") expected = ("Categorical([], Name: cat, Levels (3): " - "Index([a, b, c], dtype=object)") + "Index([a, b, c], dtype=object), ordered") # hack because array_repr changed in numpy > 1.6.x actual = repr(factor) pat = "Index\(\['a', 'b', 'c']" @@ -168,7 +239,7 @@ def test_empty_print(self): factor = Categorical([], ["a","b","c"]) expected = ("Categorical([], Levels (3): " - "Index([a, b, c], dtype=object)") + "Index([a, b, c], dtype=object), ordered") # hack because array_repr changed in numpy > 1.6.x actual = repr(factor) pat = "Index\(\['a', 'b', 'c']" @@ -179,44 +250,1237 @@ def test_empty_print(self): factor = Categorical([], []) expected = ("Categorical([], Levels (0): " - "Index([], dtype=object)") + "Index([], dtype=object), ordered") self.assertEqual(repr(factor), expected) def test_periodindex(self): idx1 = PeriodIndex(['2014-01', '2014-01', '2014-02', '2014-02', '2014-03', '2014-03'], freq='M') - cat1 = Categorical.from_array(idx1) - exp_arr = np.array([0, 0, 1, 1, 2, 2]) + cat1 = Categorical.from_array(idx1) + str(cat1) + exp_arr = np.array([0, 0, 1, 1, 2, 2],dtype='int64') exp_idx = PeriodIndex(['2014-01', '2014-02', '2014-03'], 
freq='M') - - self.assert_numpy_array_equal(cat1.labels, exp_arr) + self.assert_numpy_array_equal(cat1._codes, exp_arr) self.assertTrue(cat1.levels.equals(exp_idx)) - idx2 = PeriodIndex(['2014-03', '2014-03', '2014-02', '2014-01', '2014-03', '2014-01'], freq='M') cat2 = Categorical.from_array(idx2) - - exp_arr = np.array([2, 2, 1, 0, 2, 0]) - - self.assert_numpy_array_equal(cat2.labels, exp_arr) - self.assertTrue(cat2.levels.equals(exp_idx)) + str(cat2) + exp_arr = np.array([2, 2, 1, 0, 2, 0],dtype='int64') + exp_idx2 = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M') + self.assert_numpy_array_equal(cat2._codes, exp_arr) + self.assertTrue(cat2.levels.equals(exp_idx2)) idx3 = PeriodIndex(['2013-12', '2013-11', '2013-10', '2013-09', '2013-08', '2013-07', '2013-05'], freq='M') cat3 = Categorical.from_array(idx3) - - exp_arr = np.array([6, 5, 4, 3, 2, 1, 0]) + exp_arr = np.array([6, 5, 4, 3, 2, 1, 0],dtype='int64') exp_idx = PeriodIndex(['2013-05', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12'], freq='M') - - self.assert_numpy_array_equal(cat3.labels, exp_arr) + self.assert_numpy_array_equal(cat3._codes, exp_arr) self.assertTrue(cat3.levels.equals(exp_idx)) + def test_level_assigments(self): + s = pd.Categorical(["a","b","c","a"]) + exp = np.array([1,2,3,1]) + s.levels = [1,2,3] + self.assert_numpy_array_equal(s.__array__(), exp) + self.assert_numpy_array_equal(s.levels, np.array([1,2,3])) + # lengthen + s.levels = [1,2,3,4] + # does nothing to the values, only changes the levels + self.assert_numpy_array_equal(s.__array__(), exp) + self.assert_numpy_array_equal(s.levels, np.array([1,2,3,4])) + # shorten + exp2 = np.array([1,2,np.nan,1]) + s.levels = [1,2] + self.assert_numpy_array_equivalent(s.__array__(), exp2) # doesn't work with nan :-( + self.assertTrue(np.isnan(s.__array__()[2])) + self.assert_numpy_array_equal(s.levels, np.array([1,2])) + + def test_category_reorder_levels(self): + cat = Categorical(["a","b","c","a"], ordered=True) + 
exp_levels = np.array(["c","b","a"]) + exp_values = np.array(["a","b","c","a"]) + cat.reorder_levels(["c","b","a"]) + self.assert_numpy_array_equal(cat.levels, exp_levels) + self.assert_numpy_array_equal(cat.__array__(), exp_values) + + def f(): + cat.reorder_levels(["a"]) + self.assertRaises(ValueError, f) + + def f(): + cat.reorder_levels(["a","b","d"]) + self.assertRaises(ValueError, f) + + def f(): + cat.reorder_levels(["a","b","c", "d"]) + self.assertRaises(ValueError, f) + + # internals... + c = Categorical([1,2,3,4,1], levels=[1,2,3,4]) + self.assert_numpy_array_equal(c._codes, np.array([0,1,2,3,0])) + self.assert_numpy_array_equal(c.levels , np.array([1,2,3,4] )) + self.assert_numpy_array_equal(c.get_values() , np.array([1,2,3,4,1] )) + c.reorder_levels([4,3,2,1]) # all "pointers" to '4' must be changed from 3 to 0,... + self.assert_numpy_array_equal(c._codes , np.array([3,2,1,0,3])) # positions are changed + self.assert_numpy_array_equal(c.levels , np.array([4,3,2,1])) # levels are now in new order + self.assert_numpy_array_equal(c.get_values() , np.array([1,2,3,4,1])) # output is the same + self.assertTrue(c.min(), 4) + self.assertTrue(c.max(), 1) + + def f(): + c.reorder_levels([4,3,2,10]) + self.assertRaises(ValueError, f) + + def test_remove_unused_levels(self): + c = Categorical(["a","b","c","d","a"], levels=["a","b","c","d","e"]) + self.assert_numpy_array_equal(c.levels , np.array(["a","b","c","d","e"])) + c.remove_unused_levels() + self.assert_numpy_array_equal(c.levels , np.array(["a","b","c","d"])) + + def test_nan_handling(self): + + # Nans are represented as -1 in labels + c = Categorical(["a","b",np.nan,"a"]) + self.assert_numpy_array_equal(c.levels , np.array(["a","b"])) + self.assert_numpy_array_equal(c._codes , np.array([0,1,-1,0])) + + # If levels have nan included, the label should point to that instead + c = Categorical(["a","b",np.nan,"a"], levels=["a","b",np.nan]) + self.assert_numpy_array_equal(c.levels , 
np.array(["a","b",np.nan],dtype=np.object_)) + self.assert_numpy_array_equal(c._codes , np.array([0,1,2,0])) + + # Changing levels should also make the replaced level np.nan + c = Categorical(["a","b","c","a"]) + c.levels = ["a","b",np.nan] + self.assert_numpy_array_equal(c.levels , np.array(["a","b",np.nan],dtype=np.object_)) + self.assert_numpy_array_equal(c._codes , np.array([0,1,2,0])) + + + def test_min_max(self): + + # unordered cats have no min/max + cat = Categorical(["a","b","c","d"], ordered=False) + self.assertRaises(TypeError, lambda : cat.min()) + self.assertRaises(TypeError, lambda : cat.max()) + cat = Categorical(["a","b","c","d"], ordered=True) + _min = cat.min() + _max = cat.max() + self.assertEqual(_min, "a") + self.assertEqual(_max, "d") + cat = Categorical(["a","b","c","d"], levels=['d','c','b','a'], ordered=True) + _min = cat.min() + _max = cat.max() + self.assertEqual(_min, "d") + self.assertEqual(_max, "a") + cat = Categorical([np.nan,"b","c",np.nan], levels=['d','c','b','a'], ordered=True) + _min = cat.min() + _max = cat.max() + self.assertTrue(np.isnan(_min)) + self.assertEqual(_max, "b") + + _min = cat.min(numeric_only=True) + self.assertEqual(_min, "c") + _max = cat.max(numeric_only=True) + self.assertEqual(_max, "b") + + cat = Categorical([np.nan,1,2,np.nan], levels=[5,4,3,2,1], ordered=True) + _min = cat.min() + _max = cat.max() + self.assertTrue(np.isnan(_min)) + self.assertEqual(_max, 1) + + _min = cat.min(numeric_only=True) + self.assertEqual(_min, 2) + _max = cat.max(numeric_only=True) + self.assertEqual(_max, 1) + + + def test_mode(self): + s = Categorical([1,1,2,4,5,5,5], levels=[5,4,3,2,1], ordered=True) + res = s.mode() + exp = Categorical([5], levels=[5,4,3,2,1], ordered=True) + self.assertTrue(res.equals(exp)) + s = Categorical([1,1,1,4,5,5,5], levels=[5,4,3,2,1], ordered=True) + res = s.mode() + exp = Categorical([5,1], levels=[5,4,3,2,1], ordered=True) + self.assertTrue(res.equals(exp)) + s = Categorical([1,2,3,4,5], 
levels=[5,4,3,2,1], ordered=True) + res = s.mode() + exp = Categorical([], levels=[5,4,3,2,1], ordered=True) + self.assertTrue(res.equals(exp)) + # NaN should not become the mode! + s = Categorical([np.nan,np.nan,np.nan,4,5], levels=[5,4,3,2,1], ordered=True) + res = s.mode() + exp = Categorical([], levels=[5,4,3,2,1], ordered=True) + self.assertTrue(res.equals(exp)) + s = Categorical([np.nan,np.nan,np.nan,4,5,4], levels=[5,4,3,2,1], ordered=True) + res = s.mode() + exp = Categorical([4], levels=[5,4,3,2,1], ordered=True) + self.assertTrue(res.equals(exp)) + s = Categorical([np.nan,np.nan,4,5,4], levels=[5,4,3,2,1], ordered=True) + res = s.mode() + exp = Categorical([4], levels=[5,4,3,2,1], ordered=True) + self.assertTrue(res.equals(exp)) + + + def test_sort(self): + + # unordered cats are not sortable + cat = Categorical(["a","b","b","a"], ordered=False) + self.assertRaises(TypeError, lambda : cat.sort()) + cat = Categorical(["a","c","b","d"], ordered=True) + + # order + res = cat.order() + exp = np.array(["a","b","c","d"],dtype=object) + self.assert_numpy_array_equal(res.__array__(), exp) + + cat = Categorical(["a","c","b","d"], levels=["a","b","c","d"], ordered=True) + res = cat.order() + exp = np.array(["a","b","c","d"],dtype=object) + self.assert_numpy_array_equal(res.__array__(), exp) + + res = cat.order(ascending=False) + exp = np.array(["d","c","b","a"],dtype=object) + self.assert_numpy_array_equal(res.__array__(), exp) + + # sort (inplace order) + cat1 = cat.copy() + cat1.sort() + exp = np.array(["a","b","c","d"],dtype=object) + self.assert_numpy_array_equal(cat1.__array__(), exp) + + def test_slicing_directly(self): + cat = Categorical(["a","b","c","d","a","b","c"]) + sliced = cat[3] + tm.assert_equal(sliced, "d") + sliced = cat[3:5] + expected = Categorical(["d","a"], levels=['a', 'b', 'c', 'd']) + self.assert_numpy_array_equal(sliced._codes, expected._codes) + tm.assert_index_equal(sliced.levels, expected.levels) + +class 
TestCategoricalAsBlock(tm.TestCase): + _multiprocess_can_split_ = True + + def setUp(self): + self.factor = Categorical.from_array(['a', 'b', 'b', 'a', + 'a', 'c', 'c', 'c']) + + df = DataFrame({'value': np.random.randint(0, 10000, 100)}) + labels = [ "{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500) ] + + df = df.sort(columns=['value'], ascending=True) + df['value_group'] = pd.cut(df.value, range(0, 10500, 500), right=False, labels=labels) + self.cat = df + + def test_dtypes(self): + + dtype = com.CategoricalDtype() + hash(dtype) + self.assertTrue(com.is_categorical_dtype(dtype)) + + s = Series(self.factor,name='A') + + # dtypes + self.assertTrue(com.is_categorical_dtype(s.dtype)) + self.assertTrue(com.is_categorical_dtype(s)) + self.assertFalse(com.is_categorical_dtype(np.dtype('float64'))) + + # np.dtype doesn't know about our new dtype + def f(): + np.dtype(dtype) + self.assertRaises(TypeError, f) + + self.assertFalse(dtype == np.str_) + self.assertFalse(np.str_ == dtype) + + def test_basic(self): + + # test basic creation / coercion of categoricals + s = Series(self.factor,name='A') + self.assertEqual(s.dtype,'category') + self.assertEqual(len(s),len(self.factor)) + str(s.values) + str(s) + + # in a frame + df = DataFrame({'A' : self.factor }) + result = df['A'] + tm.assert_series_equal(result,s) + result = df.iloc[:,0] + tm.assert_series_equal(result,s) + self.assertEqual(len(df),len(self.factor)) + str(df.values) + str(df) + + df = DataFrame({'A' : s }) + result = df['A'] + tm.assert_series_equal(result,s) + self.assertEqual(len(df),len(self.factor)) + str(df.values) + str(df) + + # multiples + df = DataFrame({'A' : s, 'B' : s, 'C' : 1}) + result1 = df['A'] + result2 = df['B'] + tm.assert_series_equal(result1,s) + tm.assert_series_equal(result2,s) + self.assertEqual(len(df),len(self.factor)) + str(df.values) + str(df) + + def test_creation_astype(self): + l = ["a","b","c","a"] + s = pd.Series(l) + exp = pd.Series(Categorical(l)) + res = 
s.astype('category') + tm.assert_series_equal(res, exp) + + l = [1,2,3,1] + s = pd.Series(l) + exp = pd.Series(Categorical(l)) + res = s.astype('category') + tm.assert_series_equal(res, exp) + + def test_sideeffects_free(self): + + # Passing a categorical to a Series and then changing values in either the series or the + # categorical should not change the values in the other one! + cat = Categorical(["a","b","c","a"]) + s = pd.Series(cat, copy=True) + self.assertFalse(s.cat is cat) + s.cat.levels = [1,2,3] + exp_s = np.array([1,2,3,1]) + exp_cat = np.array(["a","b","c","a"]) + self.assert_numpy_array_equal(s.__array__(), exp_s) + self.assert_numpy_array_equal(cat.__array__(), exp_cat) + + # setting + s[0] = 2 + exp_s2 = np.array([2,2,3,1]) + self.assert_numpy_array_equal(s.__array__(), exp_s2) + self.assert_numpy_array_equal(cat.__array__(), exp_cat) + + # however, copy is False by default + # so this WILL change values + cat = Categorical(["a","b","c","a"]) + s = pd.Series(cat) + self.assertTrue(s.cat is cat) + s.cat.levels = [1,2,3] + exp_s = np.array([1,2,3,1]) + self.assert_numpy_array_equal(s.__array__(), exp_s) + self.assert_numpy_array_equal(cat.__array__(), exp_s) + + s[0] = 2 + exp_s2 = np.array([2,2,3,1]) + self.assert_numpy_array_equal(s.__array__(), exp_s2) + self.assert_numpy_array_equal(cat.__array__(), exp_s2) + + def test_nan_handling(self): + + # Nans are represented as -1 in labels + s = Series(Categorical(["a","b",np.nan,"a"])) + self.assert_numpy_array_equal(s.cat.levels, np.array(["a","b"])) + self.assert_numpy_array_equal(s.cat._codes, np.array([0,1,-1,0])) + + # If levels have nan included, the label should point to that instead + s2 = Series(Categorical(["a","b",np.nan,"a"], levels=["a","b",np.nan])) + self.assert_numpy_array_equal(s2.cat.levels, + np.array(["a","b",np.nan], dtype=np.object_)) + self.assert_numpy_array_equal(s2.cat._codes, np.array([0,1,2,0])) + + # Changing levels should also make the replaced level np.nan + s3 = 
Series(Categorical(["a","b","c","a"])) + s3.cat.levels = ["a","b",np.nan] + self.assert_numpy_array_equal(s3.cat.levels, + np.array(["a","b",np.nan], dtype=np.object_)) + self.assert_numpy_array_equal(s3.cat._codes, np.array([0,1,2,0])) + + + def test_series_delegations(self): + + # invalid accessor + self.assertRaises(TypeError, lambda : Series([1,2,3]).cat) + tm.assertRaisesRegexp(TypeError, + r"Can only use .cat accessor with a 'category' dtype", + lambda : Series([1,2,3]).cat) + self.assertRaises(TypeError, lambda : Series(['a','b','c']).cat) + self.assertRaises(TypeError, lambda : Series(np.arange(5.)).cat) + self.assertRaises(TypeError, lambda : Series([Timestamp('20130101')]).cat) + + # Series should delegate calls to '.level', '.ordered' and '.reorder()' to the categorical + s = Series(Categorical(["a","b","c","a"], ordered=True)) + exp_levels = np.array(["a","b","c"]) + self.assert_numpy_array_equal(s.cat.levels, exp_levels) + + s.cat.levels = [1,2,3] + exp_levels = np.array([1,2,3]) + self.assert_numpy_array_equal(s.cat.levels, exp_levels) + self.assertEqual(s.cat.ordered, True) + s.cat.ordered = False + self.assertEqual(s.cat.ordered, False) + + # reorder + s = Series(Categorical(["a","b","c","a"], ordered=True)) + exp_levels = np.array(["c","b","a"]) + exp_values = np.array(["a","b","c","a"]) + s.cat.reorder_levels(["c","b","a"]) + self.assert_numpy_array_equal(s.cat.levels, exp_levels) + self.assert_numpy_array_equal(s.cat.__array__(), exp_values) + self.assert_numpy_array_equal(s.__array__(), exp_values) + + # remove unused levels + s = Series(Categorical(["a","b","b","a"], levels=["a","b","c"])) + exp_levels = np.array(["a","b"]) + exp_values = np.array(["a","b","b","a"]) + s.cat.remove_unused_levels() + self.assert_numpy_array_equal(s.cat.levels, exp_levels) + self.assert_numpy_array_equal(s.cat.__array__(), exp_values) + self.assert_numpy_array_equal(s.__array__(), exp_values) + + # This method is likely to be confused, so test that it raises an 
error on wrong inputs: + def f(): + s.reorder_levels([4,3,2,1]) + self.assertRaises(Exception, f) + # right: s.cat.reorder_levels([4,3,2,1]) + + def test_series_functions_no_warnings(self): + df = pd.DataFrame({'value': np.random.randint(0, 100, 20)}) + labels = [ "{0} - {1}".format(i, i + 9) for i in range(0, 100, 10)] + with tm.assert_produces_warning(False): + df['group'] = pd.cut(df.value, range(0, 105, 10), right=False, labels=labels) + + def test_assignment_to_dataframe(self): + # assignment + df = DataFrame({'value': np.array(np.random.randint(0, 10000, 100),dtype='int32')}) + labels = [ "{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500) ] + + df = df.sort(columns=['value'], ascending=True) + d = pd.cut(df.value, range(0, 10500, 500), right=False, labels=labels) + s = Series(d) + df['D'] = d + str(df) + + result = df.dtypes + expected = Series([np.dtype('int32'), com.CategoricalDtype()],index=['value','D']) + tm.assert_series_equal(result,expected) + + df['E'] = s + str(df) + + result = df.dtypes + expected = Series([np.dtype('int32'), com.CategoricalDtype(), com.CategoricalDtype()], + index=['value','D','E']) + tm.assert_series_equal(result,expected) + + result1 = df['D'] + result2 = df['E'] + self.assertTrue(result1._data._block.values.equals(d)) + + # sorting + s.name = 'E' + self.assertTrue(result2.sort_index().equals(s)) + + # FIXME? + #### what does this compare to? ### + result = df.sort_index() + + cat = pd.Categorical([1,2,3,10], levels=[1,2,3,4,10]) + df = pd.DataFrame(pd.Series(cat)) + + def test_describe(self): + + ###FIXME### + # This should(?) 
only includes the value column, not the value_group + result = self.cat['value_group'].describe() + result = self.cat.describe() + self.assertEquals(len(result.columns),1) + + s = Series(Categorical(["a","b","b","b"], levels=['a','b','c'], ordered=True)) + result = s.cat.describe() + + expected = DataFrame([[1,0.25],[3,0.75],[np.nan,np.nan]], + columns=['counts','freqs'], + index=Index(['a','b','c'],name='levels')) + tm.assert_frame_equal(result,expected) + + ###FIXME###x: this correctly returns 2 for the uniques, should we add an attribute levels? + result = s.describe() + expected = Series([4,2],index=['count','unique']) + tm.assert_series_equal(result,expected) + + def test_groupby(self): + + result = self.cat['value_group'].unique() + result = self.cat.groupby(['value_group'])['value_group'].count() + + def test_groupby_sort(self): + + # http://stackoverflow.com/questions/23814368/sorting-pandas-categorical-labels-after-groupby + # This should result in a properly sorted Series so that the plot + # has a sorted x axis + #self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar') + + res = self.cat.groupby(['value_group'])['value_group'].count() + exp = res[sorted(res.index, key=lambda x: float(x.split()[0]))] + tm.assert_series_equal(res, exp) + + def test_min_max(self): + # unordered cats have no min/max + cat = Series(Categorical(["a","b","c","d"], ordered=False)) + self.assertRaises(TypeError, lambda : cat.min()) + self.assertRaises(TypeError, lambda : cat.max()) + + cat = Series(Categorical(["a","b","c","d"], ordered=True)) + _min = cat.min() + _max = cat.max() + self.assertEqual(_min, "a") + self.assertEqual(_max, "d") + + cat = Series(Categorical(["a","b","c","d"], levels=['d','c','b','a'], ordered=True)) + _min = cat.min() + _max = cat.max() + self.assertEqual(_min, "d") + self.assertEqual(_max, "a") + + cat = Series(Categorical([np.nan,"b","c",np.nan], levels=['d','c','b','a'], ordered=True)) + _min = cat.min() + _max = cat.max() + 
self.assertTrue(np.isnan(_min)) + self.assertEqual(_max, "b") + + cat = Series(Categorical([np.nan,1,2,np.nan], levels=[5,4,3,2,1], ordered=True)) + _min = cat.min() + _max = cat.max() + self.assertTrue(np.isnan(_min)) + self.assertEqual(_max, 1) + + def test_mode(self): + s = Series(Categorical([1,1,2,4,5,5,5], levels=[5,4,3,2,1], ordered=True)) + res = s.mode() + exp = Series(Categorical([5], levels=[5,4,3,2,1], ordered=True)) + tm.assert_series_equal(res, exp) + s = Series(Categorical([1,1,1,4,5,5,5], levels=[5,4,3,2,1], ordered=True)) + res = s.mode() + exp = Series(Categorical([5,1], levels=[5,4,3,2,1], ordered=True)) + tm.assert_series_equal(res, exp) + s = Series(Categorical([1,2,3,4,5], levels=[5,4,3,2,1], ordered=True)) + res = s.mode() + exp = Series(Categorical([], levels=[5,4,3,2,1], ordered=True)) + tm.assert_series_equal(res, exp) + + def test_value_counts(self): + + s = pd.Series(pd.Categorical(["a","b","c","c","c","b"], levels=["c","a","b","d"])) + res = s.value_counts(sort=False) + exp = Series([3,1,2,0], index=["c","a","b","d"]) + tm.assert_series_equal(res, exp) + res = s.value_counts(sort=True) + exp = Series([3,2,1,0], index=["c","b","a","d"]) + tm.assert_series_equal(res, exp) + + def test_groupby(self): + + cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], levels=["a","b","c","d"]) + data = DataFrame({"a":[1,1,1,2,2,2,3,4,5], "b":cats}) + + result = data.groupby("b").mean() + result = result["a"].values + exp = np.array([1,2,4,np.nan]) + self.assert_numpy_array_equivalent(result, exp) + + ### FIXME ### + + #res = len(data.groupby("b")) + #self.assertEqual(res ,4) + + raw_cat1 = Categorical(["a","a","b","b"], levels=["a","b","z"]) + raw_cat2 = Categorical(["c","d","c","d"], levels=["c","d","y"]) + df = DataFrame({"A":raw_cat1,"B":raw_cat2, "values":[1,2,3,4]}) + gb = df.groupby("A") + + #idx = gb.indices + #self.assertEqual(len(gb), 3) + #num = 0 + #for _ in gb: + # num +=1 + #self.assertEqual(len(gb), 3) + #gb = 
df.groupby(["B"]) + #idx2 = gb.indices + #self.assertEqual(len(gb), 3) + #num = 0 + #for _ in gb: + # num +=1 + #self.assertEqual(len(gb), 3) + #gb = df.groupby(["A","B"]) + #res = len(gb) + #idx3 = gb.indices + #self.assertEqual(res, 9) + #num = 0 + #for _ in gb: + # num +=1 + #self.assertEqual(len(gb), 9) + + def test_pivot_table(self): + + raw_cat1 = Categorical(["a","a","b","b"], levels=["a","b","z"]) + raw_cat2 = Categorical(["c","d","c","d"], levels=["c","d","y"]) + df = DataFrame({"A":raw_cat1,"B":raw_cat2, "values":[1,2,3,4]}) + res = pd.pivot_table(df, values='values', index=['A', 'B']) + + ### FIXME ### + #self.assertEqual(len(res), 9) + + def test_count(self): + + s = Series(Categorical([np.nan,1,2,np.nan], levels=[5,4,3,2,1], ordered=True)) + result = s.count() + self.assertEqual(result, 2) + + def test_sort(self): + + # unordered cats are not sortable + cat = Series(Categorical(["a","b","b","a"], ordered=False)) + self.assertRaises(TypeError, lambda : cat.sort()) + + cat = Series(Categorical(["a","c","b","d"], ordered=True)) + + res = cat.order() + exp = np.array(["a","b","c","d"]) + self.assert_numpy_array_equal(res.__array__(), exp) + + cat = Series(Categorical(["a","c","b","d"], levels=["a","b","c","d"], ordered=True)) + res = cat.order() + exp = np.array(["a","b","c","d"]) + self.assert_numpy_array_equal(res.__array__(), exp) + + res = cat.order(ascending=False) + exp = np.array(["d","c","b","a"]) + self.assert_numpy_array_equal(res.__array__(), exp) + + raw_cat1 = Categorical(["a","b","c","d"], levels=["a","b","c","d"], ordered=False) + raw_cat2 = Categorical(["a","b","c","d"], levels=["d","c","b","a"]) + s = ["a","b","c","d"] + df = DataFrame({"unsort":raw_cat1,"sort":raw_cat2, "string":s, "values":[1,2,3,4]}) + + # Cats must be sorted in a dataframe + res = df.sort(columns=["string"], ascending=False) + exp = np.array(["d", "c", "b", "a"]) + self.assert_numpy_array_equal(res["sort"].cat.__array__(), exp) + self.assertEqual(res["sort"].dtype, 
"category") + + res = df.sort(columns=["sort"], ascending=False) + exp = df.sort(columns=["string"], ascending=True) + self.assert_numpy_array_equal(res["values"], exp["values"]) + self.assertEqual(res["sort"].dtype, "category") + self.assertEqual(res["unsort"].dtype, "category") + + def f(): + df.sort(columns=["unsort"], ascending=False) + self.assertRaises(TypeError, f) + + + def test_slicing(self): + cat = Series(Categorical([1,2,3,4])) + reversed = cat[::-1] + exp = np.array([4,3,2,1]) + self.assert_numpy_array_equal(reversed.__array__(), exp) + + df = DataFrame({'value': (np.arange(100)+1).astype('int64')}) + df['D'] = pd.cut(df.value, bins=[0,25,50,75,100]) + + expected = Series([11,'(0, 25]'],index=['value','D']) + result = df.iloc[10] + tm.assert_series_equal(result,expected) + + expected = DataFrame({'value': np.arange(11,21).astype('int64')}, + index=np.arange(10,20).astype('int64')) + expected['D'] = pd.cut(expected.value, bins=[0,25,50,75,100]) + result = df.iloc[10:20] + tm.assert_frame_equal(result,expected) + + expected = Series([9,'(0, 25]'],index=['value','D']) + result = df.loc[8] + tm.assert_series_equal(result,expected) + + def test_slicing_and_getting_ops(self): + + # systematically test the slicing operations: + # for all slicing ops: + # - returning a dataframe + # - returning a column + # - returning a row + # - returning a single value + + cats = pd.Categorical(["a","c","b","c","c","c","c"], levels=["a","b","c"]) + idx = pd.Index(["h","i","j","k","l","m","n"]) + values= [1,2,3,4,5,6,7] + df = pd.DataFrame({"cats":cats,"values":values}, index=idx) + + # the expected values + cats2 = pd.Categorical(["b","c"], levels=["a","b","c"]) + idx2 = pd.Index(["j","k"]) + values2= [3,4] + + # 2:4,: | "j":"k",: + exp_df = pd.DataFrame({"cats":cats2,"values":values2}, index=idx2) + + # :,"cats" | :,0 + exp_col = pd.Series(cats,index=idx,name='cats') + + # "j",: | 2,: + exp_row = pd.Series(["b",3], index=["cats","values"], dtype="object", name="j") + + # 
"j","cats | 2,0 + exp_val = "b" + + # iloc + # frame + res_df = df.iloc[2:4,:] + tm.assert_frame_equal(res_df, exp_df) + self.assertTrue(com.is_categorical_dtype(res_df["cats"])) + + # row + res_row = df.iloc[2,:] + tm.assert_series_equal(res_row, exp_row) + tm.assert_isinstance(res_row["cats"], compat.string_types) + + # col + res_col = df.iloc[:,0] + tm.assert_series_equal(res_col, exp_col) + self.assertTrue(com.is_categorical_dtype(res_col)) + + # single value + res_val = df.iloc[2,0] + self.assertEqual(res_val, exp_val) + + # loc + # frame + res_df = df.loc["j":"k",:] + tm.assert_frame_equal(res_df, exp_df) + self.assertTrue(com.is_categorical_dtype(res_df["cats"])) + + # row + res_row = df.loc["j",:] + tm.assert_series_equal(res_row, exp_row) + tm.assert_isinstance(res_row["cats"], compat.string_types) + + # col + res_col = df.loc[:,"cats"] + tm.assert_series_equal(res_col, exp_col) + self.assertTrue(com.is_categorical_dtype(res_col)) + + # single value + res_val = df.loc["j","cats"] + self.assertEqual(res_val, exp_val) + + # ix + # frame + #res_df = df.ix["j":"k",[0,1]] # doesn't work? 
+ res_df = df.ix["j":"k",:] + tm.assert_frame_equal(res_df, exp_df) + self.assertTrue(com.is_categorical_dtype(res_df["cats"])) + + # row + res_row = df.ix["j",:] + tm.assert_series_equal(res_row, exp_row) + tm.assert_isinstance(res_row["cats"], compat.string_types) + + # col + res_col = df.ix[:,"cats"] + tm.assert_series_equal(res_col, exp_col) + self.assertTrue(com.is_categorical_dtype(res_col)) + + # single value + res_val = df.ix["j",0] + self.assertEqual(res_val, exp_val) + + # iat + res_val = df.iat[2,0] + self.assertEqual(res_val, exp_val) + + # at + res_val = df.at["j","cats"] + self.assertEqual(res_val, exp_val) + + # fancy indexing + exp_fancy = df.iloc[[2]] + + res_fancy = df[df["cats"] == "b"] + tm.assert_frame_equal(res_fancy,exp_fancy) + res_fancy = df[df["values"] == 3] + tm.assert_frame_equal(res_fancy,exp_fancy) + + # get_value + res_val = df.get_value("j","cats") + self.assertEqual(res_val, exp_val) + + # i : int, slice, or sequence of integers + res_row = df.irow(2) + tm.assert_series_equal(res_row, exp_row) + tm.assert_isinstance(res_row["cats"], compat.string_types) + + res_df = df.irow(slice(2,4)) + tm.assert_frame_equal(res_df, exp_df) + self.assertTrue(com.is_categorical_dtype(res_df["cats"])) + + res_df = df.irow([2,3]) + tm.assert_frame_equal(res_df, exp_df) + self.assertTrue(com.is_categorical_dtype(res_df["cats"])) + + res_col = df.icol(0) + tm.assert_series_equal(res_col, exp_col) + self.assertTrue(com.is_categorical_dtype(res_col)) + + res_df = df.icol(slice(0,2)) + tm.assert_frame_equal(res_df, df) + self.assertTrue(com.is_categorical_dtype(res_df["cats"])) + + res_df = df.icol([0,1]) + tm.assert_frame_equal(res_df, df) + self.assertTrue(com.is_categorical_dtype(res_df["cats"])) + + def test_assigning_ops(self): + + # systematically test the assigning operations: + # for all slicing ops: + # for value in levels and value not in levels: + # - assign a single value -> exp_single_cats_value + # - assign a complete row (mixed values) -> 
exp_single_row + # - assign multiple rows (mixed values) (-> array) -> exp_multi_row + # - assign a part of a column with dtype == categorical -> exp_parts_cats_col + # - assign a part of a column with dtype != categorical -> exp_parts_cats_col + + cats = pd.Categorical(["a","a","a","a","a","a","a"], levels=["a","b"]) + idx = pd.Index(["h","i","j","k","l","m","n"]) + values = [1,1,1,1,1,1,1] + orig = pd.DataFrame({"cats":cats,"values":values}, index=idx) + + ### the expected values + # changed single row + cats1 = pd.Categorical(["a","a","b","a","a","a","a"], levels=["a","b"]) + idx1 = pd.Index(["h","i","j","k","l","m","n"]) + values1 = [1,1,2,1,1,1,1] + exp_single_row = pd.DataFrame({"cats":cats1,"values":values1}, index=idx1) + + #changed multiple rows + cats2 = pd.Categorical(["a","a","b","b","a","a","a"], levels=["a","b"]) + idx2 = pd.Index(["h","i","j","k","l","m","n"]) + values2 = [1,1,2,2,1,1,1] + exp_multi_row = pd.DataFrame({"cats":cats2,"values":values2}, index=idx2) + + # changed part of the cats column + cats3 = pd.Categorical(["a","a","b","b","a","a","a"], levels=["a","b"]) + idx3 = pd.Index(["h","i","j","k","l","m","n"]) + values3 = [1,1,1,1,1,1,1] + exp_parts_cats_col = pd.DataFrame({"cats":cats3,"values":values3}, index=idx3) + + # changed single value in cats col + cats4 = pd.Categorical(["a","a","b","a","a","a","a"], levels=["a","b"]) + idx4 = pd.Index(["h","i","j","k","l","m","n"]) + values4 = [1,1,1,1,1,1,1] + exp_single_cats_value = pd.DataFrame({"cats":cats4,"values":values4}, index=idx4) + + #### iloc ##### + ################ + # - assign a single value -> exp_single_cats_value + df = orig.copy() + df.iloc[2,0] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) + + # - assign a single value not in the current level set + def f(): + df = orig.copy() + df.iloc[2,0] = "c" + self.assertRaises(ValueError, f) + + # - assign a complete row (mixed values) -> exp_single_row + df = orig.copy() + df.iloc[2,:] = ["b",2] + tm.assert_frame_equal(df, 
exp_single_row) + + # - assign a complete row (mixed values) not in level set + def f(): + df = orig.copy() + df.iloc[2,:] = ["c",2] + self.assertRaises(ValueError, f) + + # - assign multiple rows (mixed values) -> exp_multi_row + df = orig.copy() + df.iloc[2:4,:] = [["b",2],["b",2]] + tm.assert_frame_equal(df, exp_multi_row) + + def f(): + df = orig.copy() + df.iloc[2:4,:] = [["c",2],["c",2]] + self.assertRaises(ValueError, f) + + # - assign a part of a column with dtype == categorical -> exp_parts_cats_col + df = orig.copy() + df.iloc[2:4,0] = pd.Categorical(["b","b"], levels=["a","b"]) + tm.assert_frame_equal(df, exp_parts_cats_col) + + with tm.assertRaises(ValueError): + # different levels -> not sure if this should fail or pass + df = orig.copy() + df.iloc[2:4,0] = pd.Categorical(["b","b"], levels=["a","b","c"]) + + with tm.assertRaises(ValueError): + # different values + df = orig.copy() + df.iloc[2:4,0] = pd.Categorical(["c","c"], levels=["a","b","c"]) + + # - assign a part of a column with dtype != categorical -> exp_parts_cats_col + df = orig.copy() + df.iloc[2:4,0] = ["b","b"] + tm.assert_frame_equal(df, exp_parts_cats_col) + + with tm.assertRaises(ValueError): + df.iloc[2:4,0] = ["c","c"] + + #### loc ##### + ################ + # - assign a single value -> exp_single_cats_value + df = orig.copy() + df.loc["j","cats"] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) + + # - assign a single value not in the current level set + def f(): + df = orig.copy() + df.loc["j","cats"] = "c" + self.assertRaises(ValueError, f) + + # - assign a complete row (mixed values) -> exp_single_row + df = orig.copy() + df.loc["j",:] = ["b",2] + tm.assert_frame_equal(df, exp_single_row) + + # - assign a complete row (mixed values) not in level set + def f(): + df = orig.copy() + df.loc["j",:] = ["c",2] + self.assertRaises(ValueError, f) + + # - assign multiple rows (mixed values) -> exp_multi_row + df = orig.copy() + df.loc["j":"k",:] = [["b",2],["b",2]] + 
tm.assert_frame_equal(df, exp_multi_row) + + def f(): + df = orig.copy() + df.loc["j":"k",:] = [["c",2],["c",2]] + self.assertRaises(ValueError, f) + + # - assign a part of a column with dtype == categorical -> exp_parts_cats_col + df = orig.copy() + df.loc["j":"k","cats"] = pd.Categorical(["b","b"], levels=["a","b"]) + tm.assert_frame_equal(df, exp_parts_cats_col) + + with tm.assertRaises(ValueError): + # different levels -> not sure if this should fail or pass + df = orig.copy() + df.loc["j":"k","cats"] = pd.Categorical(["b","b"], levels=["a","b","c"]) + + with tm.assertRaises(ValueError): + # different values + df = orig.copy() + df.loc["j":"k","cats"] = pd.Categorical(["c","c"], levels=["a","b","c"]) + + # - assign a part of a column with dtype != categorical -> exp_parts_cats_col + df = orig.copy() + df.loc["j":"k","cats"] = ["b","b"] + tm.assert_frame_equal(df, exp_parts_cats_col) + + with tm.assertRaises(ValueError): + df.loc["j":"k","cats"] = ["c","c"] + + #### ix ##### + ################ + # - assign a single value -> exp_single_cats_value + df = orig.copy() + df.ix["j",0] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) + + # - assign a single value not in the current level set + def f(): + df = orig.copy() + df.ix["j",0] = "c" + self.assertRaises(ValueError, f) + + # - assign a complete row (mixed values) -> exp_single_row + df = orig.copy() + df.ix["j",:] = ["b",2] + tm.assert_frame_equal(df, exp_single_row) + + # - assign a complete row (mixed values) not in level set + def f(): + df = orig.copy() + df.ix["j",:] = ["c",2] + self.assertRaises(ValueError, f) + + # - assign multiple rows (mixed values) -> exp_multi_row + df = orig.copy() + df.ix["j":"k",:] = [["b",2],["b",2]] + tm.assert_frame_equal(df, exp_multi_row) + + def f(): + df = orig.copy() + df.ix["j":"k",:] = [["c",2],["c",2]] + self.assertRaises(ValueError, f) + + # - assign a part of a column with dtype == categorical -> exp_parts_cats_col + df = orig.copy() + df.ix["j":"k",0] = 
pd.Categorical(["b","b"], levels=["a","b"]) + tm.assert_frame_equal(df, exp_parts_cats_col) + + with tm.assertRaises(ValueError): + # different levels -> not sure if this should fail or pass + df = orig.copy() + df.ix["j":"k",0] = pd.Categorical(["b","b"], levels=["a","b","c"]) + + with tm.assertRaises(ValueError): + # different values + df = orig.copy() + df.ix["j":"k",0] = pd.Categorical(["c","c"], levels=["a","b","c"]) + + # - assign a part of a column with dtype != categorical -> exp_parts_cats_col + df = orig.copy() + df.ix["j":"k",0] = ["b","b"] + tm.assert_frame_equal(df, exp_parts_cats_col) + + with tm.assertRaises(ValueError): + df.ix["j":"k",0] = ["c","c"] + + # iat + df = orig.copy() + df.iat[2,0] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) + + # - assign a single value not in the current level set + def f(): + df = orig.copy() + df.iat[2,0] = "c" + self.assertRaises(ValueError, f) + + # at + # - assign a single value -> exp_single_cats_value + df = orig.copy() + df.at["j","cats"] = "b" + tm.assert_frame_equal(df, exp_single_cats_value) + + # - assign a single value not in the current level set + def f(): + df = orig.copy() + df.at["j","cats"] = "c" + self.assertRaises(ValueError, f) + + # fancy indexing + catsf = pd.Categorical(["a","a","c","c","a","a","a"], levels=["a","b","c"]) + idxf = pd.Index(["h","i","j","k","l","m","n"]) + valuesf = [1,1,3,3,1,1,1] + df = pd.DataFrame({"cats":catsf,"values":valuesf}, index=idxf) + + exp_fancy = exp_multi_row.copy() + exp_fancy["cats"].cat.levels = ["a","b","c"] + + df[df["cats"] == "c"] = ["b",2] + tm.assert_frame_equal(df, exp_multi_row) + + # set_value + df = orig.copy() + df.set_value("j","cats", "b") + tm.assert_frame_equal(df, exp_single_cats_value) + + def f(): + df = orig.copy() + df.set_value("j","cats", "c") + self.assertRaises(ValueError, f) + + # Assigning a Category to parts of a int/... 
column uses the values of the Catgorical + df = pd.DataFrame({"a":[1,1,1,1,1], "b":["a","a","a","a","a"]}) + exp = pd.DataFrame({"a":[1,"b","b",1,1], "b":["a","a","b","b","a"]}) + df.loc[1:2,"a"] = pd.Categorical(["b","b"], levels=["a","b"]) + df.loc[2:3,"b"] = pd.Categorical(["b","b"], levels=["a","b"]) + tm.assert_frame_equal(df, exp) + + + def test_concat(self): + cat = pd.Categorical(["a","b"], levels=["a","b"]) + vals = [1,2] + df = pd.DataFrame({"cats":cat, "vals":vals}) + cat2 = pd.Categorical(["a","b","a","b"], levels=["a","b"]) + vals2 = [1,2,1,2] + exp = pd.DataFrame({"cats":cat2, "vals":vals2}, index=pd.Index([0, 1, 0, 1])) + + res = pd.concat([df,df]) + tm.assert_frame_equal(exp, res) + + # Concat should raise if the two categoricals do not have the same levels + cat3 = pd.Categorical(["a","b"], levels=["a","b","c"]) + vals3 = [1,2] + df_wrong_levels = pd.DataFrame({"cats":cat3, "vals":vals3}) + + def f(): + pd.concat([df,df_wrong_levels]) + self.assertRaises(ValueError, f) + + def test_append(self): + cat = pd.Categorical(["a","b"], levels=["a","b"]) + vals = [1,2] + df = pd.DataFrame({"cats":cat, "vals":vals}) + cat2 = pd.Categorical(["a","b","a","b"], levels=["a","b"]) + vals2 = [1,2,1,2] + exp = pd.DataFrame({"cats":cat2, "vals":vals2}, index=pd.Index([0, 1, 0, 1])) + + res = df.append(df) + tm.assert_frame_equal(exp, res) + + # Concat should raise if the two categoricals do not have the same levels + cat3 = pd.Categorical(["a","b"], levels=["a","b","c"]) + vals3 = [1,2] + df_wrong_levels = pd.DataFrame({"cats":cat3, "vals":vals3}) + + def f(): + df.append(df_wrong_levels) + self.assertRaises(ValueError, f) + + def test_na_actions(self): + + cat = pd.Categorical([1,2,3,np.nan], levels=[1,2,3]) + vals = ["a","b",np.nan,"d"] + df = pd.DataFrame({"cats":cat, "vals":vals}) + cat2 = pd.Categorical([1,2,3,3], levels=[1,2,3]) + vals2 = ["a","b","b","d"] + df_exp_fill = pd.DataFrame({"cats":cat2, "vals":vals2}) + cat3 = pd.Categorical([1,2,3], 
levels=[1,2,3]) + vals3 = ["a","b",np.nan] + df_exp_drop_cats = pd.DataFrame({"cats":cat3, "vals":vals3}) + cat4 = pd.Categorical([1,2], levels=[1,2,3]) + vals4 = ["a","b"] + df_exp_drop_all = pd.DataFrame({"cats":cat4, "vals":vals4}) + + # fillna + res = df.fillna(value={"cats":3, "vals":"b"}) + tm.assert_frame_equal(res, df_exp_fill) + + def f(): + df.fillna(value={"cats":4, "vals":"c"}) + self.assertRaises(ValueError, f) + + res = df.fillna(method='pad') + tm.assert_frame_equal(res, df_exp_fill) + + res = df.dropna(subset=["cats"]) + tm.assert_frame_equal(res, df_exp_drop_cats) + + res = df.dropna() + tm.assert_frame_equal(res, df_exp_drop_all) + + def test_astype_to_other(self): + + s = self.cat['value_group'] + expected = s + tm.assert_series_equal(s.astype('category'),expected) + tm.assert_series_equal(s.astype(com.CategoricalDtype()),expected) + self.assertRaises(ValueError, lambda : s.astype('float64')) + + cat = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])) + exp = Series(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']) + tm.assert_series_equal(cat.astype('str'), exp) + s2 = Series(Categorical.from_array(['1', '2', '3', '4'])) + exp2 = Series([1,2,3,4]).astype(int) + tm.assert_series_equal(s2.astype('int') , exp2) + + # object don't sort correctly, so just compare that we have the same values + def cmp(a,b): + tm.assert_almost_equal(np.sort(np.unique(a)),np.sort(np.unique(b))) + expected = Series(np.array(s.values),name='value_group') + cmp(s.astype('object'),expected) + cmp(s.astype(np.object_),expected) + + # array conversion + tm.assert_almost_equal(np.array(s),np.array(s.values)) + + def test_numeric_like_ops(self): + + # numeric ops should not succeed + for op in ['__add__','__sub__','__mul__','__truediv__']: + self.assertRaises(TypeError, lambda : getattr(self.cat,op)(self.cat)) + + # reduction ops should not succeed (unless specifically defined, e.g. 
min/max) + s = self.cat['value_group'] + for op in ['kurt','skew','var','std','mean','sum','median']: + self.assertRaises(TypeError, lambda : getattr(s,op)(numeric_only=False)) + + # mad technically works because it always takes the numeric data + + # numpy ops + s = pd.Series(pd.Categorical([1,2,3,4])) + self.assertRaises(TypeError, lambda : np.sum(s)) + + # numeric ops on a Series + for op in ['__add__','__sub__','__mul__','__truediv__']: + self.assertRaises(TypeError, lambda : getattr(s,op)(2)) + + # invalid ufunc + self.assertRaises(TypeError, lambda : np.log(s)) + + def test_io_hdf(self): + from pandas.io.tests.test_pytables import safe_remove + from pandas.io.pytables import read_hdf + + hdf_file = 'test.h5' + + try: + s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], levels=['a','b','c','d'])) + # FIXME: AttributeError: 'Categorical' object has no attribute 'T' + s.to_hdf(hdf_file, "series_alone") + s2 = read_hdf(hdf_file, "series_alone") + tm.assert_series_equal(s, s2) + df = DataFrame({"s":s, "vals":[1,2,3,4,5,6]}) + df.to_hdf(hdf_file, "frame_alone") + df2 = read_hdf(hdf_file, "frame_alone") + tm.assert_frame_equal(df, df2) + # Ok, this doesn't work yet + # FIXME: TypeError: cannot pass a where specification when reading from a Fixed format store. 
this store must be selected in its entirety + #result = read_hdf(hdf_file, "frame_alone", where = ['index>2']) + #tm.assert_frame_equal(df[df.index>2],result) + + finally: + safe_remove(hdf_file) + + def test_io_csv(self): + + # CSV should result in the same output as when one would add a normal Series/DataFrame + from pandas.compat import StringIO + + s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])) + s2 = Series(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']) + res = StringIO() + s.to_csv(res) + exp = StringIO() + s2.to_csv(exp) + self.assertEqual(res.getvalue(), exp.getvalue()) + + df = DataFrame({"s":s}) + df2 = DataFrame({"s":s2}) + res = StringIO() + # FIXME: IndexError: too many indices + #df.to_csv(res) + #exp = StringIO() + #df2.to_csv(exp) + #self.assertEqual(res.getvalue(), exp.getvalue()) + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], - # '--with-coverage', '--cover-package=pandas.core'], - exit=False) + # '--with-coverage', '--cover-package=pandas.core'] + exit=False) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 434591a86d0c4..be589af2ae848 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -2903,7 +2903,7 @@ def test_groupby_categorical(self): levels = ['foo', 'bar', 'baz', 'qux'] labels = np.random.randint(0, 4, size=100) - cats = Categorical(labels, levels, name='myfactor') + cats = Categorical(labels, levels, name='myfactor', fastpath=True) data = DataFrame(np.random.randn(100, 4)) @@ -2919,7 +2919,7 @@ def test_groupby_categorical(self): grouped = data.groupby(cats) desc_result = grouped.describe() - idx = cats.labels.argsort() + idx = cats.codes.argsort() ord_labels = np.asarray(cats).take(idx) ord_data = data.take(idx) expected = ord_data.groupby(ord_labels, sort=False).describe() @@ -3050,19 +3050,28 @@ def test_groupby_categorical_no_compress(self): data = Series(np.random.randn(9)) labels = np.array([0, 
0, 0, 1, 1, 1, 2, 2, 2]) - cats = Categorical(labels, [0, 1, 2]) + cats = Categorical(labels, [0, 1, 2], fastpath=True) result = data.groupby(cats).mean() exp = data.groupby(labels).mean() assert_series_equal(result, exp) labels = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3]) - cats = Categorical(labels, [0, 1, 2, 3]) + cats = Categorical(labels, [0, 1, 2, 3], fastpath=True) result = data.groupby(cats).mean() exp = data.groupby(labels).mean().reindex(cats.levels) assert_series_equal(result, exp) + cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], levels=["a","b","c","d"]) + data = DataFrame({"a":[1,1,1,2,2,2,3,4,5], "b":cats}) + + result = data.groupby("b").mean() + result = result["a"].values + exp = np.array([1,2,4,np.nan]) + self.assert_numpy_array_equivalent(result, exp) + + def test_groupby_first_datetime64(self): df = DataFrame([(1, 1351036800000000000), (2, 1351036800000000000)]) df[1] = df[1].view('M8[ns]') diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 6fb88eb5597a9..a8486beb57042 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -1842,6 +1842,11 @@ def test_from_arrays(self): result = MultiIndex.from_arrays(arrays) self.assertEqual(list(result), list(self.index)) + # infer correctly + result = MultiIndex.from_arrays([[pd.NaT, Timestamp('20130101')], ['a', 'b']]) + self.assertTrue(result.levels[0].equals(Index([Timestamp('20130101')]))) + self.assertTrue(result.levels[1].equals(Index(['a','b']))) + def test_from_product(self): first = ['foo', 'bar', 'buz'] second = ['a', 'b', 'c'] @@ -1907,7 +1912,7 @@ def test_get_level_values_na(self): expected = [np.nan, np.nan, np.nan] assert_array_equal(values.values.astype(float), expected) values = index.get_level_values(1) - expected = ['a', np.nan, 1] + expected = np.array(['a', np.nan, 1],dtype=object) assert_array_equal(values.values, expected) if not _np_version_under1p7: diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 
d08f7e1d547c8..1ae6ceb7ae2b4 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -472,14 +472,11 @@ def test_constructor_generator(self): assert_series_equal(result, exp) def test_constructor_categorical(self): - cat = pd.Categorical([0, 1, 2, 0, 1, 2], ['a', 'b', 'c']) - res = Series(cat) - exp = Series({0: 'a', 1: 'b', 2: 'c', 3: 'a', 4: 'b', 5: 'c'}) - assert_series_equal(res, exp) - + cat = pd.Categorical([0, 1, 2, 0, 1, 2], ['a', 'b', 'c'], fastpath=True) cat.name = 'foo' res = Series(cat) self.assertEqual(res.name, cat.name) + self.assertTrue(res.values.equals(cat)) def test_constructor_maskedarray(self): data = ma.masked_all((3,), dtype=float) diff --git a/pandas/tools/merge.py b/pandas/tools/merge.py index d17e2e2dcb12b..ee594ef031e82 100644 --- a/pandas/tools/merge.py +++ b/pandas/tools/merge.py @@ -1029,7 +1029,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None): else: factor = Categorical.from_array(concat_index) levels.append(factor.levels) - label_list.append(factor.labels) + label_list.append(factor.codes) if len(names) == len(levels): names = list(names) diff --git a/pandas/tools/tests/test_tile.py b/pandas/tools/tests/test_tile.py index 78c8201f0bcca..7390a4b11095b 100644 --- a/pandas/tools/tests/test_tile.py +++ b/pandas/tools/tests/test_tile.py @@ -27,25 +27,25 @@ def test_simple(self): def test_bins(self): data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1]) result, bins = cut(data, 3, retbins=True) - assert_equal(result.labels, [0, 0, 0, 1, 2, 0]) + assert_equal(result.codes, [0, 0, 0, 1, 2, 0]) assert_almost_equal(bins, [0.1905, 3.36666667, 6.53333333, 9.7]) def test_right(self): data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575]) result, bins = cut(data, 4, right=True, retbins=True) - assert_equal(result.labels, [0, 0, 0, 2, 3, 0, 0]) + assert_equal(result.codes, [0, 0, 0, 2, 3, 0, 0]) assert_almost_equal(bins, [0.1905, 2.575, 4.95, 7.325, 9.7]) def test_noright(self): data = np.array([.2, 1.4, 2.5, 
6.2, 9.7, 2.1, 2.575]) result, bins = cut(data, 4, right=False, retbins=True) - assert_equal(result.labels, [0, 0, 0, 2, 3, 0, 1]) + assert_equal(result.codes, [0, 0, 0, 2, 3, 0, 1]) assert_almost_equal(bins, [0.2, 2.575, 4.95, 7.325, 9.7095]) def test_arraylike(self): data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1] result, bins = cut(data, 3, retbins=True) - assert_equal(result.labels, [0, 0, 0, 1, 2, 0]) + assert_equal(result.codes, [0, 0, 0, 1, 2, 0]) assert_almost_equal(bins, [0.1905, 3.36666667, 6.53333333, 9.7]) def test_bins_not_monotonic(self): @@ -160,7 +160,7 @@ def test_cut_out_of_bounds(self): result = cut(arr, [-1, 0, 1]) - mask = result.labels == -1 + mask = result.codes == -1 ex_mask = (arr < -1) | (arr > 1) self.assert_numpy_array_equal(mask, ex_mask) diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py index c2512ba2b4b38..b28f7c89606de 100644 --- a/pandas/tools/tile.py +++ b/pandas/tools/tile.py @@ -189,7 +189,7 @@ def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, levels = np.asarray(levels, dtype=object) np.putmask(ids, na_mask, 0) - fac = Categorical(ids - 1, levels, name=name) + fac = Categorical(ids - 1, levels, name=name, fastpath=True) else: fac = ids - 1 if has_nas: