ENH: first hack at sliding window apply_along_axis in Cython

dacoex · Nov 13, 2011 · 3241692 · 3241692
1 parent 9ea758c
commit 3241692
Show file tree

Hide file tree

Showing 6 changed files with 88 additions and 7 deletions.
diff --git a/.gitignore b/.gitignore
@@ -3,10 +3,9 @@
 build
 dist
 MANIFEST
+*.c
 *.so
 *.pyd
-pandas/src/tseries.c
-pandas/src/sparse.c
 pandas/version.py
 doc/source/generated
 doc/source/_static

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -2142,7 +2142,7 @@ def _shift_indexer(self, periods):
     #----------------------------------------------------------------------
     # Function application
 
-    def apply(self, func, axis=0, broadcast=False):
+    def apply(self, func, axis=0, broadcast=False, raw=False):
         """
         Applies function along input axis of DataFrame. Objects passed to
         functions are Series objects having index either the DataFrame's index
@@ -2158,6 +2158,11 @@ def apply(self, func, axis=0, broadcast=False):
         broadcast : bool, default False
             For aggregation functions, return object of same size with values
             propagated
+        raw : bool, default False
+            If False, convert each row or column into a Series. If True, the
+            passed function will receive ndarray objects instead. If you are
+            just applying a NumPy reduction function, this will achieve much
+            better performance.
 
         Examples
         --------
@@ -2182,10 +2187,23 @@ def apply(self, func, axis=0, broadcast=False):
                                      columns=self.columns, copy=False)
         else:
             if not broadcast:
-                return self._apply_standard(func, axis)
+                if raw:
+                    return self._apply_raw(func, axis)
+                else:
+                    return self._apply_standard(func, axis)
             else:
                 return self._apply_broadcast(func, axis)
 
+    def _apply_raw(self, func, axis):
+        result = np.apply_along_axis(func, axis, self.values)
+
+        # TODO: handle the mixed-type case
+        if result.ndim == 2:
+            return DataFrame(result, index=self.index,
+                             columns=self.columns)
+        else:
+            return Series(result, index=self._get_agg_axis(axis))
+
     def _apply_standard(self, func, axis, ignore_failures=False):
         if axis == 0:
             series_gen = ((c, self[c]) for c in self.columns)

diff --git a/pandas/src/sandbox.pyx b/pandas/src/sandbox.pyx
@@ -0,0 +1,48 @@
+from numpy cimport *
+import numpy as np
+
+import_array()
+
+cdef class ArrayCruncher:
+
+    cdef:
+        ndarray arr
+        object f
+        bint raw
+        Py_ssize_t N, K
+
+    def __init__(self, arr, f, axis=0, raw=True):
+        self.arr = arr
+        self.f = f
+        self.raw = raw
+        self.N, self.K = arr.shape
+
+    def reduce(self):
+        cdef:
+            char* dummy_buf
+            ndarray arr, result, chunk
+            Py_ssize_t i, increment
+            flatiter it
+
+        if not self.arr.flags.c_contiguous:
+            arr = self.arr.copy('C')
+        else:
+            arr = self.arr
+
+        increment = self.K * self.arr.dtype.itemsize
+        chunk = np.empty(self.K, dtype=arr.dtype)
+        result = np.empty(self.N, dtype=arr.dtype)
+        it = <flatiter> PyArray_IterNew(result)
+
+        dummy_buf = chunk.data
+        chunk.data = arr.data
+
+        for i in range(self.N):
+            PyArray_SETITEM(result, PyArray_ITER_DATA(it), self.f(chunk))
+            chunk.data = chunk.data + increment
+            PyArray_ITER_NEXT(it)
+
+        # restore chunk's own buffer pointer so we don't free the wrong memory
+        chunk.data = dummy_buf
+
+        return result
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -2587,6 +2587,16 @@ def test_apply_broadcast(self):
         for idx in broadcasted.index:
             self.assert_((broadcasted.xs(idx) == agged[idx]).all())
 
+    def test_apply_raw(self):
+        result0 = self.frame.apply(np.mean, raw=True)
+        result1 = self.frame.apply(np.mean, axis=1, raw=True)
+
+        expected0 = self.frame.apply(lambda x: x.values.mean())
+        expected1 = self.frame.apply(lambda x: x.values.mean(), axis=1)
+
+        assert_series_equal(result0, expected0)
+        assert_series_equal(result1, expected1)
+
     def test_apply_axis1(self):
         d = self.frame.index[0]
         tapplied = self.frame.apply(np.mean, axis=1)

diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
@@ -1100,6 +1100,10 @@ def test_apply(self):
         import math
         assert_series_equal(self.ts.apply(math.exp), np.exp(self.ts))
 
+        # does not return Series
+        result = self.ts.apply(lambda x: x.values * 2)
+        assert_series_equal(result, self.ts * 2)
+
     def test_align(self):
         def _check_align(a, b, how='left'):
             aa, ab = a.align(b, join=how)

diff --git a/setup.py b/setup.py
@@ -298,9 +298,11 @@ def srcpath(name=None, suffix='.pyx', subdir='src'):
                        sources=[srcpath('engines', suffix=suffix)],
                        include_dirs=[np.get_include()])
 
-extensions = [tseries_ext,
-              engines_ext,
-              sparse_ext]
+sandbox_ext = Extension('pandas._sandbox',
+                        sources=[srcpath('sandbox', suffix=suffix)],
+                        include_dirs=[np.get_include()])
+
+extensions = [tseries_ext, engines_ext, sparse_ext, sandbox_ext]
 
 # if _have_setuptools:
 #     setuptools_args["test_suite"] = "nose.collector"