core.pandas: allow specifying schema; add tests

2021-02-14 20:23:27 +00:00 · 2021-02-14 20:23:27 +00:00 · 746c3da0ca
commit 746c3da0ca
parent d77ab92d86
2 changed files with 38 additions and 4 deletions
--- a/my/core/pandas.py
+++ b/my/core/pandas.py
@ -5,7 +5,7 @@ Various pandas helpers and convenience functions
 # NOTE: this file is meant to be importable without Pandas installed
 from datetime import datetime
 from pprint import pformat
-from typing import Optional, TYPE_CHECKING, Any, Iterable
+from typing import Optional, TYPE_CHECKING, Any, Iterable, Type, List
 from . import warnings, Res
 from .common import LazyLogger

@ -101,16 +101,49 @@ from .error import error_to_json
 error_to_row = error_to_json # todo deprecate?


+# mm. https://github.com/python/mypy/issues/8564
+# no type for dataclass?
+Schema = Any
+
+def _as_columns(s: Schema) -> List[str]:
+    import dataclasses as D
+    if D.is_dataclass(s):
+        return [f.name for f in D.fields(s)]
+    # else must be NamedTuple??
+    return list(getattr(s, '_fields'))
+
+
 # todo add proper types
@check_dataframe
-def as_dataframe(it: Iterable[Res[Any]]) -> DataFrameT:
+def as_dataframe(it: Iterable[Res[Any]], schema: Optional[Schema]=None) -> DataFrameT:
+    # todo warn if schema isn't specified?
    # ok nice supports dataframe/NT natively
    # https://github.com/pandas-dev/pandas/pull/27999
    #    but it dispatches dataclass based on the first entry...
    #    https://github.com/pandas-dev/pandas/blob/fc9fdba6592bdb5d0d1147ce4d65639acd897565/pandas/core/frame.py#L562
    # same for NamedTuple -- seems that it takes whatever schema the first NT has
    # so we need to convert each individually... sigh
-    # TODO just add tests for it?
    from .common import to_jsons
    import pandas as pd
-    return pd.DataFrame(to_jsons(it))
+    columns = None if schema is None else _as_columns(schema)
+    return pd.DataFrame(to_jsons(it), columns=columns)
+
+
+def test_as_dataframe() -> None:
+    import pytest
+    it = (dict(i=i, s=f'str{i}') for i in range(10))
+    with pytest.warns(UserWarning, match=r"No 'error' column") as record_warnings:
+        df = as_dataframe(it)
+        # todo test other error col policies
+    assert list(df.columns) == ['i', 's', 'error']
+
+    assert len(as_dataframe([])) == 0
+
+    from dataclasses import dataclass
+    @dataclass
+    class X:
+        x: int
+
+    # makes sense to specify the schema so the downstream program doesn't fail in case of empty iterable
+    df = as_dataframe([], schema=X)
+    assert list(df.columns) == ['x', 'error']
--- a/tests/core/test_pandas.py
+++ b/tests/core/test_pandas.py
@ -0,0 +1 @@
+from my.core.pandas import *