core.pandas: allow specifying schema; add tests

2021-02-14 20:23:27 +00:00 · 2021-02-14 20:23:27 +00:00 · 746c3da0ca
commit 746c3da0ca
parent d77ab92d86
2 changed files with 38 additions and 4 deletions
--- a/my/core/pandas.py
+++ b/my/core/pandas.py
@ -5,7 +5,7 @@ Various pandas helpers and convenience functions
 # NOTE: this file is meant to be importable without Pandas installed
 from datetime import datetime
 from pprint import pformat
-from typing import Optional, TYPE_CHECKING, Any, Iterable
+from typing import Optional, TYPE_CHECKING, Any, Iterable, Type, List
 from . import warnings, Res
 from .common import LazyLogger
@ -101,16 +101,49 @@ from .error import error_to_json
 error_to_row = error_to_json # todo deprecate?
 # There seems to be no proper type for "some dataclass" yet, so Any is used as a placeholder.
 # See https://github.com/python/mypy/issues/8564
 Schema = Any
 def _as_columns(s: Schema) -> List[str]:
    '''
    Return the column names declared by the schema ``s``, in declaration order.

    A dataclass contributes its field names; anything else is expected
    to expose ``_fields`` (the way NamedTuple classes do).
    '''
    from dataclasses import fields, is_dataclass
    if not is_dataclass(s):
        # not a dataclass, so presumably a NamedTuple -- it lists its columns in _fields
        return list(s._fields)
    return [field.name for field in fields(s)]
 # todo add proper types
@check_dataframe
-def as_dataframe(it: Iterable[Res[Any]]) -> DataFrameT:
+def as_dataframe(it: Iterable[Res[Any]], schema: Optional[Schema]=None) -> DataFrameT:
    # Builds a DataFrame from to_jsons(it).
    # 'schema' is optional: when given (a dataclass, otherwise presumably a NamedTuple), its field names
    # are passed to pandas as the column list; when omitted, columns=None is passed and pandas picks the columns.
    # todo warn if schema isn't specified?
    # ok nice, pandas supports dataclasses/NamedTuples natively
    # https://github.com/pandas-dev/pandas/pull/27999
    #    but it dispatches dataclass based on the first entry...
    #    https://github.com/pandas-dev/pandas/blob/fc9fdba6592bdb5d0d1147ce4d65639acd897565/pandas/core/frame.py#L562
    # same for NamedTuple -- seems that it takes whatever schema the first NT has
    # so we need to convert each item individually... sigh
    # TODO just add tests for it?
    from .common import to_jsons
    import pandas as pd
-    return pd.DataFrame(to_jsons(it))
+    columns = None if schema is None else _as_columns(schema)
    return pd.DataFrame(to_jsons(it), columns=columns)
 def test_as_dataframe() -> None:
    '''
    Check the columns as_dataframe produces, with and without an explicit schema.
    '''
    import pytest
    from dataclasses import dataclass

    it = (dict(i=i, s=f'str{i}') for i in range(10))
    # only the fact that the warning fires matters here, so the recorded warnings aren't captured
    with pytest.warns(UserWarning, match=r"No 'error' column"):
        df = as_dataframe(it)
        # todo test other error col policies
    assert list(df.columns) == ['i', 's', 'error']

    # an empty iterable without a schema still yields an (empty) dataframe
    assert len(as_dataframe([])) == 0

    @dataclass
    class X:
        x: int

    # makes sense to specify the schema so the downstream program doesn't fail in case of empty iterable
    df = as_dataframe([], schema=X)
    assert list(df.columns) == ['x', 'error']
--- a/tests/core/test_pandas.py
+++ b/tests/core/test_pandas.py
@ -0,0 +1 @@
 from my.core.pandas import *