core.pandas: allow specifying schema; add tests

This commit is contained in:
Dima Gerasimov 2021-02-14 20:23:27 +00:00 committed by karlicoss
parent d77ab92d86
commit 746c3da0ca
2 changed files with 38 additions and 4 deletions

View file

@ -5,7 +5,7 @@ Various pandas helpers and convenience functions
# NOTE: this file is meant to be importable without Pandas installed # NOTE: this file is meant to be importable without Pandas installed
from datetime import datetime from datetime import datetime
from pprint import pformat from pprint import pformat
from typing import Optional, TYPE_CHECKING, Any, Iterable from typing import Optional, TYPE_CHECKING, Any, Iterable, Type, List
from . import warnings, Res from . import warnings, Res
from .common import LazyLogger from .common import LazyLogger
@ -101,16 +101,49 @@ from .error import error_to_json
error_to_row = error_to_json # todo deprecate? error_to_row = error_to_json # todo deprecate?
# mm. https://github.com/python/mypy/issues/8564
# no type for dataclass? -- so the schema is typed as Any for now
Schema = Any


def _as_columns(s: Schema) -> List[str]:
    """
    Return the column (field) names described by schema ``s``.

    ``s`` may be a dataclass (class or instance), in which case the names come
    from ``dataclasses.fields``; otherwise it is expected to be NamedTuple-like
    and the names come from its ``_fields`` attribute.

    Raises ``AttributeError`` if ``s`` is neither (i.e. has no ``_fields``).
    """
    import dataclasses as D
    if D.is_dataclass(s):
        return [f.name for f in D.fields(s)]
    # else must be NamedTuple??
    fields = getattr(s, '_fields', None)
    if fields is None:
        # same exception type as a plain attribute lookup would raise, but with
        # a message that points at the actual problem (an unsupported schema)
        raise AttributeError(
            f"schema {s!r} is neither a dataclass nor a NamedTuple (no '_fields' attribute)"
        )
    return list(fields)
# todo add proper types
@check_dataframe
def as_dataframe(it: Iterable[Res[Any]], schema: Optional[Schema]=None) -> DataFrameT:
    """
    Build a pandas DataFrame from an iterable of items.

    Every item is passed through ``to_jsons`` before being handed to pandas
    (see the comments below for why pandas isn't given the objects directly).

    If ``schema`` (a dataclass or NamedTuple type) is given, its field names are
    passed as the DataFrame's ``columns``. This makes sense to specify so the
    resulting frame has the expected columns even when ``it`` is empty.
    """
    # todo warn if schema isn't specified?
    # ok nice supports dataframe/NT natively
    # https://github.com/pandas-dev/pandas/pull/27999
    # but it dispatches dataclass based on the first entry...
    # https://github.com/pandas-dev/pandas/blob/fc9fdba6592bdb5d0d1147ce4d65639acd897565/pandas/core/frame.py#L562
    # same for NamedTuple -- seems that it takes whatever schema the first NT has
    # so we need to convert each individually... sigh
    from .common import to_jsons
    import pandas as pd
    # NOTE(review): with an explicit columns= pandas only keeps the listed columns;
    # if to_jsons emits extra keys (e.g. for errors) they'd presumably be dropped here -- confirm
    columns = None if schema is None else _as_columns(schema)
    return pd.DataFrame(to_jsons(it), columns=columns)
def test_as_dataframe() -> None:
    """
    Exercise as_dataframe on plain dicts, on an empty input,
    and on an empty input with an explicit dataclass schema.
    """
    import pytest
    from dataclasses import dataclass

    # a lazily generated sequence of plain dict rows, none of them carrying an 'error' key
    rows = ({'i': n, 's': f'str{n}'} for n in range(10))
    with pytest.warns(UserWarning, match=r"No 'error' column"):
        frame = as_dataframe(rows)
    # todo test other error col policies
    assert list(frame.columns) == ['i', 's', 'error']

    # empty input gives an empty frame
    empty = as_dataframe([])
    assert len(empty) == 0

    @dataclass
    class X:
        x: int

    # makes sense to specify the schema so the downstream program doesn't fail in case of empty iterable
    with_schema = as_dataframe([], schema=X)
    assert list(with_schema.columns) == ['x', 'error']

View file

@ -0,0 +1 @@
from my.core.pandas import *