# Source code for datawrangler.zoo.dataframe

import pandas as pd

from ..util import dataframe_like
from ..io import load_dataframe


def is_dataframe(x):
    """
    Determine if an object (or file) is a DataFrame

    Parameters
    ----------
    :param x: the object (or a file path)

    Returns
    -------
    :return: True if the object is a DataFrame (or points to a file that can be loaded into Pandas as a DataFrame), and
      False otherwise.  Any ordinary error raised while trying to load x is treated as "not a DataFrame"; interpreter
      level signals (KeyboardInterrupt, SystemExit) are allowed to propagate.
    """
    # Fast path.  isinstance covers pandas DataFrames (and subclasses) regardless of which module path the class
    # reports; the module-name check is kept so that modin DataFrames are still recognized without importing modin.
    if isinstance(x, pd.DataFrame) or type(x).__module__ in ('pandas.core.frame', 'modin.pandas.dataframe'):
        return True

    if dataframe_like(x):
        return True

    # Last resort: see whether x can be loaded as a DataFrame.  This is a deliberate best-effort check, so failures
    # are reported as False-- but only Exception subclasses are swallowed, so Ctrl-C still interrupts a slow load.
    # noinspection PyBroadException
    try:
        return load_dataframe(x) is not None
    except Exception:
        return False
def is_multiindex_dataframe(x):
    """
    Determine if an object is a MultiIndex DataFrame-- i.e., a DataFrame with a multi-level index

    Parameters
    ----------
    :param x: the object (or file path)

    Returns
    -------
    :return: True if is_dataframe(x) is True and x.index is a MultiIndex, and False otherwise.

    NOTE(review): the index is read from x itself (x is not loaded here), so a file path that is_dataframe accepts
    still yields False because a string has no MultiIndex-- confirm whether file paths should be loaded first.
    """
    if not is_dataframe(x):
        return False

    # dataframe-like objects are not guaranteed to expose an index; a missing index simply means "not MultiIndex"
    index = getattr(x, 'index', None)

    # isinstance is the reliable check; the module-name test is kept as a fallback for index types that are not
    # pandas.MultiIndex subclasses but are defined in an "indexes.multi" module (the original detection rule)
    return isinstance(index, pd.MultiIndex) or 'indexes.multi' in type(index).__module__
def wrangle_dataframe(data, return_model=False, **kwargs):
    """
    Turn a (potentially messy) DataFrame into a (potentially cleaner) DataFrame

    Parameters
    ----------
    :param data: a DataFrame, dataframe-like object, or a file path that points to a file that can be loaded into
      Pandas as a DataFrame
    :param return_model: if True, return a function for turning the ("messy") DataFrame into a "clean" DataFrame, along
      with the cleaned DataFrame.  Otherwise (if False), just return the cleaned DataFrame.  Default: False
    :param kwargs: two keys are consumed here and everything else is passed to the DataFrame "wrangling" model:
      - 'load_kwargs': dictionary of keyword arguments forwarded to load_dataframe (default: no extra arguments)
      - 'model': either a callable, or a dictionary with a 'model' key (the callable) and optional 'args' and
        'kwargs' keys.  Default: the constructor for pd.DataFrame.  When a dictionary is given, its own 'args' and
        'kwargs' are used and any remaining keyword arguments passed to this function are ignored.

    Returns
    -------
    :return: The "wrangled" DataFrame (if return_model is False), or the DataFrame plus a "model" for cleaning
      DataFrames (if return_model is True).  The model is a dictionary; a dictionary passed in by the caller is
      returned unchanged.
    """
    # an explicit load_kwargs=None is treated the same as omitting it
    load_kwargs = kwargs.pop('load_kwargs', None) or {}
    data = load_dataframe(data, **load_kwargs)

    model = kwargs.pop('model', None)
    if model is None:
        model = {'model': pd.DataFrame, 'args': [], 'kwargs': kwargs}
    elif not isinstance(model, dict):
        # a bare callable: wrap it so that the remaining keyword arguments travel with it
        model = {'model': model, 'args': [], 'kwargs': kwargs}

    # dict-style models may leave out 'args' and/or 'kwargs' (or set them to None)
    extra_args = model.get('args') or ()
    extra_kwargs = model.get('kwargs') or {}
    wrangled = model['model'](data, *extra_args, **extra_kwargs)

    if return_model:
        return wrangled, model
    return wrangled