# Source code for avatars.processors.to_categorical
import numpy as np
import pandas as pd
from avatars.lib.continuous_threshold import get_continuous_under_threshold
# [docs]
class ToCategoricalProcessor:
    """Processor to model selected numeric variables as categorical variables.

    The columns to convert are chosen by ``get_continuous_under_threshold``
    during ``preprocess`` and remembered on the instance so that
    ``postprocess`` can restore their original dtypes.

    Arguments
    ---------
        to_categorical_threshold:
            threshold of the number of distinct values to consider
            a continuous variable as categorical.

    Keyword Arguments
    -----------------
        keep_continuous:
            if `True`, continuous variables will be kept and
            suffixed with `continuous_suffix`.
        continuous_suffix:
            suffix for the continuous variable created during preprocess.
        category:
            if `keep_continuous=True`, name of the new category, needed for some
            specific avatarization cases with the use of group_modalities processor.

    Examples
    --------
    With `keep_continuous=False` it only converts the variable to object.
    By this you ensure to keep all values during the avatarization.

    >>> df = pd.DataFrame(
    ...     {
    ...         "variable_1": [1, 7, 7, 1],
    ...         "variable_2": [1, 2, 7, 1]
    ...     }
    ... )
    >>> processor = ToCategoricalProcessor(to_categorical_threshold=2)
    >>> processor.preprocess(df).dtypes
    variable_1    object
    variable_2     int64
    dtype: object
    >>> avatar = pd.DataFrame(
    ...     {
    ...         "variable_1": [2, 1, 4, 1],
    ...         "variable_2": [2, 1, 4, 1]
    ...     }
    ... )
    >>> avatar["variable_1"] = avatar["variable_1"].astype('object')
    >>> avatar.dtypes
    variable_1    object
    variable_2     int64
    dtype: object
    >>> processor.postprocess(df, avatar).dtypes
    variable_1    int64
    variable_2    int64
    dtype: object

    With `keep_continuous=True`, you duplicate the variable and keep it as continuous.
    This can be useful for other uses.

    >>> df = pd.DataFrame(
    ...     {
    ...         "variable_1": [1, 7, 7, 1],
    ...         "variable_2": [1, 2, 7, 1]
    ...     }
    ... )
    >>> processor = ToCategoricalProcessor(to_categorical_threshold=2, keep_continuous=True)
    >>> processor.preprocess(df).dtypes
    variable_1          object
    variable_2           int64
    variable_1__cont     int64
    dtype: object
    """

    def __init__(
        self,
        to_categorical_threshold: int,
        *,
        keep_continuous: bool = False,
        continuous_suffix: str = "__cont",
        category: str = "other",
    ):
        self.to_categorical_threshold = to_categorical_threshold
        self.keep_continuous = keep_continuous
        self.continuous_suffix = continuous_suffix
        self.category = category
        # State filled in by `preprocess`. Defined here so the attributes always
        # exist and `postprocess` on a fresh instance is a harmless no-op
        # instead of an AttributeError.
        self.variables = []
        self.continuous_variables = []

    def preprocess(self, df: pd.DataFrame) -> pd.DataFrame:
        """Transform numeric variables into categorical variables.

        The selected column names are stored in ``self.variables`` (and, when
        ``keep_continuous`` is set, the suffixed copies in
        ``self.continuous_variables``) for later use by ``postprocess``.

        Arguments
        ---------
            df: dataframe to transform

        Returns
        -------
            DataFrame: transformed dataframe (the input is left untouched)
        """
        df = df.copy()
        self.variables = get_continuous_under_threshold(
            df, threshold=self.to_categorical_threshold
        )

        if self.keep_continuous:
            # Duplicate each selected column under a suffixed name before the
            # cast, so the continuous values (NAs included) stay available.
            self.continuous_variables = [
                name + self.continuous_suffix for name in self.variables
            ]
            df[self.continuous_variables] = df[self.variables]
        else:
            # Reset so no stale names survive from an earlier call.
            self.continuous_variables = []

        df[self.variables] = df[self.variables].astype("object")
        return df

    def postprocess(self, source: pd.DataFrame, dest: pd.DataFrame) -> pd.DataFrame:
        """Transform converted categorical variables back to numeric.

        Arguments
        ---------
            source: reference data frame, provides the dtypes to restore
            dest: data frame to transform

        Returns
        -------
            DataFrame: transformed data frame (the input is left untouched)
        """
        dest = dest.copy()

        # Nothing was converted (or `preprocess` has not run yet).
        # `len()` is used rather than truthiness so that a list or an Index of
        # column names are both handled.
        if len(self.variables) == 0:
            return dest

        if self.keep_continuous:
            # Wherever the categorical column holds the modality `category`,
            # fall back to the value kept in the matching continuous column.
            # This is needed for some specific avatarization cases with the use
            # of the group_modalities processor. Plain arrays are used because
            # the two sets of columns carry different names and must be paired
            # by position, not by label.
            is_category = (dest[self.variables] == self.category).to_numpy()
            dest[self.variables] = np.where(
                is_category,
                dest[self.continuous_variables].to_numpy(),
                dest[self.variables].to_numpy(),
            )
            dest = dest.drop(columns=self.continuous_variables)

        # Reassign the original data types taken from the reference frame.
        original_dtypes = source[self.variables].dtypes.to_dict()
        dest[self.variables] = dest[self.variables].astype(original_dtypes)
        return dest