# Source code for avatars.processors.group_modalities
from typing import Dict, Optional
import pandas as pd
from avatars.lib.split_columns_types import split_columns_types
# [docs]
class GroupModalitiesProcessor:
    """Processor to group modalities in order to reduce the dataframe dimension.

    Every modality whose number of occurrences in a column is lower than or
    equal to the threshold applied to that column is renamed to `new_category`.

    Use the parameter `variable_thresholds` if you want to apply a custom
    threshold to each variable.
    Use the parameters `min_unique` and `global_threshold` if you want to apply
    a generic threshold.
    The two modes are mutually exclusive: exactly one of them must be provided.

    Keyword Arguments
    -----------------
        variable_thresholds:
            dictionary of variables and thresholds to apply,
            see global_threshold below.
        min_unique:
            number of unique modalities by variable needed to be transformed.
        global_threshold:
            limit of the number of individuals in each category to rename it.
        new_category:
            new modality name (default="other").

    Raises
    ------
        ValueError:
            if only one of (`global_threshold`, `min_unique`) is given, or if
            `variable_thresholds` and `global_threshold` are both given or
            both missing.

    Examples
    --------
    >>> df = pd.DataFrame(
    ...    {
    ...        "variable_1": ["red", "blue", "blue", "green"],
    ...        "variable_2": ["red", "blue", "blue", "red"],
    ...        "variable_3": ["green", "green", "green", "green"],
    ...    }
    ... )
    >>> df
      variable_1 variable_2 variable_3
    0        red        red      green
    1       blue       blue      green
    2       blue       blue      green
    3      green        red      green
    >>> processor = GroupModalitiesProcessor(
    ...    min_unique=2,
    ...    global_threshold=1,
    ...    new_category="other"
    ... )
    >>> processor.preprocess(df)
      variable_1 variable_2 variable_3
    0      other        red      green
    1       blue       blue      green
    2       blue       blue      green
    3      other        red      green
    """

    def __init__(
        self,
        *,
        variable_thresholds: Optional[Dict[str, int]] = None,
        min_unique: Optional[int] = None,
        global_threshold: Optional[int] = None,
        new_category: str = "other",
    ):
        # NOTE: validation relies on truthiness, so 0 and an empty dict are
        # treated the same way as "not provided".
        if bool(min_unique) != bool(global_threshold):
            raise ValueError(
                "Expected both of (global_threshold, min_unique), got "
                f"{(global_threshold, min_unique)} instead."
            )
        if bool(variable_thresholds) == bool(global_threshold):
            raise ValueError(
                "Expected variable_thresholds or (global_threshold, min_unique), "
                f"got variable_thresholds {variable_thresholds} and "
                f"(global_threshold, min_unique) {(global_threshold, min_unique)} instead."
            )

        self.variable_thresholds = variable_thresholds
        self.min_unique = min_unique
        self.global_threshold = global_threshold
        self.new_category = new_category

    def _resolve_thresholds(self, df: pd.DataFrame) -> Dict[str, int]:
        """Return the ``{column: threshold}`` mapping to apply to `df`.

        The mapping is computed for this call only and is never stored on the
        instance, so the same processor can be applied to several dataframes.

        Raises
        ------
            ValueError:
                if `variable_thresholds` names columns missing from `df`, or
                if neither `variable_thresholds` nor `min_unique` is set.
        """
        if self.variable_thresholds:
            unknown_variables = [
                elem for elem in self.variable_thresholds if elem not in df.columns
            ]
            if unknown_variables:
                raise ValueError(
                    "Expected valid variables in variable_thresholds, "
                    f"got {unknown_variables} instead."
                )
            return dict(self.variable_thresholds)

        # Select the candidate columns: the second index returned by
        # split_columns_types is used as the set of categorical columns.
        _, category_index = split_columns_types(df)
        categories = df.columns[category_index]

        # Keep only the columns having at least `min_unique` distinct modalities.
        count_category = df[categories].nunique()
        if self.min_unique is None:
            raise ValueError(
                "min_unique must be set when variable_thresholds is not provided."
            )
        columns_to_reduce = count_category[count_category >= self.min_unique].index.tolist()

        if self.global_threshold is None:
            return {}
        return {column: self.global_threshold for column in columns_to_reduce}

    def preprocess(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of `df` where rare modalities are renamed to `new_category`.

        A modality is renamed when its number of occurrences in the column is
        lower than or equal to the threshold applied to that column.
        The input dataframe and the processor configuration are left untouched.

        Raises
        ------
            ValueError:
                if `variable_thresholds` names columns missing from `df`, or
                if neither `variable_thresholds` nor `min_unique` is set.
        """
        thresholds = self._resolve_thresholds(df)
        df = df.copy()
        if not thresholds:
            return df

        # Build, for each column, the mapping {rare modality: new_category}.
        correspondence = {
            column: {
                modality: self.new_category
                for modality, occurrences in df[column].value_counts().to_dict().items()
                if occurrences <= threshold
            }
            for column, threshold in thresholds.items()
        }
        # A nested dict restricts each replacement to its own column.
        return df.replace(correspondence)  # type: ignore[arg-type]

    def postprocess(self, source: pd.DataFrame, dest: pd.DataFrame) -> pd.DataFrame:
        """Return `dest` unchanged; no inverse transformation is applied."""
        return dest