class GroupModalitiesProcessor:
    """Processor to group rare modalities in order to reduce the dataframe dimension.

    Use the parameter `variable_thresholds` if you want to apply a custom
    threshold to each variable. Use the parameters `min_unique` and
    `global_threshold` (both together) if you want to apply a generic
    threshold. The two modes are mutually exclusive.

    Keyword Arguments
    -----------------
    variable_thresholds:
        dictionary of variables and thresholds to apply, see
        global_threshold below.
    min_unique:
        number of unique modalities by variable needed to be transformed.
    global_threshold:
        limit of the number of individuals in each category to rename it
        (modalities whose count is lower than or equal to it are renamed).
    new_category:
        new modality name (default="other").

    Raises
    ------
    ValueError
        If only one of `min_unique` / `global_threshold` is given, or if
        neither or both of `variable_thresholds` and `global_threshold` are
        given.

    Examples
    --------
    >>> df = pd.DataFrame(
    ...     {
    ...         "variable_1": ["red", "blue", "blue", "green"],
    ...         "variable_2": ["red", "blue", "blue", "red"],
    ...         "variable_3": ["green", "green", "green", "green"],
    ...     }
    ... )
    >>> df
      variable_1 variable_2 variable_3
    0        red        red      green
    1       blue       blue      green
    2       blue       blue      green
    3      green        red      green
    >>> processor = GroupModalitiesProcessor(
    ...     min_unique=2,
    ...     global_threshold=1,
    ...     new_category="other"
    ... )
    >>> processor.preprocess(df)
      variable_1 variable_2 variable_3
    0      other        red      green
    1       blue       blue      green
    2       blue       blue      green
    3      other        red      green
    """

    def __init__(
        self,
        *,
        variable_thresholds: Optional[Dict[str, int]] = None,
        min_unique: Optional[int] = None,
        global_threshold: Optional[int] = None,
        new_category: str = "other",
    ):
        # min_unique and global_threshold only make sense as a pair.
        # Truthiness is used on purpose: None and 0 both count as "not set".
        if bool(min_unique) != bool(global_threshold):
            raise ValueError(
                f"Expected both of (global_threshold, min_unique), got "
                f"{(global_threshold, min_unique)} instead."
            )
        # Exactly one mode must be selected: per-variable or global.
        if bool(variable_thresholds) == bool(global_threshold):
            raise ValueError(
                f"Expected variable_thresholds or (global_threshold, min_unique), "
                f"got variable_thresholds {variable_thresholds} and "
                f"(global_threshold, min_unique) "
                f"{(global_threshold, min_unique)} instead."
            )
        self.variable_thresholds = variable_thresholds
        self.min_unique = min_unique
        self.global_threshold = global_threshold
        self.new_category = new_category

    def preprocess(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of `df` where rare modalities are renamed `new_category`.

        A modality is renamed when its number of occurrences in `df` is lower
        than or equal to the threshold that applies to its column. Neither the
        input dataframe nor the processor configuration is modified.

        Raises
        ------
        ValueError
            If `variable_thresholds` references columns missing from `df`.
        """
        if self.variable_thresholds:
            unknown_variables = [
                name for name in self.variable_thresholds if name not in df.columns
            ]
            if unknown_variables:
                raise ValueError(
                    "Expected valid variables in variable_thresholds, "
                    f"got {unknown_variables} instead."
                )
            thresholds: Dict[str, int] = dict(self.variable_thresholds)
        else:
            # Computed per call and kept local: storing it on the instance
            # would freeze the column selection of the first dataframe seen.
            thresholds = self._global_thresholds(df)

        # Occurrences of every modality, for each column to transform.
        count = {name: df[name].value_counts().to_dict() for name in thresholds}
        correspondence = {
            name: mapping
            for name, mapping in (
                (
                    name,
                    {
                        modality: self.new_category
                        for modality, occurrences in modalities.items()
                        if occurrences <= thresholds[name]
                    },
                )
                for name, modalities in count.items()
            )
            if mapping  # skip columns where nothing has to be renamed
        }
        if not correspondence:
            return df.copy()
        # Nested dict form: {column: {old_value: new_value}}.
        return df.replace(correspondence)  # type: ignore[arg-type]

    def _global_thresholds(self, df: pd.DataFrame) -> Dict[str, int]:
        """Map each categorical column with enough modalities to the global threshold."""
        # select columns to transform
        _, category_index = split_columns_types(df)
        categories = df.columns[category_index]
        # Select columns to reduce
        count_category = df[categories].nunique()
        columns_to_reduce = count_category[
            count_category >= self.min_unique
        ].index.tolist()
        if not self.global_threshold:
            return {}
        return {name: self.global_threshold for name in columns_to_reduce}