# Source code for avatars.processors.expected_mean

import warnings
from typing import List, Optional

import pandas as pd


class ExpectedMeanProcessor:
    """Processor to force values to have similar mean to original data.

    Means and standard deviations are computed for groups of variables and the
    processor ensures that the transformed data has similar mean and std than
    in the original data for each group.

    Care should be taken when using this processor as it only targets
    enhancement of unimodal utility. This may occur at the expense of
    multi-modal utility and privacy.

    Arguments
    ---------
        target_variables:
            variables to transform

    Keyword Arguments
    -----------------
        groupby_variables:
            variables to use to group values in different distributions
        same_std:
            Set to True to force the variables to transform to have the same
            standard deviation as the reference data. default: False.

    Examples
    --------
    >>> import numpy as np
    >>> df = pd.DataFrame(np.array(([1, 2, 3], [4, 5, 6], [4, 5, 6], [1, 2, 3])),
    ...                   columns=['one', 'two', 'three'])
    >>> df = df.astype('int')
    >>> processor = ExpectedMeanProcessor(target_variables = ['one'])
    >>> processed = processor.preprocess(df)

    The processor forces your synthetic dataset to have the same mean as the
    original.

    >>> avatar = pd.DataFrame(np.array(([3, 2, 3], [3, 5, 6], [8, 5, 6], [8, 2, 3])),
    ...                       columns=['one', 'two', 'three'])
    >>> avatar.one.mean()
    5.5
    >>> avatar = processor.postprocess(df, avatar)
    >>> avatar.one.mean()
    2.5

    You can also force the mean by category using ```groupby_variables```

    >>> df = pd.DataFrame(
    ...     {
    ...         "variable_1": [11, 24, 23.5, 12],
    ...         "variable_2": ["red", "blue", "blue", "red"],
    ...     }
    ... )
    >>> df
       variable_1 variable_2
    0        11.0        red
    1        24.0       blue
    2        23.5       blue
    3        12.0        red
    >>> df.groupby("variable_2").mean()
    ... # doctest: +NORMALIZE_WHITESPACE
                variable_1
    variable_2
    blue             23.75
    red              11.50
    >>> processor = ExpectedMeanProcessor(
    ...     target_variables = ['variable_1'], groupby_variables= ['variable_2'],
    ... )
    >>> processor.preprocess(df)
       variable_1 variable_2
    0        11.0        red
    1        24.0       blue
    2        23.5       blue
    3        12.0        red
    >>> avatar = pd.DataFrame(
    ...     {
    ...         "variable_1": [12, 13.5, 23.5, 22],
    ...         "variable_2": ["red", "red", "blue", "blue"],
    ...     }
    ... )
    >>> avatar
       variable_1 variable_2
    0        12.0        red
    1        13.5        red
    2        23.5       blue
    3        22.0       blue
    >>> avatar.groupby("variable_2").mean()
    ... # doctest: +NORMALIZE_WHITESPACE
                variable_1
    variable_2
    blue             22.75
    red              12.75
    >>> avatar = processor.postprocess(df, avatar)
    >>> avatar
       variable_1 variable_2
    0       10.75        red
    1       12.25        red
    2       24.50       blue
    3       23.00       blue
    >>> avatar.groupby("variable_2").mean()
    ... # doctest: +NORMALIZE_WHITESPACE
                variable_1
    variable_2
    blue             23.75
    red              11.50
    """

    def __init__(
        self,
        target_variables: List[str],
        *,
        groupby_variables: Optional[List[str]] = None,
        same_std: bool = False,
    ):
        # Sentinel column added temporarily when no groupby is used. It is a
        # column with only one modality that can be used to perform a groupby
        # in order to get the expected mean and std on all records.
        self.nogroup_name = "___NOGROUPVAR___"
        self.nogroup_value = "__NOGROUPVAL__"
        if groupby_variables is None:
            self.groupby_variables = [self.nogroup_name]
            self.is_nogroup = True
        else:
            self.groupby_variables = groupby_variables
            self.is_nogroup = False
        self.target_variables = target_variables
        self.same_std = same_std

    def preprocess(self, df: pd.DataFrame) -> pd.DataFrame:
        """Compute the reference mean and standard deviations.

        Arguments
        ---------
            df: reference dataframe

        Returns
        -------
            df: the unaltered reference dataframe.
        """
        working = df.copy()
        if self.is_nogroup:
            # One-modality sentinel so the groupby spans all records.
            working[self.nogroup_name] = self.nogroup_value
        cols = self.groupby_variables + self.target_variables
        self.properties_df = _get_distribution_data(
            df=working[cols],
            target_variables=self.target_variables,
            groupby_variables=self.groupby_variables,
        )
        # Fix: return a copy of the original frame instead of the working
        # copy, so the temporary sentinel column never leaks to the caller
        # when no groupby is used (matches the documented contract).
        return df.copy()

    def postprocess(self, source: pd.DataFrame, dest: pd.DataFrame) -> pd.DataFrame:
        """Force the data to have the reference mean.

        Arguments
        ---------
            source: not used
            dest: dataframe to transform

        Returns
        -------
            dest: a dataframe with the transformed target columns
        """
        original_cols = dest.columns
        # Fix: operate on a copy so the caller's dataframe is never mutated
        # in place (the original assigned the sentinel column onto `dest`
        # before the merges rebound the local name).
        dest = dest.copy()
        if self.is_nogroup:
            # if no groupby, create a one-modality temporary variable
            dest[self.nogroup_name] = self.nogroup_value
        cols = self.groupby_variables + self.target_variables
        current_properties_df = _get_distribution_data(
            df=dest[cols],
            target_variables=self.target_variables,
            groupby_variables=self.groupby_variables,
        )
        # Distinguish the avatar-side statistics from the reference ones
        # before merging both onto every row.
        current_properties_df = current_properties_df.rename(
            columns={f"{col}mean": f"{col}mean_current" for col in self.target_variables}
        )
        current_properties_df = current_properties_df.rename(
            columns={f"{col}std": f"{col}std_current" for col in self.target_variables}
        )
        dest = dest.merge(
            current_properties_df,
            left_on=self.groupby_variables,
            right_on=self.groupby_variables,
        )
        dest = dest.merge(
            self.properties_df,
            left_on=self.groupby_variables,
            right_on=self.groupby_variables,
        )
        for col in self.target_variables:
            same_std_tmp = self.same_std
            if self.same_std and 0 in current_properties_df[f"{col}std_current"].values:
                # A zero std would divide by zero below; degrade gracefully to
                # the mean-only shift for this variable.
                warnings.warn(
                    f"""target variable {col} has a std of 0.
                    The same standard deviation cannot be guaranteed for this variable.
                    """
                )
                same_std_tmp = False
            if same_std_tmp:
                # Recenter and rescale so both mean and std match the reference.
                dest[col] = dest[f"{col}mean"] + (dest[col] - dest[f"{col}mean_current"]) * (
                    dest[f"{col}std"] / dest[f"{col}std_current"]
                )
            else:
                # Shift so the group mean matches the reference mean.
                dest[col] = dest[f"{col}mean"] + (dest[col] - dest[f"{col}mean_current"])
        # Drop the merged statistic columns (and any sentinel column).
        dest = dest[original_cols]
        return dest
def _get_distribution_data(
    df: pd.DataFrame,
    target_variables: List[str],
    groupby_variables: List[str],
) -> pd.DataFrame:
    """Get mean and standard deviation for given variables and groupby.

    Arguments
    ---------
        df: data on which to compute statistics
        target_variables: list of columns on which to compute statistics
        groupby_variables: list of columns to aggregate on

    Returns
    -------
        stats_df: statistics dataframe.
    """
    # Aggregate every non-group column, keeping NaN group keys as groups.
    grouped = df.groupby(groupby_variables, dropna=False)
    stats_df = grouped.agg(["mean", "std"]).reset_index()
    # Collapse the two-level header: ("var", "mean") -> "varmean", and the
    # group columns ("group", "") keep their plain name.
    flat_names = [name + stat for name, stat in stats_df.columns.to_flat_index()]
    stats_df.columns = pd.Index(flat_names)
    return stats_df