# Source code for avatars.processors.perturbation
from typing import Dict, Optional
import numpy as np
import pandas as pd
from toolz.dicttoolz import valfilter
from avatars.lib.split_columns_types import NUMERIC_DTYPES
# [docs]
class PerturbationProcessor:
    """Processor to reduce the difference between originals and avatars.

    Specifies the perturbation level of specified variables, 0 means no
    perturbation. A level of ``1.0`` leaves the avatar column untouched, and so
    does not listing the variable at all
    (default: ``np.ones(df.shape[1])``).

    Levels are used as probabilities for non-numeric columns, so they are
    expected to lie in ``[0, 1]``.

    Arguments
    ---------
        perturbation_level:
            variables and perturbation level

    Keyword Arguments
    -----------------
        seed:
            A seed to initialize the BitGenerator.

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> df = pd.DataFrame(np.zeros(3), columns=["column"], dtype="float")
    >>> df
       column
    0     0.0
    1     0.0
    2     0.0
    >>> processor = PerturbationProcessor(perturbation_level={"column": 0.3}, seed=1)
    >>> processor.preprocess(df)
       column
    0     0.0
    1     0.0
    2     0.0
    >>> avatar = pd.DataFrame(np.ones(3), columns=["column"], dtype="float")
    >>> avatar
       column
    0     1.0
    1     1.0
    2     1.0

    The post process reduces the gap between df and avatar

    >>> processor.postprocess(df, avatar)
       column
    0     0.3
    1     0.3
    2     0.3
    """

    def __init__(
        self,
        perturbation_level: Optional[Dict[str, float]] = None,
        *,
        seed: Optional[int] = None,
    ):
        self.perturbation_level = perturbation_level
        self.generator = np.random.default_rng(seed)

    @property
    def perturbated_items(self) -> Dict[str, float]:
        """Return the variables whose level differs from ``1.0``.

        Derived from the current ``perturbation_level`` on every access, so it
        is always defined (an empty dict when no level was given) and never
        goes stale if ``perturbation_level`` is reassigned after construction.
        """
        levels = self.perturbation_level or {}
        # A level of exactly 1.0 would leave the column unchanged: skip it.
        return {column: level for column, level in levels.items() if level != 1.0}

    def preprocess(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return ``df`` as is; this processor only acts in ``postprocess``."""
        return df

    def postprocess(self, source: pd.DataFrame, dest: pd.DataFrame) -> pd.DataFrame:
        """Force to reduce the difference between originals and avatars.

        Arguments
        ---------
            source:
                the original dataframe
            dest:
                the avatar dataframe; it is modified in place

        Returns
        -------
            ``dest``, with each perturbated column pulled towards ``source``.

        Raises
        ------
            ValueError:
                if a variable of ``perturbation_level`` is not a column of ``source``
        """
        if not self.perturbation_level:
            return dest

        # Every configured variable is checked, including those at level 1.0.
        missing_keys = set(self.perturbation_level).difference(source.columns)
        if missing_keys:
            raise ValueError(
                "perturbation_level",
                f"variables {missing_keys} cannot be found in the dataframe",
            )

        for column, level in self.perturbated_items.items():
            if dest[column].dtypes in NUMERIC_DTYPES:
                # For continuous data we apply a percentage of the difference.
                # NOTE: this is pandas arithmetic, hence aligned on index labels,
                # whereas the categorical branch below is positional; both agree
                # when source and dest share the same index.
                dest[column] = source[column] + ((dest[column] - source[column]) * level)
            else:
                # For categorical data we draw, row by row, 1 (keep the avatar)
                # with probability `level` and 0 (take the original) otherwise.
                indices = self.generator.choice(
                    [0, 1], source.shape[0], p=[1 - level, level], replace=True
                )
                # Choose either to take the sample from source or from dest.
                choices = np.choose(
                    indices,
                    [
                        source[column].to_numpy().ravel(),
                        dest[column].to_numpy().ravel(),
                    ],
                )
                dest[column] = choices
        return dest