Source code for avatars.processors.relative_difference

from typing import List, Optional, Union

import pandas as pd



[docs]
class RelativeDifferenceProcessor:
    """Express numeric variables as a difference relative to the sum of other variables.

    Even if the avatarization is keeping relation and correlation,
    it will not guarantee mathematical relation retention.
    You can apply the RelativeDifferenceProcessor to retain this relation between variables.

    Arguments
    ---------
        target:
            variable to transform
        references:
            the variable(s) of reference: a list of column names, or a single
            column name (which is treated as a one-element list). The target is
            expressed relative to the row-wise sum of these columns.

    Keyword Arguments
    -----------------
        scaling_unit:
            divide difference by factor to handle unit variation.
            Eg. if scaling_unit=1000, a difference in meters will be expressed in kilometers.
            ``None`` (the default) and ``0`` both fall back to ``1``.
        target_rename:
            target name after preprocess. Defaults to ``target``, in which case
            the target column is overwritten by ``preprocess``.
        drop_original_target:
            drop original_target. Can only be set to ``True``
            if ``target_rename`` is specified

    Raises
    ------
        ValueError
            if ``drop_original_target`` is truthy while ``target_rename`` is ``None``.

    Examples
    --------
    >>> import numpy as np
    >>> df = pd.DataFrame(
    ...    {
    ...        "variable_1": [100, 150, 120, 100],
    ...        "variable_2": [110, 180, 130, np.nan]
    ...        }
    ...    )
    >>> processor = RelativeDifferenceProcessor(target="variable_2", references=["variable_1"])
    >>> df = processor.preprocess(df)
    >>> df
       variable_1  variable_2
    0         100        10.0
    1         150        30.0
    2         120        10.0
    3         100         NaN

    This preprocess allows you to convert some variable as a difference of other. It can be
    useful when there is a relation between variables such as `variable_2 >= variable_1`

    >>> avatar = pd.DataFrame(
    ...    {
    ...        "variable_1": [110, 105, 115, 107],
    ...        "variable_2": [12, np.nan, 23, 15],
    ...        }
    ...    )
    >>> avatar
       variable_1  variable_2
    0         110        12.0
    1         105         NaN
    2         115        23.0
    3         107        15.0
    >>> avatar = processor.postprocess(df, avatar)
    >>> avatar
       variable_1  variable_2
    0         110       122.0
    1         105         NaN
    2         115       138.0
    3         107       122.0

    This processor can be useful when you have a relation between three variables.
    Let's suppose you have three variables such as:

    - age_at_t0
    - age_at_t1
    - age_at_t2

    The relation is age_at_t0 < age_at_t1 < age_at_t2, for all the individuals.

    >>> df = pd.DataFrame(
    ...    {
    ...        "age_at_t0": [20, 40, 34, 56],
    ...        "age_at_t1": [23, 46, 37, 57],
    ...        "age_at_t2": [29, 54, 39, 64],
    ...        }
    ...    )
    >>> df
       age_at_t0  age_at_t1  age_at_t2
    0         20         23         29
    1         40         46         54
    2         34         37         39
    3         56         57         64
    >>> processor_1 = RelativeDifferenceProcessor(target="age_at_t2", references=["age_at_t1"])
    >>> processor_2 = RelativeDifferenceProcessor(target="age_at_t1", references=["age_at_t0"])

    .. note::

        Be careful about the order of application of the processors

    >>> processed = processor_1.preprocess(df)
    >>> processor_2.preprocess(processed)
       age_at_t0  age_at_t1  age_at_t2
    0         20        3.0        6.0
    1         40        6.0        8.0
    2         34        3.0        2.0
    3         56        1.0        7.0

    >>> avatar = pd.DataFrame(
    ...    {
    ...        "age_at_t0": [22, 38, 34, 56],
    ...        "age_at_t1": [4.0, 5.0, 1.0, 5.0],
    ...        "age_at_t2": [5.0, 3.0, 7.0, 6.0],
    ...        }
    ...    )
    >>> avatar
       age_at_t0  age_at_t1  age_at_t2
    0         22        4.0        5.0
    1         38        5.0        3.0
    2         34        1.0        7.0
    3         56        5.0        6.0
    >>> post_avatar = processor_2.postprocess(df, avatar)
    >>> processor_1.postprocess(df, post_avatar)
       age_at_t0  age_at_t1  age_at_t2
    0         22       26.0       31.0
    1         38       43.0       46.0
    2         34       35.0       42.0
    3         56       61.0       67.0
    """

    def __init__(
        self,
        target: str,
        references: Union[str, List[str]],
        scaling_unit: Optional[int] = None,
        target_rename: Optional[str] = None,
        drop_original_target: Optional[bool] = False,
    ):
        # Dropping the original column only makes sense when the transformed
        # values are written to a different column.
        if drop_original_target and target_rename is None:
            raise ValueError(
                "Expected drop_original_target to be False if a target_rename is None, "
                f"got {drop_original_target} instead",
            )

        self.target = target

        # A bare string would otherwise be iterated character by character by
        # the column checks, producing a misleading "missing variables" error.
        if isinstance(references, str):
            references = [references]
        # Copy so that later mutation of the caller's list cannot change the
        # behaviour of this processor.
        self.references = list(references)

        # ``None`` and ``0`` both mean "no scaling".
        self.scaling_unit = scaling_unit or 1
        self.target_rename = target if target_rename is None else target_rename
        self.drop_original_target = drop_original_target

    def _check_references(self, df: pd.DataFrame) -> None:
        """Raise ``ValueError`` if any reference column is absent from ``df``."""
        wrong_variables = set(self.references) - set(df.columns.values)
        if wrong_variables:
            raise ValueError(
                "Expected all reference variables in dataset columns, "
                f"got {wrong_variables} instead."
            )

    def preprocess(self, df: pd.DataFrame) -> pd.DataFrame:
        """Transform a numeric variable into a difference relative to the sum of other variables.

        The result, ``(target - sum(references)) / scaling_unit``, is written to
        the ``target_rename`` column. The input dataframe is not modified; a
        copy is transformed and returned.

        Arguments
        ---------
            df: dataframe to transform

        Returns
        -------
            a dataframe with the transformed version of wanted columns

        Raises
        ------
            ValueError
                if a reference column is missing from ``df`` or if any
                reference column contains missing values.
        """
        df = df.copy()

        self._check_references(df)
        if df[self.references].isnull().values.any():
            raise ValueError(
                "Expected no missing values for `references`, "
                "got column with missing values instead"
            )

        reference_total = df[self.references].sum(axis=1)
        df[self.target_rename] = df[self.target].sub(reference_total) / self.scaling_unit

        if self.drop_original_target:
            df = df.drop(columns=[self.target])
        return df

    def postprocess(self, source: pd.DataFrame, dest: pd.DataFrame) -> pd.DataFrame:
        """Transform a difference relative to the sum of variables into an absolute numeric value.

        The result, ``target_rename * scaling_unit + sum(references)``, is
        written to the ``target`` column. The ``target_rename`` column is left
        in place. ``dest`` is not modified; a copy is transformed and returned.

        Arguments
        ---------
            source: not used
            dest: dataframe to transform

        Returns
        -------
            a dataframe with the transformed version of wanted columns

        Raises
        ------
            ValueError
                if a reference column is missing from ``dest``.
        """
        dest = dest.copy()
        self._check_references(dest)

        reference_total = dest[self.references].sum(axis=1)
        dest[self.target] = (dest[self.target_rename] * self.scaling_unit).add(reference_total)
        return dest