# Source code for avatars.processors.proportions

from typing import List

import numpy as np
import pandas as pd

from avatars.lib.saferound import saferound



# [docs]
class ProportionProcessor:
    """Processor to express numeric variables as a proportion of another variable.

    By this transformation, we keep the addition and subtraction relations
    such as variable_1 = variable_2 + variable_3.

    Arguments
    ---------
        variable_names:
            variables to transform
        reference:
            the variable of reference

    Keyword Arguments
    -----------------
        sum_to_one:
            set to True to ensure the sum of the variables sum to 1 once transformed.
            default: True
        decimal_count:
            the number of decimals postprocessed variables should have.
            Rounding is only applied by ``postprocess`` when ``sum_to_one`` is True.
            default: 1

    Examples
    --------
    >>> df =  pd.DataFrame(
    ...        {
    ...            "variable_1": [100, 10],
    ...            "variable_2": [10, 10],
    ...            "variable_3": [90, 30],
    ...        }
    ...    )
    >>> processor = ProportionProcessor(
    ...    variable_names=["variable_2", "variable_3"],
    ...    reference="variable_1",
    ... )
    >>> processor.preprocess(df=df)
       variable_1  variable_2  variable_3
    0         100        0.10        0.90
    1          10        0.25        0.75

    This processor allows you to transform some variables into a proportion of another
    variable. By default, the processor enforces the sum of the proportions of
    `variable_names` to be equal to 1.

    >>> avatar = pd.DataFrame(
    ...        {
    ...            "variable_1": [60, 15],
    ...            "variable_2": [0.15, 0.88],
    ...            "variable_3": [0.18, 0.77],
    ...        }
    ...    )
    >>> avatar
       variable_1  variable_2  variable_3
    0          60        0.15        0.18
    1          15        0.88        0.77

    Then the postprocess allows you to get the original variable unit.

    >>> processor.postprocess(df, avatar)
       variable_1  variable_2  variable_3
    0          60        27.3        32.7
    1          15         8.0         7.0

    By this, we keep the mathematical relation variable_1 = variable_2 + variable_3

    with `sum_to_one=False`

    >>> processor = ProportionProcessor(
    ...    variable_names=["variable_2", "variable_3"],
    ...    reference="variable_1",
    ...    sum_to_one=False,
    ... )
    >>> processor.preprocess(df=df)
       variable_1  variable_2  variable_3
    0         100         0.1         0.9
    1          10         1.0         3.0
    >>> avatar = pd.DataFrame(
    ...        {
    ...            "variable_1": [60, 15],
    ...            "variable_2": [0.15, 0.88],
    ...            "variable_3": [1.5, 2.8],
    ...        }
    ...    )
    >>> avatar
       variable_1  variable_2  variable_3
    0          60        0.15         1.5
    1          15        0.88         2.8
    >>> processor.postprocess(df, avatar)
       variable_1  variable_2  variable_3
    0          60         9.0        90.0
    1          15        13.2        42.0
    """

    def __init__(
        self,
        variable_names: List[str],
        reference: str,
        *,
        sum_to_one: bool = True,
        decimal_count: int = 1,
    ):
        self.variable_names = variable_names
        self.reference = reference
        self.sum_to_one = sum_to_one
        self.decimal_count = decimal_count

    def _ensure_reference_present(self, df: pd.DataFrame) -> None:
        """Raise ``ValueError`` when the reference column is missing from ``df``."""
        if self.reference not in df.columns.values:
            raise ValueError(
                "variable_name",
                f"variable {self.reference} cannot be found in the dataframe variables",
            )

    def preprocess(self, df: pd.DataFrame) -> pd.DataFrame:
        """Transform numeric variables into proportion of another variable.

        With ``sum_to_one=True`` each variable is divided by the row-wise sum of
        ``variable_names``; otherwise it is divided by the reference variable.

        If some values for the variables to transform are set to nan, they will be transformed
        into nan and will be considered as a 0% proportion of the reference when transforming
        values of other variables.

        Arguments
        ---------
            df: dataframe to transform

        Returns
        -------
            DataFrame: a dataframe with the transformed version of wanted columns

        Raises
        ------
            ValueError: if the reference variable is not a column of ``df``
        """
        col_order = df.columns
        df = df.copy()
        self._ensure_reference_present(df)

        targets = df[self.variable_names]
        denominator = targets.sum(axis=1) if self.sum_to_one else df[self.reference]
        sub_df = targets.div(denominator, axis=0)

        # Ensure that target variables that were set to zero remain at zero
        # (0 / 0 would otherwise produce nan). The mask is applied by position so
        # that rows sharing an index label cannot overwrite each other.
        was_zero = targets.eq(0).fillna(False).to_numpy(dtype=bool)
        sub_df = sub_df.mask(was_zero, 0)

        df = df.drop(columns=self.variable_names)
        df = pd.concat([df, sub_df], axis=1)[col_order]
        return df

    def postprocess(self, source: pd.DataFrame, dest: pd.DataFrame) -> pd.DataFrame:
        """Transform proportion of another variable into an absolute numeric value.

        Each proportion is multiplied by the reference variable. With
        ``sum_to_one=True`` the proportions are first rescaled by their row-wise sum,
        and rows without nan are rounded to ``decimal_count`` decimals. With
        ``sum_to_one=False`` no rounding is applied.

        Arguments
        ---------
            source: not used
            dest: dataframe to transform

        Returns
        -------
            DataFrame: a dataframe with the transformed version of wanted columns

        Raises
        ------
            ValueError: if the reference variable is not a column of ``dest``
        """
        col_order = dest.columns
        dest = dest.copy()
        self._ensure_reference_present(dest)

        proportions = dest[self.variable_names]
        sub_df = proportions.mul(dest[self.reference], axis=0)
        if self.sum_to_one:
            sub_df = sub_df.div(proportions.sum(axis=1), axis=0)

        dest = dest.drop(columns=self.variable_names)
        dest = pd.concat([dest, sub_df], axis=1)[col_order]

        if not self.sum_to_one:
            return dest

        # Perform rounding of the postprocess variable to the expected number of decimals.
        # We use saferounding here to force the sum of rounded variables to remain unchanged.
        # Rows are addressed by position (iloc), not by label, so the result is written
        # to the right row whatever the index of `dest` looks like.
        column_positions = list(dest.columns.get_indexer(self.variable_names))
        for position, row in enumerate(dest[self.variable_names].values):
            if not np.any(np.isnan(row)):
                dest.iloc[position, column_positions] = saferound(
                    row.tolist(), self.decimal_count
                )

        return dest