# Source code for avatar_yaml.models.parameters

from dataclasses import dataclass
from enum import Enum, StrEnum
from typing import Any, List, Literal, Optional

from avatar_yaml.models.common import Metadata, ModelKind
from avatar_yaml.yaml_utils import to_yaml


@dataclass(frozen=False)
class AvatarizationProcessorParameters:
    """Base class for all avatarization processor parameters.

    Subclass this to define parameters for a specific server-side processor that runs within the
    avatarization pipeline.

    The base class declares no fields of its own. It gives every processor configuration a
    common type, which is the element type of the ``avatarization_processors_parameters``
    lists declared on the avatarization parameter classes of this module.
    """


@dataclass(frozen=False)
class InterRecordRangeDifferenceParameters(AvatarizationProcessorParameters):
    """Configuration of the inter-record range difference processor.

    The processor converts a pair of start/end columns into an internal representation,
    which can lead to a better semantic avatarization. The conversion is transparent to the
    user: in the end, input and output expose the same column structure.

    Records are automatically sorted by ``target_start_variable`` for processing.

    Raises
    ------
        ValueError:
            at construction time, when ``target_start_variable`` and
            ``target_end_variable`` are equal.
    """

    id_variable: str
    target_start_variable: str
    target_end_variable: str

    def __post_init__(self):
        """Reject a configuration whose start and end columns are the same."""
        start, end = self.target_start_variable, self.target_end_variable
        if start == end:
            message = (
                "target_start_variable and target_end_variable must be different, "
                f"got '{start}' for both"
            )
            raise ValueError(message)


@dataclass(frozen=False)
class RelativeDifferenceParameters(AvatarizationProcessorParameters):
    """Configuration of the relative difference processor.

    The processor rewrites a numeric variable as a difference relative to the sum of other
    variables, which can help retain the mathematical relation between correlated
    variables. The rewrite is transparent to the user: in the end, input and output expose
    the same column structure.

    Arguments
    ---------
        target:
            variable to transform
        references:
            the variables of reference

    Keyword Arguments
    -----------------
        scaling_unit:
            factor the difference is divided by, to handle unit variation.
            Eg. if scaling_unit=1000, a difference in meters will be expressed in kilometers.

    Raises
    ------
        ValueError:
            at construction time, when ``scaling_unit`` is provided and is not strictly
            positive.
    """

    target: str
    references: List[str]
    scaling_unit: Optional[int] = None

    def __post_init__(self):
        """Validate ``scaling_unit``; ``None`` means no unit was provided."""
        unit = self.scaling_unit
        if unit is None:
            return
        if unit <= 0:
            raise ValueError(
                f"scaling_unit must be strictly positive when provided, got {unit}"
            )


class AugmentationStrategy(StrEnum):
    """Named strategies accepted by ``DataAugmentationParameters.augmentation_strategy``.

    As a ``StrEnum``, every member is also a ``str`` equal to its value.
    """

    # NOTE(review): presumably augments only the least represented class — confirm.
    minority = "minority"
    # NOTE(review): presumably augments every class except the most represented — confirm.
    not_majority = "not_majority"


@dataclass(frozen=False)
class DataAugmentationParameters:
    """Data augmentation settings.

    Instances are attached to the avatarization parameter classes of this module through
    their ``data_augmentation`` field.
    """

    # Either a float, an AugmentationStrategy member, or a str -> float mapping.
    # NOTE(review): presumably a global ratio, a named strategy, or per-class ratios — confirm.
    augmentation_strategy: float | AugmentationStrategy | dict[str, float]
    # Required (no default), but may be explicitly None.
    target_column: str | None
    # Defaults to True.
    should_anonymize_original_table: bool | None = True


@dataclass(frozen=False)
class AvatarizationParameters:
    """Mutable parameter set for an avatarization.

    Only ``k`` is required; every other field defaults to ``None``.
    """

    # NOTE(review): presumably the number of neighbours used to build each avatar — confirm.
    k: int
    # NOTE(review): presumably the number of components kept by the projection — confirm.
    ncp: int | None = None
    use_categorical_reduction: bool | None = None
    # Mapping from a string key (presumably a column name — confirm) to a float weight.
    column_weights: dict[str, float] | None = None
    # Free-form mappings; this class does not constrain their keys or values.
    exclude_variables: dict[str, Any] | None = None
    imputation: dict[str, Any] | None = None
    # Optional data augmentation settings.
    data_augmentation: DataAugmentationParameters | None = None
    # Optional list of processor configurations (AvatarizationProcessorParameters instances).
    avatarization_processors_parameters: list[AvatarizationProcessorParameters] | None = None


@dataclass(frozen=False)
class TimeSeriesParameters:
    """Mutable parameter set for time series; both fields are optional.

    Both fields are free-form ``dict[str, Any]`` mappings; this class does not constrain
    their keys or values.
    """

    # NOTE(review): presumably holds a ProjectionType-based configuration — confirm.
    projection: dict[str, Any] | None = None
    # NOTE(review): presumably holds an AlignmentMethod-based configuration — confirm.
    alignment: dict[str, Any] | None = None


@dataclass(frozen=False)
class AvatarizationOpenDPParameters:
    """Mutable parameter set for the "open DP" avatarization variant.

    Every field is optional. Apart from ``epsilon`` and ``preprocess_budget_ratio``, and the
    absence of ``k``, the fields mirror those of ``AvatarizationParameters``.

    NOTE(review): "DP" presumably stands for differential privacy, with ``epsilon`` as the
    privacy budget — confirm.
    """

    epsilon: float | None = None
    # Defaults to 0.3.
    # NOTE(review): presumably the share of epsilon spent on preprocessing — confirm.
    preprocess_budget_ratio: float | None = 0.3
    ncp: int | None = None
    use_categorical_reduction: bool | None = None
    # Mapping from a string key (presumably a column name — confirm) to a float weight.
    column_weights: dict[str, float] | None = None
    # Free-form mappings; this class does not constrain their keys or values.
    exclude_variables: dict[str, Any] | None = None
    imputation: dict[str, Any] | None = None
    # Optional data augmentation settings.
    data_augmentation: DataAugmentationParameters | None = None
    # Optional list of processor configurations (AvatarizationProcessorParameters instances).
    avatarization_processors_parameters: list[AvatarizationProcessorParameters] | None = None


# Keep old name for backward compatibility
# (the alias is the very same class object, not a subclass or a copy).
AvatarizationDPParameters = AvatarizationOpenDPParameters


@dataclass(frozen=False)
class AvatarizationFastDPParameters:
    """Mutable parameter set for the "fast DP" avatarization variant.

    Every field is optional. The first five fields are specific to this variant; the
    remaining ones mirror those of ``AvatarizationParameters`` (without ``k``).

    NOTE(review): "DP" presumably stands for differential privacy, with ``epsilon`` as the
    privacy budget — confirm.
    """

    epsilon: float | None = None
    # One of "gaussian", "gmm" or "histogram"; defaults to "gaussian".
    mechanism: Literal["gaussian", "gmm", "histogram"] | None = "gaussian"
    # Defaults to 5. NOTE(review): presumably only used when mechanism == "gmm" — confirm.
    gmm_n_components: int | None = 5
    # Defaults to 10. NOTE(review): presumably only used when mechanism == "histogram" — confirm.
    histogram_n_bins: int | None = 10
    # Defaults to 5.0.
    bounds_percentile: float | None = 5.0
    ncp: int | None = None
    use_categorical_reduction: bool | None = None
    # Mapping from a string key (presumably a column name — confirm) to a float weight.
    column_weights: dict[str, float] | None = None
    # Free-form mappings; this class does not constrain their keys or values.
    exclude_variables: dict[str, Any] | None = None
    imputation: dict[str, Any] | None = None
    # Optional data augmentation settings.
    data_augmentation: DataAugmentationParameters | None = None
    # Optional list of processor configurations (AvatarizationProcessorParameters instances).
    avatarization_processors_parameters: list[AvatarizationProcessorParameters] | None = None


class AlignmentMethod(str, Enum):
    """String-valued enumeration of alignment method names.

    Members are also ``str`` instances, so they compare equal to their plain string value.

    NOTE(review): presumably consumed through ``TimeSeriesParameters.alignment`` — confirm.
    """

    SPECIFIED = "specified"
    MAX = "max"
    MIN = "min"
    MEAN = "mean"
class ProjectionType(str, Enum):
    """String-valued enumeration of projection type names.

    Members are also ``str`` instances, so they compare equal to their plain string value.

    NOTE(review): presumably consumed through ``TimeSeriesParameters.projection`` — confirm.
    """

    FPCA = "fpca"
    FLATTEN = "flatten"
class ImputeMethod(str, Enum):
    """String-valued enumeration of imputation method names.

    Members are also ``str`` instances, so they compare equal to their plain string value.

    NOTE(review): presumably consumed through the ``imputation`` mappings of the parameter
    classes — confirm.
    """

    KNN = "knn"
    MODE = "mode"
    MEDIAN = "median"
    MEAN = "mean"
    FAST_KNN = "fast_knn"
class ExcludeVariablesMethod(str, Enum):
    """The method to exclude column."""

    ROW_ORDER = "row_order"
    """SENSITIVE The excluded column will be linked to the original row order.
    This is a violation of privacy."""

    COORDINATE_SIMILARITY = "coordinate_similarity"
    """The excluded column will be linked by individual similarity."""
class ReportType(str, Enum):
    """String-valued enumeration of report types."""

    BASIC = "basic"
    PIA = "pia"


class ReportLanguage(str, Enum):
    """String-valued enumeration of report languages."""

    EN = "en"
    FR = "fr"


class OutputFormat(str, Enum):
    """String-valued enumeration of report output formats."""

    PDF = "pdf"
    DOCX = "docx"


@dataclass(frozen=False)
class SignalMetricsParameters:
    """Mutable parameter set for signal metrics; every field is optional."""

    ncp: int | None = None
    use_categorical_reduction: bool | None = None
    # Free-form mapping; this class does not constrain its keys or values.
    imputation: dict[str, Any] | None = None
    column_weights: dict[str, float] | None = None
    # Free-form mapping; this class does not constrain its keys or values.
    exclude_variables: dict[str, Any] | None = None


@dataclass(frozen=False)
class PrivacyMetricsParameters:
    """Mutable parameter set for privacy metrics; every field is optional."""

    ncp: int | None = None
    use_categorical_reduction: bool | None = None
    known_variables: list[str] | None = None
    target: str | None = None
    quantile_threshold: int | None = None
    # Free-form mappings; this class does not constrain their keys or values.
    imputation: dict[str, Any] | None = None
    exclude_variables: dict[str, Any] | None = None
    column_weights: dict[str, float] | None = None


@dataclass(frozen=True)
class Results:
    """Immutable description of a results location; every field is optional."""

    volume: str | None = None
    path: str | None = None
    format: str | None = None
    name_template: str | None = None
    max_distribution_plots: int | None = None


@dataclass(frozen=True)
class ReportParametersSpec:
    """Immutable ``spec`` section of a report document.

    The three string fields default to the plain values of ``ReportType.BASIC``,
    ``OutputFormat.PDF`` and ``ReportLanguage.EN``.
    """

    report_type: str = ReportType.BASIC.value
    output_format: str = OutputFormat.PDF.value
    language: str = ReportLanguage.EN.value
    results: Results | None = None


@dataclass(frozen=True)
class Report:
    """Immutable report document: a kind, its metadata and its spec."""

    kind: ModelKind
    metadata: Metadata
    spec: ReportParametersSpec


def get_report_parameters(
    metadata: Metadata,
    report_type: ReportType = ReportType.BASIC,
    results: Results | None = None,
    output_format: OutputFormat = OutputFormat.PDF,
    language: ReportLanguage = ReportLanguage.EN,
) -> str:
    """Build a ``Report`` document of kind ``ModelKind.REPORT`` and serialize it.

    The enum arguments are stored in the spec as their plain string ``.value``.

    Returns the result of ``to_yaml`` applied to the document.
    """
    spec = ReportParametersSpec(
        report_type=report_type.value,
        output_format=output_format.value,
        language=language.value,
        results=results,
    )
    report = Report(
        kind=ModelKind.REPORT,
        metadata=metadata,
        spec=spec,
    )
    return to_yaml(report)


@dataclass(frozen=True)
class ParametersSpec:
    """Immutable ``spec`` section shared by the parameter documents built in this module.

    Only ``schema`` is required. The avatarization, privacy-metrics and signal-metrics
    builders each fill in the subset of optional fields relevant to them.
    """

    schema: str
    avatarization: dict[str, AvatarizationParameters] | None = None
    avatarization_open_dp: dict[str, AvatarizationOpenDPParameters] | None = None
    avatarization_fast_dp: dict[str, AvatarizationFastDPParameters] | None = None
    avatarization_ref: str | None = None
    time_series: dict[str, TimeSeriesParameters] | None = None
    time_series_ref: str | None = None
    privacy_metrics: dict[str, PrivacyMetricsParameters] | None = None
    signal_metrics: dict[str, SignalMetricsParameters] | None = None
    results: Results | None = None
    seed: int | None = None


@dataclass(frozen=True)
class Parameters:
    """Immutable parameters document: a kind, its metadata and its spec."""

    kind: ModelKind
    metadata: Metadata
    spec: ParametersSpec


def get_avatarization_parameters(
    metadata: Metadata,
    schema_name: str,
    avatarization: dict[str, AvatarizationParameters] | None = None,
    time_series: dict[str, TimeSeriesParameters] | None = None,
    avatarization_open_dp: dict[str, AvatarizationOpenDPParameters] | None = None,
    avatarization_fast_dp: dict[str, AvatarizationFastDPParameters] | None = None,
    seed: int | None = None,
    # The default instance is shared across calls; that is safe because Results is frozen.
    results: Results | None = Results(volume="local-temp-results"),
) -> str:
    """Build a ``ModelKind.AVATARIZATION_PARAMETERS`` document and serialize it.

    ``schema_name`` is stored as ``ParametersSpec.schema``.

    Returns the result of ``to_yaml`` applied to the document.

    Raises:
        ValueError: if ``avatarization``, ``time_series``, ``avatarization_open_dp`` and
            ``avatarization_fast_dp`` are all falsy (``None`` or empty).
    """
    if (
        not avatarization
        and not time_series
        and not avatarization_open_dp
        and not avatarization_fast_dp
    ):
        raise ValueError(
            "Expected at least one of avatarization, avatarization_open_dp, "
            "avatarization_fast_dp, or time_series"
        )

    spec = ParametersSpec(
        seed=seed,
        schema=schema_name,
        avatarization=avatarization,
        time_series=time_series,
        avatarization_open_dp=avatarization_open_dp,
        avatarization_fast_dp=avatarization_fast_dp,
        results=results,
    )
    params = Parameters(
        kind=ModelKind.AVATARIZATION_PARAMETERS,
        metadata=metadata,
        spec=spec,
    )
    return to_yaml(params)


def get_privacy_metrics_parameters(
    metadata: Metadata,
    schema_name: str,
    privacy_metrics: dict[str, PrivacyMetricsParameters] | None = None,
    time_series: dict[str, TimeSeriesParameters] | None = None,
    seed: int | None = None,
    avatarization_ref: str | None = None,
    results: Results | None = None,
) -> str:
    """Build a ``ModelKind.PRIVACY_METRICS_PARAMETERS`` document and serialize it.

    ``schema_name`` is stored as ``ParametersSpec.schema``. No argument is validated here.

    Returns the result of ``to_yaml`` applied to the document.
    """
    spec = ParametersSpec(
        seed=seed,
        schema=schema_name,
        privacy_metrics=privacy_metrics,
        time_series=time_series,
        avatarization_ref=avatarization_ref,
        results=results,
    )
    params = Parameters(
        kind=ModelKind.PRIVACY_METRICS_PARAMETERS,
        metadata=metadata,
        spec=spec,
    )
    return to_yaml(params)


def get_signal_metrics_parameters(
    metadata: Metadata,
    schema_name: str,
    signal_metrics: dict[str, SignalMetricsParameters] | None = None,
    time_series: dict[str, TimeSeriesParameters] | None = None,
    seed: int | None = None,
    avatarization_ref: str | None = None,
    results: Results | None = None,
) -> str:
    """Build a ``ModelKind.SIGNAL_METRICS_PARAMETERS`` document and serialize it.

    ``schema_name`` is stored as ``ParametersSpec.schema``. No argument is validated here.

    Returns the result of ``to_yaml`` applied to the document.
    """
    spec = ParametersSpec(
        seed=seed,
        schema=schema_name,
        signal_metrics=signal_metrics,
        time_series=time_series,
        avatarization_ref=avatarization_ref,
        results=results,
    )
    params = Parameters(
        kind=ModelKind.SIGNAL_METRICS_PARAMETERS,
        metadata=metadata,
        spec=spec,
    )
    return to_yaml(params)