from dataclasses import dataclass
from enum import Enum, StrEnum
from typing import Any, List, Literal, Optional
from avatar_yaml.models.common import Metadata, ModelKind
from avatar_yaml.yaml_utils import to_yaml
@dataclass
class AvatarizationProcessorParameters:
    """Common parent of every avatarization processor parameter set.

    The class declares no fields of its own. Derive from it to describe the
    parameters of one specific server-side processor that runs within the
    avatarization pipeline.
    """
@dataclass
class InterRecordRangeDifferenceParameters(AvatarizationProcessorParameters):
    """Settings for the inter-record range difference processor.

    The processor converts a pair of start/end columns into an internal
    representation, which can lead to better semantic avatarization. The
    conversion is transparent to the user: input and output end up with the
    same column structure. Records are automatically sorted by
    ``target_start_variable`` for processing.

    Raises
    ------
    ValueError
        If ``target_start_variable`` and ``target_end_variable`` are equal.
    """

    id_variable: str
    target_start_variable: str
    target_end_variable: str

    def __post_init__(self):
        """Reject a configuration whose start and end variables are identical."""
        start = self.target_start_variable
        if start != self.target_end_variable:
            return
        raise ValueError(
            "target_start_variable and target_end_variable must be different, "
            f"got '{start}' for both"
        )
@dataclass
class RelativeDifferenceParameters(AvatarizationProcessorParameters):
    """Settings for the relative difference processor.

    The processor rewrites a numeric variable as a difference relative to the
    sum of other variables, which can help retain the mathematical relation
    between correlated variables. The rewrite is transparent to the user:
    input and output end up with the same column structure.

    Arguments
    ---------
        target:
            variable to transform
        references:
            the variables of reference

    Keyword Arguments
    -----------------
        scaling_unit:
            divide difference by factor to handle unit variation.
            E.g. if scaling_unit=1000, a difference in meters will be
            expressed in kilometers.

    Raises
    ------
    ValueError
        If ``scaling_unit`` is provided and is not strictly positive.
    """

    target: str
    references: List[str]
    scaling_unit: Optional[int] = None

    def __post_init__(self):
        """Validate ``scaling_unit``; ``None`` means no scaling was requested."""
        unit = self.scaling_unit
        if unit is None:
            return
        if unit <= 0:
            raise ValueError(
                f"scaling_unit must be strictly positive when provided, got {unit}"
            )
class AugmentationStrategy(StrEnum):
minority = "minority"
not_majority = "not_majority"
@dataclass
class DataAugmentationParameters:
    """Settings controlling data augmentation.

    ``augmentation_strategy`` and ``target_column`` must always be supplied
    (the latter may be ``None``); ``should_anonymize_original_table`` defaults
    to ``True``.
    """

    # Either a number, a named AugmentationStrategy, or a str -> float mapping.
    augmentation_strategy: float | AugmentationStrategy | dict[str, float]
    # Required positional field even though None is an accepted value.
    target_column: str | None
    should_anonymize_original_table: bool | None = True
@dataclass
class AvatarizationParameters:
    """Parameter set stored under ``avatarization`` in ``ParametersSpec``.

    Only ``k`` is required; every other field is optional and defaults to
    ``None``.
    """

    k: int  # NOTE(review): presumably the neighbour count used by the method - confirm
    ncp: int | None = None
    use_categorical_reduction: bool | None = None
    column_weights: dict[str, float] | None = None
    # Free-form mappings; their schema is not defined in this module.
    exclude_variables: dict[str, Any] | None = None
    imputation: dict[str, Any] | None = None
    data_augmentation: DataAugmentationParameters | None = None
    avatarization_processors_parameters: list[AvatarizationProcessorParameters] | None = None
@dataclass(frozen=False)
class TimeSeriesParameters:
projection: dict[str, Any] | None = None
alignment: dict[str, Any] | None = None
@dataclass
class AvatarizationOpenDPParameters:
    """Parameter set stored under ``avatarization_open_dp`` in ``ParametersSpec``.

    Every field is optional. ``preprocess_budget_ratio`` defaults to ``0.3``;
    all other fields default to ``None``.
    """

    epsilon: float | None = None  # NOTE(review): presumably the privacy budget - confirm
    preprocess_budget_ratio: float | None = 0.3
    ncp: int | None = None
    use_categorical_reduction: bool | None = None
    column_weights: dict[str, float] | None = None
    # Free-form mappings; their schema is not defined in this module.
    exclude_variables: dict[str, Any] | None = None
    imputation: dict[str, Any] | None = None
    data_augmentation: DataAugmentationParameters | None = None
    avatarization_processors_parameters: list[AvatarizationProcessorParameters] | None = None


# Former name of the class, kept as an alias so existing imports keep working.
AvatarizationDPParameters = AvatarizationOpenDPParameters
@dataclass
class AvatarizationFastDPParameters:
    """Parameter set stored under ``avatarization_fast_dp`` in ``ParametersSpec``.

    Every field is optional. Defaults: ``mechanism="gaussian"``,
    ``gmm_n_components=5``, ``histogram_n_bins=10``, ``bounds_percentile=5.0``;
    all other fields default to ``None``.
    """

    epsilon: float | None = None  # NOTE(review): presumably the privacy budget - confirm
    mechanism: Literal["gaussian", "gmm", "histogram"] | None = "gaussian"
    # NOTE(review): the two settings below look specific to the "gmm" and
    # "histogram" mechanisms respectively - confirm against the server.
    gmm_n_components: int | None = 5
    histogram_n_bins: int | None = 10
    bounds_percentile: float | None = 5.0
    ncp: int | None = None
    use_categorical_reduction: bool | None = None
    column_weights: dict[str, float] | None = None
    # Free-form mappings; their schema is not defined in this module.
    exclude_variables: dict[str, Any] | None = None
    imputation: dict[str, Any] | None = None
    data_augmentation: DataAugmentationParameters | None = None
    avatarization_processors_parameters: list[AvatarizationProcessorParameters] | None = None
class AlignmentMethod(str, Enum):
    """Enumeration of alignment method names.

    Members also subclass ``str``, so each one compares equal to its plain
    string value.
    """

    SPECIFIED = "specified"
    MAX = "max"
    MIN = "min"
    MEAN = "mean"
class ProjectionType(str, Enum):
    """Enumeration of projection type names.

    Members also subclass ``str``, so each one compares equal to its plain
    string value.
    """

    FPCA = "fpca"
    FLATTEN = "flatten"
class ImputeMethod(str, Enum):
    """Enumeration of imputation method names.

    Members also subclass ``str``, so each one compares equal to its plain
    string value.
    """

    KNN = "knn"
    MODE = "mode"
    MEDIAN = "median"
    MEAN = "mean"
    FAST_KNN = "fast_knn"
class ExcludeVariablesMethod(str, Enum):
    """The method used to exclude a column."""

    ROW_ORDER = "row_order"
    """SENSITIVE: the excluded column will be linked to the original row order.

    This is a violation of privacy."""

    COORDINATE_SIMILARITY = "coordinate_similarity"
    """The excluded column will be linked by individual similarity."""
class ReportType(str, Enum):
    """Kinds of report that can be requested; ``BASIC`` is the default used by
    ``get_report_parameters``."""

    BASIC = "basic"
    PIA = "pia"
class ReportLanguage(str, Enum):
    """Language codes available for a report; ``EN`` is the default used by
    ``get_report_parameters``."""

    EN = "en"
    FR = "fr"
class OutputFormat(str, Enum):
    """File formats available for a report; ``PDF`` is the default used by
    ``get_report_parameters``."""

    PDF = "pdf"
    DOCX = "docx"
@dataclass(frozen=False)
class SignalMetricsParameters:
ncp: int | None = None
use_categorical_reduction: bool | None = None
imputation: dict[str, Any] | None = None
column_weights: dict[str, float] | None = None
exclude_variables: dict[str, Any] | None = None
@dataclass(frozen=False)
class PrivacyMetricsParameters:
ncp: int | None = None
use_categorical_reduction: bool | None = None
known_variables: list[str] | None = None
target: str | None = None
quantile_threshold: int | None = None
imputation: dict[str, Any] | None = None
exclude_variables: dict[str, Any] | None = None
column_weights: dict[str, float] | None = None
@dataclass(frozen=True)
class Results:
volume: str | None = None
path: str | None = None
format: str | None = None
name_template: str | None = None
max_distribution_plots: int | None = None
@dataclass(frozen=True)
class ReportParametersSpec:
    """Immutable ``spec`` section of a ``Report`` document.

    The enum-backed options are stored as plain strings. Their defaults are
    the values of ``ReportType.BASIC``, ``OutputFormat.PDF`` and
    ``ReportLanguage.EN``.
    """

    report_type: str = ReportType.BASIC.value
    output_format: str = OutputFormat.PDF.value
    language: str = ReportLanguage.EN.value
    results: Results | None = None
@dataclass(frozen=True)
class Report:
    """Immutable top-level report document: kind, metadata and spec.

    ``get_report_parameters`` builds an instance of this class and passes it
    to ``to_yaml``.
    """

    kind: ModelKind
    metadata: Metadata
    spec: ReportParametersSpec
def get_report_parameters(
    metadata: Metadata,
    report_type: ReportType = ReportType.BASIC,
    results: Results | None = None,
    output_format: OutputFormat = OutputFormat.PDF,
    language: ReportLanguage = ReportLanguage.EN,
) -> str:
    """Build a ``Report`` document and serialize it with ``to_yaml``.

    Arguments
    ---------
        metadata:
            metadata attached to the document

    Keyword Arguments
    -----------------
        report_type:
            kind of report; its ``.value`` is stored in the spec
        results:
            optional results settings, stored unchanged
        output_format:
            file format; its ``.value`` is stored in the spec
        language:
            report language; its ``.value`` is stored in the spec

    Returns
    -------
        The output of ``to_yaml`` for a document of kind ``ModelKind.REPORT``.
    """
    return to_yaml(
        Report(
            kind=ModelKind.REPORT,
            metadata=metadata,
            # The spec holds the enum values (plain strings), not the members.
            spec=ReportParametersSpec(
                report_type=report_type.value,
                output_format=output_format.value,
                language=language.value,
                results=results,
            ),
        )
    )
@dataclass(frozen=True)
class ParametersSpec:
    """Immutable ``spec`` section of a ``Parameters`` document.

    ``schema`` is the only required field; all others default to ``None``.
    The same spec type is used by the avatarization, privacy-metrics and
    signal-metrics builders in this module, each filling a different subset
    of fields.
    """

    schema: str
    # Per-key parameter sets for the different avatarization variants.
    avatarization: dict[str, AvatarizationParameters] | None = None
    avatarization_open_dp: dict[str, AvatarizationOpenDPParameters] | None = None
    avatarization_fast_dp: dict[str, AvatarizationFastDPParameters] | None = None
    avatarization_ref: str | None = None
    time_series: dict[str, TimeSeriesParameters] | None = None
    time_series_ref: str | None = None
    privacy_metrics: dict[str, PrivacyMetricsParameters] | None = None
    signal_metrics: dict[str, SignalMetricsParameters] | None = None
    results: Results | None = None
    seed: int | None = None
@dataclass(frozen=True)
class Parameters:
    """Immutable top-level parameters document: kind, metadata and spec.

    The ``get_*_parameters`` builders in this module create an instance of
    this class and pass it to ``to_yaml``.
    """

    kind: ModelKind
    metadata: Metadata
    spec: ParametersSpec
def get_avatarization_parameters(
    metadata: Metadata,
    schema_name: str,
    avatarization: dict[str, AvatarizationParameters] | None = None,
    time_series: dict[str, TimeSeriesParameters] | None = None,
    avatarization_open_dp: dict[str, AvatarizationOpenDPParameters] | None = None,
    avatarization_fast_dp: dict[str, AvatarizationFastDPParameters] | None = None,
    seed: int | None = None,
    results: Results | None = Results(volume="local-temp-results"),
) -> str:
    """Build an avatarization ``Parameters`` document and serialize it with ``to_yaml``.

    Arguments
    ---------
        metadata:
            metadata attached to the document
        schema_name:
            stored as ``schema`` in the spec

    Keyword Arguments
    -----------------
        avatarization:
            mapping of str to ``AvatarizationParameters``
        time_series:
            mapping of str to ``TimeSeriesParameters``
        avatarization_open_dp:
            mapping of str to ``AvatarizationOpenDPParameters``
        avatarization_fast_dp:
            mapping of str to ``AvatarizationFastDPParameters``
        seed:
            optional seed, stored unchanged
        results:
            results settings; defaults to a ``Results`` pointing at the
            ``"local-temp-results"`` volume

    Returns
    -------
        The output of ``to_yaml`` for a document of kind
        ``ModelKind.AVATARIZATION_PARAMETERS``.

    Raises
    ------
    ValueError
        If ``avatarization``, ``time_series``, ``avatarization_open_dp`` and
        ``avatarization_fast_dp`` are all missing or empty.
    """
    # An empty mapping counts as "not provided", exactly like None.
    if not (avatarization or time_series or avatarization_open_dp or avatarization_fast_dp):
        raise ValueError(
            "Expected at least one of avatarization, avatarization_open_dp, "
            "avatarization_fast_dp, or time_series"
        )

    spec = ParametersSpec(
        seed=seed,
        schema=schema_name,
        avatarization=avatarization,
        time_series=time_series,
        avatarization_open_dp=avatarization_open_dp,
        avatarization_fast_dp=avatarization_fast_dp,
        results=results,
    )
    params = Parameters(
        kind=ModelKind.AVATARIZATION_PARAMETERS,
        metadata=metadata,
        spec=spec,
    )
    return to_yaml(params)
def get_privacy_metrics_parameters(
    metadata: Metadata,
    schema_name: str,
    privacy_metrics: dict[str, PrivacyMetricsParameters] | None = None,
    time_series: dict[str, TimeSeriesParameters] | None = None,
    seed: int | None = None,
    avatarization_ref: str | None = None,
    results: Results | None = None,
) -> str:
    """Build a privacy-metrics ``Parameters`` document and serialize it with ``to_yaml``.

    Arguments
    ---------
        metadata:
            metadata attached to the document
        schema_name:
            stored as ``schema`` in the spec

    Keyword Arguments
    -----------------
        privacy_metrics:
            mapping of str to ``PrivacyMetricsParameters``
        time_series:
            mapping of str to ``TimeSeriesParameters``
        seed:
            optional seed, stored unchanged
        avatarization_ref:
            optional reference string, stored unchanged
        results:
            optional results settings, stored unchanged

    Returns
    -------
        The output of ``to_yaml`` for a document of kind
        ``ModelKind.PRIVACY_METRICS_PARAMETERS``.
    """
    return to_yaml(
        Parameters(
            kind=ModelKind.PRIVACY_METRICS_PARAMETERS,
            metadata=metadata,
            spec=ParametersSpec(
                schema=schema_name,
                seed=seed,
                privacy_metrics=privacy_metrics,
                time_series=time_series,
                avatarization_ref=avatarization_ref,
                results=results,
            ),
        )
    )
def get_signal_metrics_parameters(
    metadata: Metadata,
    schema_name: str,
    signal_metrics: dict[str, SignalMetricsParameters] | None = None,
    time_series: dict[str, TimeSeriesParameters] | None = None,
    seed: int | None = None,
    avatarization_ref: str | None = None,
    results: Results | None = None,
) -> str:
    """Build a signal-metrics ``Parameters`` document and serialize it with ``to_yaml``.

    Arguments
    ---------
        metadata:
            metadata attached to the document
        schema_name:
            stored as ``schema`` in the spec

    Keyword Arguments
    -----------------
        signal_metrics:
            mapping of str to ``SignalMetricsParameters``
        time_series:
            mapping of str to ``TimeSeriesParameters``
        seed:
            optional seed, stored unchanged
        avatarization_ref:
            optional reference string, stored unchanged
        results:
            optional results settings, stored unchanged

    Returns
    -------
        The output of ``to_yaml`` for a document of kind
        ``ModelKind.SIGNAL_METRICS_PARAMETERS``.
    """
    document = Parameters(
        kind=ModelKind.SIGNAL_METRICS_PARAMETERS,
        metadata=metadata,
        spec=ParametersSpec(
            schema=schema_name,
            seed=seed,
            signal_metrics=signal_metrics,
            time_series=time_series,
            avatarization_ref=avatarization_ref,
            results=results,
        ),
    )
    return to_yaml(document)