Source code for avatar_yaml.models.parameters

from dataclasses import dataclass
from enum import Enum, StrEnum
from typing import Any, List, Literal, Optional

from avatar_yaml.models.common import Metadata, ModelKind
from avatar_yaml.yaml_utils import to_yaml


@dataclass(frozen=False)
class AvatarizationProcessorParameters:
    """Common parent of every avatarization processor parameter set.

    The class declares no fields of its own. Derive from it to describe the
    parameters of one specific server-side processor that runs within the
    avatarization pipeline.
    """
@dataclass(frozen=False)
class InterRecordRangeDifferenceParameters(AvatarizationProcessorParameters):
    """Settings for the inter-record range difference processor.

    The processor converts start/end column pairs into an internal
    representation, which can lead to better semantic avatarization. The
    conversion is transparent to the user: input and output end up with the
    same column structure. Records are automatically sorted by
    ``target_start_variable`` for processing.

    Arguments
    ---------
        id_variable: name of the identifier column (exact role is decided by
            the server-side processor -- TODO confirm)
        target_start_variable: name of the start column of the pair
        target_end_variable: name of the end column of the pair

    Raises
    ------
        ValueError: when ``target_start_variable`` and
            ``target_end_variable`` are equal.
    """

    id_variable: str
    target_start_variable: str
    target_end_variable: str

    def __post_init__(self):
        # A start/end pair needs two distinct columns.
        start = self.target_start_variable
        end = self.target_end_variable
        if start == end:
            message = (
                "target_start_variable and target_end_variable must be different, "
                f"got '{start}' for both"
            )
            raise ValueError(message)
@dataclass(frozen=False)
class RelativeDifferenceParameters(AvatarizationProcessorParameters):
    """Settings for the relative difference processor.

    The processor rewrites a numeric variable as a difference relative to the
    sum of other variables, which can help retain the mathematical relation
    between correlated variables. The rewrite is transparent to the user:
    input and output end up with the same column structure.

    Arguments
    ---------
        target: variable to transform
        references: the variables of reference

    Keyword Arguments
    -----------------
        scaling_unit: divide the difference by this factor to handle unit
            variation. E.g. with ``scaling_unit=1000`` a difference in meters
            is expressed in kilometers.

    Raises
    ------
        ValueError: when ``scaling_unit`` is provided and is not strictly
            positive.
    """

    target: str
    references: List[str]
    scaling_unit: Optional[int] = None

    def __post_init__(self):
        unit = self.scaling_unit
        # ``None`` means "no scaling requested" and is always accepted.
        if unit is None:
            return
        if unit <= 0:
            raise ValueError(
                f"scaling_unit must be strictly positive when provided, got {unit}"
            )
[docs] class AugmentationStrategy(StrEnum): minority = "minority" not_majority = "not_majority"
@dataclass(frozen=False)
class DataAugmentationParameters:
    """Data augmentation settings referenced by the avatarization parameter sets.

    ``augmentation_strategy`` and ``target_column`` have no default and must
    be supplied (``target_column`` may be given as ``None``);
    ``should_anonymize_original_table`` defaults to ``True``.
    """

    # Accepts a float, a named strategy, or a mapping of str to float.
    augmentation_strategy: float | AugmentationStrategy | dict[str, float]
    target_column: str | None
    should_anonymize_original_table: bool | None = True


@dataclass(frozen=False)
class AvatarizationParameters:
    """Parameter set for avatarization.

    Only ``k`` is required; every other field is optional and defaults to
    ``None``.
    """

    k: int
    ncp: int | None = None
    use_categorical_reduction: bool | None = None
    column_weights: dict[str, float] | None = None
    exclude_variables: dict[str, Any] | None = None
    imputation: dict[str, Any] | None = None
    data_augmentation: DataAugmentationParameters | None = None
    avatarization_processors_parameters: list[AvatarizationProcessorParameters] | None = None


@dataclass(frozen=False)
class TimeSeriesParameters:
    """Parameter set for time series; both fields are optional free-form mappings."""

    projection: dict[str, Any] | None = None
    alignment: dict[str, Any] | None = None


@dataclass(frozen=False)
class AvatarizationOpenDPParameters:
    """Parameter set for the OpenDP flavour of avatarization.

    All fields are optional. ``preprocess_budget_ratio`` defaults to ``0.3``;
    the remaining fields default to ``None``.
    """

    epsilon: float | None = None
    preprocess_budget_ratio: float | None = 0.3
    ncp: int | None = None
    use_categorical_reduction: bool | None = None
    column_weights: dict[str, float] | None = None
    exclude_variables: dict[str, Any] | None = None
    imputation: dict[str, Any] | None = None
    data_augmentation: DataAugmentationParameters | None = None
    avatarization_processors_parameters: list[AvatarizationProcessorParameters] | None = None


# Keep old name for backward compatibility
AvatarizationDPParameters = AvatarizationOpenDPParameters


@dataclass(frozen=False)
class AvatarizationFastDPParameters:
    """Parameter set for the fast-DP flavour of avatarization.

    All fields are optional. Non-``None`` defaults: ``mechanism="gaussian"``,
    ``gmm_n_components=5``, ``histogram_n_bins=10`` and
    ``bounds_percentile=5.0``.
    """

    epsilon: float | None = None
    mechanism: Literal["gaussian", "gmm", "histogram"] | None = "gaussian"
    # NOTE(review): the two settings below presumably only matter for the
    # "gmm" and "histogram" mechanisms respectively -- confirm with the server.
    gmm_n_components: int | None = 5
    histogram_n_bins: int | None = 10
    bounds_percentile: float | None = 5.0
    ncp: int | None = None
    use_categorical_reduction: bool | None = None
    column_weights: dict[str, float] | None = None
    exclude_variables: dict[str, Any] | None = None
    imputation: dict[str, Any] | None = None
    data_augmentation: DataAugmentationParameters | None = None
    avatarization_processors_parameters: list[AvatarizationProcessorParameters] | None = None
class AlignmentMethod(str, Enum):
    """Names of the available alignment methods.

    Members subclass ``str``, so each one compares equal to its raw value.
    """

    SPECIFIED = "specified"
    MAX = "max"
    MIN = "min"
    MEAN = "mean"
class ProjectionType(str, Enum):
    """Names of the available projection types.

    Members subclass ``str``, so each one compares equal to its raw value.
    """

    FPCA = "fpca"
    FLATTEN = "flatten"
class ImputeMethod(str, Enum):
    """Names of the available imputation methods.

    Members subclass ``str``, so each one compares equal to its raw value.
    """

    KNN = "knn"
    MODE = "mode"
    MEDIAN = "median"
    MEAN = "mean"
    FAST_KNN = "fast_knn"
class ExcludeVariablesMethod(str, Enum):
    """How an excluded column is linked back to the records."""

    #: SENSITIVE. The excluded column will be linked to the original row
    #: order. This is a violation of privacy.
    ROW_ORDER = "row_order"

    #: The excluded column will be linked by individual similarity.
    COORDINATE_SIMILARITY = "coordinate_similarity"
class ReportType(str, Enum):
    """Report type identifiers accepted by :func:`get_report_parameters`."""

    BASIC = "basic"
    PIA = "pia"


class ReportLanguage(str, Enum):
    """Report language identifiers accepted by :func:`get_report_parameters`."""

    EN = "en"
    FR = "fr"


class OutputFormat(str, Enum):
    """Report output format identifiers accepted by :func:`get_report_parameters`."""

    PDF = "pdf"
    DOCX = "docx"


@dataclass(frozen=False)
class SignalMetricsParameters:
    """Parameter set for signal metrics; every field is optional (``None``)."""

    ncp: int | None = None
    use_categorical_reduction: bool | None = None
    imputation: dict[str, Any] | None = None
    column_weights: dict[str, float] | None = None
    exclude_variables: dict[str, Any] | None = None


@dataclass(frozen=False)
class PrivacyMetricsParameters:
    """Parameter set for privacy metrics; every field is optional (``None``)."""

    ncp: int | None = None
    use_categorical_reduction: bool | None = None
    known_variables: list[str] | None = None
    target: str | None = None
    quantile_threshold: int | None = None
    imputation: dict[str, Any] | None = None
    exclude_variables: dict[str, Any] | None = None
    column_weights: dict[str, float] | None = None


@dataclass(frozen=True)
class Results:
    """Immutable group of result-output settings.

    Every field is optional and defaults to ``None``. How the values are
    interpreted is up to the consumer of the generated YAML and is not
    visible in this module.
    """

    volume: str | None = None
    path: str | None = None
    format: str | None = None
    name_template: str | None = None
    max_distribution_plots: int | None = None


@dataclass(frozen=True)
class ReportParametersSpec:
    """Immutable ``spec`` section of a report document.

    The three string fields hold enum *values* (not enum members) and default
    to ``"basic"``, ``"pdf"`` and ``"en"`` respectively.
    """

    report_type: str = ReportType.BASIC.value
    output_format: str = OutputFormat.PDF.value
    language: str = ReportLanguage.EN.value
    results: Results | None = None


@dataclass(frozen=True)
class Report:
    """Immutable top-level report document: kind, metadata and spec."""

    kind: ModelKind
    metadata: Metadata
    spec: ReportParametersSpec


def get_report_parameters(
    metadata: Metadata,
    report_type: ReportType = ReportType.BASIC,
    results: Results | None = None,
    output_format: OutputFormat = OutputFormat.PDF,
    language: ReportLanguage = ReportLanguage.EN,
) -> str:
    """Build a report document and serialize it with ``to_yaml``.

    Arguments
    ---------
        metadata: metadata stored on the document

    Keyword Arguments
    -----------------
        report_type: report type; its ``.value`` is stored in the spec
        results: optional result-output settings
        output_format: output format; its ``.value`` is stored in the spec
        language: report language; its ``.value`` is stored in the spec

    Returns
    -------
        The result of ``to_yaml`` applied to the assembled :class:`Report`.
    """
    return to_yaml(
        Report(
            kind=ModelKind.REPORT,
            metadata=metadata,
            spec=ReportParametersSpec(
                report_type=report_type.value,
                output_format=output_format.value,
                language=language.value,
                results=results,
            ),
        )
    )


@dataclass(frozen=True)
class ParametersSpec:
    """Immutable ``spec`` section shared by all parameter documents.

    Only ``schema`` is required; every other field defaults to ``None``.
    """

    schema: str
    avatarization: dict[str, AvatarizationParameters] | None = None
    avatarization_open_dp: dict[str, AvatarizationOpenDPParameters] | None = None
    avatarization_fast_dp: dict[str, AvatarizationFastDPParameters] | None = None
    avatarization_ref: str | None = None
    time_series: dict[str, TimeSeriesParameters] | None = None
    time_series_ref: str | None = None
    privacy_metrics: dict[str, PrivacyMetricsParameters] | None = None
    signal_metrics: dict[str, SignalMetricsParameters] | None = None
    results: Results | None = None
    seed: int | None = None


@dataclass(frozen=True)
class Parameters:
    """Immutable top-level parameters document: kind, metadata and spec."""

    kind: ModelKind
    metadata: Metadata
    spec: ParametersSpec


def get_avatarization_parameters(
    metadata: Metadata,
    schema_name: str,
    avatarization: dict[str, AvatarizationParameters] | None = None,
    time_series: dict[str, TimeSeriesParameters] | None = None,
    avatarization_open_dp: dict[str, AvatarizationOpenDPParameters] | None = None,
    avatarization_fast_dp: dict[str, AvatarizationFastDPParameters] | None = None,
    seed: int | None = None,
    # The default is one Results instance created at definition time and
    # shared by all calls; Results is frozen, so sharing it is harmless.
    results=Results(volume="local-temp-results"),
) -> str:
    """Build an avatarization parameters document and serialize it with ``to_yaml``.

    Arguments
    ---------
        metadata: metadata stored on the document
        schema_name: value stored in the spec's ``schema`` field

    Keyword Arguments
    -----------------
        avatarization: avatarization parameter sets, keyed by string
        time_series: time series parameter sets, keyed by string
        avatarization_open_dp: OpenDP parameter sets, keyed by string
        avatarization_fast_dp: fast-DP parameter sets, keyed by string
        seed: optional seed stored in the spec
        results: result-output settings; defaults to
            ``Results(volume="local-temp-results")``

    Returns
    -------
        The result of ``to_yaml`` applied to the assembled :class:`Parameters`.

    Raises
    ------
        ValueError: when all four parameter mappings are missing or empty.
    """
    has_any_section = (
        avatarization or time_series or avatarization_open_dp or avatarization_fast_dp
    )
    if not has_any_section:
        raise ValueError(
            "Expected at least one of avatarization, avatarization_open_dp, "
            "avatarization_fast_dp, or time_series"
        )

    return to_yaml(
        Parameters(
            kind=ModelKind.AVATARIZATION_PARAMETERS,
            metadata=metadata,
            spec=ParametersSpec(
                schema=schema_name,
                avatarization=avatarization,
                avatarization_open_dp=avatarization_open_dp,
                avatarization_fast_dp=avatarization_fast_dp,
                time_series=time_series,
                results=results,
                seed=seed,
            ),
        )
    )


def get_privacy_metrics_parameters(
    metadata: Metadata,
    schema_name: str,
    privacy_metrics: dict[str, PrivacyMetricsParameters] | None = None,
    time_series: dict[str, TimeSeriesParameters] | None = None,
    seed: int | None = None,
    avatarization_ref: str | None = None,
    results: Results | None = None,
) -> str:
    """Build a privacy metrics parameters document and serialize it with ``to_yaml``.

    Arguments
    ---------
        metadata: metadata stored on the document
        schema_name: value stored in the spec's ``schema`` field

    Keyword Arguments
    -----------------
        privacy_metrics: privacy metrics parameter sets, keyed by string
        time_series: time series parameter sets, keyed by string
        seed: optional seed stored in the spec
        avatarization_ref: optional value for the spec's ``avatarization_ref``
        results: optional result-output settings

    Returns
    -------
        The result of ``to_yaml`` applied to the assembled :class:`Parameters`.
    """
    return to_yaml(
        Parameters(
            kind=ModelKind.PRIVACY_METRICS_PARAMETERS,
            metadata=metadata,
            spec=ParametersSpec(
                schema=schema_name,
                avatarization_ref=avatarization_ref,
                time_series=time_series,
                privacy_metrics=privacy_metrics,
                results=results,
                seed=seed,
            ),
        )
    )


def get_signal_metrics_parameters(
    metadata: Metadata,
    schema_name: str,
    signal_metrics: dict[str, SignalMetricsParameters] | None = None,
    time_series: dict[str, TimeSeriesParameters] | None = None,
    seed: int | None = None,
    avatarization_ref: str | None = None,
    results: Results | None = None,
) -> str:
    """Build a signal metrics parameters document and serialize it with ``to_yaml``.

    Arguments
    ---------
        metadata: metadata stored on the document
        schema_name: value stored in the spec's ``schema`` field

    Keyword Arguments
    -----------------
        signal_metrics: signal metrics parameter sets, keyed by string
        time_series: time series parameter sets, keyed by string
        seed: optional seed stored in the spec
        avatarization_ref: optional value for the spec's ``avatarization_ref``
        results: optional result-output settings

    Returns
    -------
        The result of ``to_yaml`` applied to the assembled :class:`Parameters`.
    """
    return to_yaml(
        Parameters(
            kind=ModelKind.SIGNAL_METRICS_PARAMETERS,
            metadata=metadata,
            spec=ParametersSpec(
                schema=schema_name,
                avatarization_ref=avatarization_ref,
                time_series=time_series,
                signal_metrics=signal_metrics,
                results=results,
                seed=seed,
            ),
        )
    )