from dataclasses import dataclass
from enum import StrEnum
from typing import Literal, Union
from avatar_yaml.models.common import Metadata, ModelKind
[docs]
class ColumnType(StrEnum):
INT = "int"
BOOL = "bool"
CATEGORY = "category"
NUMERIC = "float"
DATETIME = "datetime"
DATETIME_TZ = "datetime_tz"
@dataclass(frozen=True)
class TableDataInfo:
volume: str | None = None
file: str | None = None
auto: bool | None = None
[docs]
class PiiType(StrEnum):
"""Category of personally identifiable information (PII) to generate.
Used with :class:`FakeDataStrategy` to select the kind of realistic fake
data to produce. The generated values are locale-aware (default ``en_US``).
"""
EMAIL = "EMAIL"
"""A syntactically valid email address (e.g. ``john.doe@example.com``)."""
FIRST_NAME = "FIRST_NAME"
"""A given/first name (e.g. ``Alice``)."""
LAST_NAME = "LAST_NAME"
"""A family/last name (e.g. ``Smith``)."""
FULL_NAME = "FULL_NAME"
"""A full personal name combining first and last name (e.g. ``Alice Smith``)."""
PHONE = "PHONE"
"""A phone number in a locale-appropriate format (e.g. ``+1-800-555-0100``)."""
SSN = "SSN"
"""A Social Security Number formatted string (e.g. ``123-45-6789``).
Use only when the source data contains SSN-like identifiers."""
ADDRESS = "ADDRESS"
"""A multi-line postal address (e.g. ``123 Main St, Springfield, IL 62701``)."""
FREE_TEXT = "FREE_TEXT"
"""A paragraph of random lorem-ipsum-style text. Use for unstructured text
columns (notes, comments, descriptions) that must be replaced wholesale."""
[docs]
class SpecificIdLetterCase(StrEnum):
"""Controls the case of letters produced by the ``?`` placeholder in
:class:`SpecificIdStrategy` patterns.
"""
UPPER = "upper"
"""All generated letters are upper-case (e.g. ``A``, ``B``, ``Z``)."""
LOWER = "lower"
"""All generated letters are lower-case (e.g. ``a``, ``b``, ``z``)."""
BOTH = "both"
"""Letters are drawn from the full mixed-case alphabet (default)."""
[docs]
@dataclass(frozen=True)
class FakeDataStrategy:
    """Pseudonymize a column by swapping PII values for realistic fake ones.

    Replacements are locale-aware and structurally valid for the requested
    type (a fake email still looks like a real email address), which keeps
    the pseudonymized data natural-looking for downstream consumers.

    Example Python::

        FakeDataStrategy(pii_type=PiiType.EMAIL)

    Attributes:
        pii_type: Which category of PII to generate; :class:`PiiType` lists
            every supported category.
        consistent: ``True`` by default, meaning a given source value is
            replaced by one and the same fake value everywhere it occurs in a
            pipeline run (``alice@corp.com`` on three rows yields the same
            fake email three times). Pass ``False`` to draw a fresh,
            independent value for every row, for cases where uniqueness
            matters more than cross-row consistency.
        high_variability: ``False`` by default, meaning each row samples from
            a pool of pre-generated fake values (~70× faster); since the pool
            is shared, distinct source values may collide on the same fake
            value. Pass ``True`` for fully independent per-row generation,
            which is slower but collision-free.
    """

    pii_type: PiiType
    consistent: bool = True
    high_variability: bool = False
    # Discriminator of this strategy; same string as its ``_STRATEGY_KIND_MAP`` key.
    kind: Literal["FAKER"] = "FAKER"
[docs]
@dataclass(frozen=True)
class HashSha256Strategy:
    """Pseudonymize a column by substituting the SHA-256 hex digest of each value.

    Hashing is one-way and deterministic: a given source value always yields
    the same 64-character hex string, across runs and pipeline instances.
    That is why this strategy has no ``consistent`` flag.

    .. note::
        Low-entropy source columns (e.g. integers or short codes) make a
        brute-force dictionary attack on the hash feasible. Prefer
        :class:`Uuid4Strategy` or :class:`FakeDataStrategy` in those cases.

    Example Python::

        HashSha256Strategy()
    """

    # Discriminator of this strategy; same string as its ``_STRATEGY_KIND_MAP`` key.
    kind: Literal["HASH_SHA256"] = "HASH_SHA256"
[docs]
@dataclass(frozen=True)
class Uuid4Strategy:
    """Pseudonymize a column by substituting a random version-4 UUID per unique value.

    A UUID is opaque and globally unique and reveals nothing about the value
    it replaces. It suits primary/foreign keys and any other identifier that
    does not need to look structurally realistic.

    Example Python::

        Uuid4Strategy()
        Uuid4Strategy(consistent=False)  # fresh UUID per row

    Attributes:
        consistent: ``True`` by default, meaning a given source value is
            mapped to one and the same UUID throughout a run, which preserves
            referential integrity across tables. Pass ``False`` to generate
            a new UUID for every row independently.
    """

    consistent: bool = True
    # Discriminator of this strategy; same string as its ``_STRATEGY_KIND_MAP`` key.
    kind: Literal["UUID4"] = "UUID4"
[docs]
@dataclass(frozen=True)
class ConstantStrategy:
    """Pseudonymize a column by overwriting every value with one fixed string.

    The replacement does not depend on the original content: all rows end up
    with the same value. This is the simplest strategy, suited to columns
    that must be fully suppressed or redacted.

    Example Python::

        ConstantStrategy(value="REDACTED")
        ConstantStrategy(value="***")

    Attributes:
        value: String written in place of every value of the column.
    """

    value: str
    # Discriminator of this strategy; same string as its ``_STRATEGY_KIND_MAP`` key.
    kind: Literal["CONSTANT"] = "CONSTANT"
[docs]
@dataclass(frozen=True)
class IntegerStrategy:
    """Pseudonymize a column by assigning a unique integer to each unique value.

    Integers are attributed randomly rather than sequentially, so the
    original sort order is not preserved. Handy for numeric identifiers
    (e.g. customer IDs, account numbers) consumed by code that expects an
    integer type.

    Example Python::

        IntegerStrategy()
        IntegerStrategy(consistent=False)  # independent integer per row

    Attributes:
        consistent: ``True`` by default, meaning a given source value is
            mapped to one and the same integer throughout a run, which
            preserves referential integrity. Pass ``False`` to give every
            row an independent integer.
    """

    consistent: bool = True
    # Discriminator of this strategy; same string as its ``_STRATEGY_KIND_MAP`` key.
    kind: Literal["INTEGER"] = "INTEGER"
[docs]
@dataclass(frozen=True)
class SpecificIdStrategy:
    r"""Generate structured identifiers from a user-defined pattern.

    Patterns combine literal characters with placeholders and optional
    references to other (already-pseudonymized) columns.

    **Placeholders**

    +-------------+----------------------------------------------+
    | ``?``       | Random letter (case set by ``letter_case``)  |
    +-------------+----------------------------------------------+
    | ``#``       | Random digit (0–9)                           |
    +-------------+----------------------------------------------+
    | ``^``       | Random alphanumeric character (a–z, 0–9)     |
    +-------------+----------------------------------------------+
    | ``{{col}}`` | Value of another column (already             |
    |             | pseudonymized) for the same row              |
    +-------------+----------------------------------------------+

    Prefix any placeholder with a single backslash in the pattern to include
    it literally (e.g. ``\#`` outputs ``#``, not a digit; in a Python string
    literal write ``"\\#"``).

    Example Python::

        SpecificIdStrategy(pattern="EMP-####")
        SpecificIdStrategy(pattern="{{department}}-???##", letter_case=SpecificIdLetterCase.UPPER)
        SpecificIdStrategy(pattern="USR-^^^^", consistent=False)

    Attributes:
        pattern: The format string defining the structure of the generated ID.
            Must be a non-empty string (this class itself does not check it).
            Combine literal text, placeholders (``?``, ``#``, ``^``), and
            column references (``{{col_name}}``).
        letter_case: Controls the letter case for ``?`` placeholders.
            Defaults to :attr:`SpecificIdLetterCase.BOTH` (mixed case).
        consistent: When ``True`` (default), the same source value always maps
            to the same generated ID within a run. Set to ``False`` to
            produce a fresh ID for every row.
    """

    # NOTE: the docstring is raw so the backslash examples render exactly as written.
    pattern: str
    letter_case: SpecificIdLetterCase = SpecificIdLetterCase.BOTH
    consistent: bool = True
    # Discriminator of this strategy; same string as its ``_STRATEGY_KIND_MAP`` key.
    kind: Literal["SPECIFIC_ID"] = "SPECIFIC_ID"
# Type alias covering every concrete strategy class defined in this module;
# it is the type of ``ColumnInfo.pseudonymization`` and the return type of
# ``parse_pseudonymization``.
PseudonymizationStrategy = Union[
    FakeDataStrategy,
    HashSha256Strategy,
    Uuid4Strategy,
    ConstantStrategy,
    IntegerStrategy,
    SpecificIdStrategy,
]
# Mapping from kind string to strategy constructor for YAML parsing.
# Each key is the same string as the default of the ``kind`` field on the
# class it maps to, so a new strategy class needs a new entry here (and in
# the ``Literal`` key type) to be parseable.
_STRATEGY_KIND_MAP: dict[
    Literal["FAKER", "HASH_SHA256", "UUID4", "CONSTANT", "INTEGER", "SPECIFIC_ID"],
    type[PseudonymizationStrategy],
] = {
    "FAKER": FakeDataStrategy,
    "HASH_SHA256": HashSha256Strategy,
    "UUID4": Uuid4Strategy,
    "CONSTANT": ConstantStrategy,
    "INTEGER": IntegerStrategy,
    "SPECIFIC_ID": SpecificIdStrategy,
}
def parse_pseudonymization(raw: dict) -> PseudonymizationStrategy:
    """Build the strategy object described by a YAML-loaded mapping.

    Args:
        raw: Mapping holding a ``"kind"`` discriminator plus the keyword
            arguments of the matching strategy class. It is not modified.

    Returns:
        An instance of the class registered for ``raw["kind"]`` in
        ``_STRATEGY_KIND_MAP``.

    Raises:
        KeyError: If ``raw`` has no ``"kind"`` entry.
        ValueError: If the kind is not registered, or if ``pii_type`` /
            ``letter_case`` is not a valid value of its enum.
        TypeError: If the remaining entries do not match the parameters of
            the strategy's constructor.
    """
    kind = raw["kind"]
    if kind not in _STRATEGY_KIND_MAP:
        msg = f"Unknown pseudonymization kind: {kind}"
        raise ValueError(msg)
    strategy_cls = _STRATEGY_KIND_MAP[kind]

    # Everything except the discriminator is forwarded to the constructor.
    options = dict(raw)
    del options["kind"]

    # Enum-typed options are converted to their enum members before construction.
    for option, enum_cls in (("pii_type", PiiType), ("letter_case", SpecificIdLetterCase)):
        if option in options:
            options[option] = enum_cls(options[option])

    return strategy_cls(**options)
@dataclass(frozen=True)
class ColumnInfo:
    """Immutable per-column entry listed in ``TableInfo.columns``.

    Only ``field`` is required; every other attribute is optional and
    defaults to ``None``.
    """

    # NOTE(review): presumably the name of the column in the table data — confirm.
    field: str
    # Declared data type of the column, as a ``ColumnType`` token.
    type: ColumnType | None = None
    # NOTE(review): free-form string; its relation to ``type`` is not visible here — confirm.
    value_type: str | None = None
    # NOTE(review): the flags below are only stored here; their effect lives in the consumer.
    identifier: bool | None = None
    primary_key: bool | None = None
    time_series_time: bool | None = None
    # Strategy object (one of the ``PseudonymizationStrategy`` classes) attached to the column.
    pseudonymization: PseudonymizationStrategy | None = None
    # NOTE(review): presumably marks the column for removal — confirm.
    drop: bool | None = None
[docs]
@dataclass(frozen=True)
class LinkMethod(StrEnum):
"""Available assignment methods to link a child to its parent table after the anonymization."""
LINEAR_SUM_ASSIGNMENT = "linear_sum_assignment"
"""Assign using the linear sum assignment algorithm.
This method is a good privacy and utility trade-off. The algorithm consumes lots of resources.
"""
MINIMUM_DISTANCE_ASSIGNMENT = "minimum_distance_assignment"
"""Assign using the minimum distance assignment algorithm.
This method assigns the closest child to the parent. It is an acceptable privacy and utility
trade-off.
This algorithm consumes less resources than the linear sum assignment."""
SENSITIVE_ORIGINAL_ORDER_ASSIGNMENT = "sensitive_original_order_assignment"
"""Assign the child to the parent using the original order.
WARNING!!! This method is a HIGH PRIVACY BREACH as it keeps the original order to assign
the child to the parent.
This method isn't recommended for privacy reasons but consumes less resources than the other
methods."""
TIME_SERIES = "time_series"
"""Specific assignment method for time series data.
It is used to link time series data to the parent table."""
@dataclass(frozen=True)
class TableLinkInfoSpec:
    """Target side of a table link: the table and field being pointed at."""

    # Name of the destination table.
    table: str
    # Name of the field inside the destination table.
    field: str
@dataclass(frozen=True)
class TableLinkInfo:
    """Link going from one field to a field of another table."""

    # Source field of the link.
    field: str
    # Destination (table + field) the source field points to.
    to: TableLinkInfoSpec
    # Assignment method attached to this link (see ``LinkMethod``).
    method: LinkMethod
# ``frozen=False``: unlike the other dataclasses of this module, instances
# stay mutable after construction.
@dataclass(frozen=False)
class TableInfo:
    """Entry describing one table of a schema.

    Only ``name`` is required; every other attribute is optional and
    defaults to ``None``.
    """

    name: str
    data: TableDataInfo | None = None
    # NOTE(review): meaning is not visible in this module — confirm with the consumer.
    individual_level: bool | None = None
    # NOTE(review): presumably the location of the avatarized counterpart of ``data`` — confirm.
    avatars_data: TableDataInfo | None = None
    # Per-column entries of the table.
    columns: list[ColumnInfo] | None = None
    # Links from fields of this table to fields of other tables.
    links: list[TableLinkInfo] | None = None
@dataclass(frozen=True)
class SchemaSpec:
    """``spec`` section of a :class:`Schema`: its tables plus an optional reference."""

    # Tables belonging to the schema.
    tables: list[TableInfo]
    # NOTE(review): presumably the name of another schema this one refers to — confirm.
    schema_ref: str | None = None
@dataclass(frozen=True)
class Schema:
    """Immutable schema document made of a kind, its metadata and its spec."""

    # Model kind of the document; ``get_schema`` sets it to ``ModelKind.SCHEMA``.
    kind: ModelKind
    # Document metadata; ``get_schema`` fills in its ``name``.
    metadata: Metadata
    # Tables (and optional schema reference) carried by the document.
    spec: SchemaSpec
def get_schema(name: str, tables: list[TableInfo], schema_ref: str | None = None) -> Schema:
    """Assemble a :class:`Schema` document from a name and its tables.

    Args:
        name: Name stored in the metadata of the schema.
        tables: Tables placed in the schema spec.
        schema_ref: Optional value stored as ``spec.schema_ref``.

    Returns:
        A :class:`Schema` whose kind is ``ModelKind.SCHEMA``.
    """
    schema_kind = ModelKind.SCHEMA
    schema_metadata = Metadata(name=name)
    schema_spec = SchemaSpec(tables=tables, schema_ref=schema_ref)
    return Schema(kind=schema_kind, metadata=schema_metadata, spec=schema_spec)