Source code for avatar_yaml.models.schema

from dataclasses import dataclass
from enum import StrEnum
from typing import Literal, Union

from avatar_yaml.models.common import Metadata, ModelKind


[docs] class ColumnType(StrEnum): INT = "int" BOOL = "bool" CATEGORY = "category" NUMERIC = "float" DATETIME = "datetime" DATETIME_TZ = "datetime_tz"
@dataclass(frozen=True) class TableDataInfo: volume: str | None = None file: str | None = None auto: bool | None = None class PiiType(StrEnum): """Category of personally identifiable information (PII) to generate. Used with :class:`FakeDataStrategy` to select the kind of realistic fake data to produce. The generated values are locale-aware (default ``en_US``). """ EMAIL = "EMAIL" """A syntactically valid email address (e.g. ``john.doe@example.com``).""" FIRST_NAME = "FIRST_NAME" """A given/first name (e.g. ``Alice``).""" LAST_NAME = "LAST_NAME" """A family/last name (e.g. ``Smith``).""" FULL_NAME = "FULL_NAME" """A full personal name combining first and last name (e.g. ``Alice Smith``).""" PHONE = "PHONE" """A phone number in a locale-appropriate format (e.g. ``+1-800-555-0100``).""" SSN = "SSN" """A Social Security Number formatted string (e.g. ``123-45-6789``). Use only when the source data contains SSN-like identifiers.""" ADDRESS = "ADDRESS" """A multi-line postal address (e.g. ``123 Main St, Springfield, IL 62701``).""" FREE_TEXT = "FREE_TEXT" """A paragraph of random lorem-ipsum-style text. Use for unstructured text columns (notes, comments, descriptions) that must be replaced wholesale.""" class SpecificIdLetterCase(StrEnum): """Controls the case of letters produced by the ``?`` placeholder in :class:`SpecificIdStrategy` patterns. """ UPPER = "upper" """All generated letters are upper-case (e.g. ``A``, ``B``, ``Z``).""" LOWER = "lower" """All generated letters are lower-case (e.g. ``a``, ``b``, ``z``).""" BOTH = "both" """Letters are drawn from the full mixed-case alphabet (default).""" @dataclass(frozen=True) class FakeDataStrategy: """Replace PII values with realistic fake data of the same type. The generated values are locale-aware and structurally valid (e.g. an email replacement is a real-looking email address). This strategy is ideal when you need pseudonymized data that still looks natural to downstream consumers. 
Example Python:: FakeDataStrategy(pii_type=PiiType.EMAIL) Attributes: pii_type: The category of PII to generate. See :class:`PiiType` for the full list of supported types. consistent: When ``True`` (default), the same source value always maps to the same fake value within a pipeline run — i.e. if ``alice@corp.com`` appears on three rows it will be replaced with the same fake email on all three rows. Set to ``False`` to generate a fresh independent value for every row (useful when uniqueness matters more than cross-row consistency). high_variability: When ``False`` (default), a pool of pre-generated fake values is sampled for each row (~70× faster). Because rows draw from that shared pool, distinct source values can collide and receive the same fake value. Set to ``True`` for fully independent per-row generation — slower but collision-free. """ pii_type: PiiType consistent: bool = True high_variability: bool = False kind: Literal["FAKER"] = "FAKER" @dataclass(frozen=True) class HashSha256Strategy: """Replace each value with its SHA-256 hex digest (deterministic, one-way). The hash is irreversible and consistent: the same source value always produces the same 64-character hex string, across runs and pipeline instances. No ``consistent`` flag is needed — SHA-256 is inherently deterministic. .. note:: If the source column contains low-entropy values (e.g. integers or short codes), a brute-force dictionary attack on the hash is feasible. Prefer :class:`Uuid4Strategy` or :class:`FakeDataStrategy` in those cases. Example Python:: HashSha256Strategy() """ kind: Literal["HASH_SHA256"] = "HASH_SHA256" @dataclass(frozen=True) class Uuid4Strategy: """Replace each unique value with a randomly generated UUID (version 4). UUIDs are opaque, globally unique, and carry no information about the original value. Use this strategy for primary/foreign keys or any identifier where structural realism is not required. 
Example Python:: Uuid4Strategy() Uuid4Strategy(consistent=False) # fresh UUID per row Attributes: consistent: When ``True`` (default), the same source value always maps to the same UUID within a run, preserving referential integrity across tables. Set to ``False`` to generate a new UUID for every row independently. """ consistent: bool = True kind: Literal["UUID4"] = "UUID4" @dataclass(frozen=True) class ConstantStrategy: """Replace every value in the column with a single fixed string. All rows receive the same replacement value regardless of their original content. This is the simplest strategy and is useful when the column must be fully suppressed or redacted. Example Python:: ConstantStrategy(value="REDACTED") ConstantStrategy(value="***") Attributes: value: The string that replaces every value in the column. """ value: str kind: Literal["CONSTANT"] = "CONSTANT" @dataclass(frozen=True) class IntegerStrategy: """Map each unique source value to a unique pseudonymous integer. The mapping is randomized (not sequential) so that the original sort order is not preserved. Useful for numeric identifiers (e.g. customer IDs, account numbers) when downstream code expects an integer type. Example Python:: IntegerStrategy() IntegerStrategy(consistent=False) # independent integer per row Attributes: consistent: When ``True`` (default), the same source value always maps to the same integer within a run, preserving referential integrity. Set to ``False`` to assign an independent integer to every row. """ consistent: bool = True kind: Literal["INTEGER"] = "INTEGER" @dataclass(frozen=True) class SpecificIdStrategy: """Generate structured identifiers from a user-defined pattern. Patterns combine literal characters with placeholders and optional references to other (already-pseudonymized) columns. 
**Placeholders** +-----------+----------------------------------------------+ | ``?`` | Random letter (case set by ``letter_case``) | +-----------+----------------------------------------------+ | ``#`` | Random digit (0–9) | +-----------+----------------------------------------------+ | ``^`` | Random alphanumeric character (a–z, 0–9) | +-----------+----------------------------------------------+ | ``{{col}}``| Value of another column (already | | | pseudonymized) for the same row | +-----------+----------------------------------------------+ Prefix any placeholder with a single backslash in the pattern to include it literally (e.g. ``\\#`` outputs ``#``, not a digit; in a Python string literal write ``"\\#"``). Example Python:: SpecificIdStrategy(pattern="EMP-####") SpecificIdStrategy(pattern="{{department}}-???##", letter_case=SpecificIdLetterCase.UPPER) SpecificIdStrategy(pattern="USR-^^^^", consistent=False) Attributes: pattern: The format string defining the structure of the generated ID. Must be a non-empty string. Combine literal text, placeholders (``?``, ``#``, ``^``), and column references (``{{col_name}}``). letter_case: Controls the letter case for ``?`` placeholders. Defaults to :attr:`SpecificIdLetterCase.BOTH` (mixed case). consistent: When ``True`` (default), the same source value always maps to the same generated ID within a run. Set to ``False`` to produce a fresh ID for every row. 
""" pattern: str letter_case: SpecificIdLetterCase = SpecificIdLetterCase.BOTH consistent: bool = True kind: Literal["SPECIFIC_ID"] = "SPECIFIC_ID" PseudonymizationStrategy = Union[ FakeDataStrategy, HashSha256Strategy, Uuid4Strategy, ConstantStrategy, IntegerStrategy, SpecificIdStrategy, ] # Mapping from kind string to strategy constructor for YAML parsing _STRATEGY_KIND_MAP: dict[ Literal["FAKER", "HASH_SHA256", "UUID4", "CONSTANT", "INTEGER", "SPECIFIC_ID"], type[PseudonymizationStrategy], ] = { "FAKER": FakeDataStrategy, "HASH_SHA256": HashSha256Strategy, "UUID4": Uuid4Strategy, "CONSTANT": ConstantStrategy, "INTEGER": IntegerStrategy, "SPECIFIC_ID": SpecificIdStrategy, } def parse_pseudonymization(raw: dict) -> PseudonymizationStrategy: """Parse a pseudonymization dict (from YAML) into a strategy object.""" kind = raw["kind"] cls = _STRATEGY_KIND_MAP.get(kind) if cls is None: msg = f"Unknown pseudonymization kind: {kind}" raise ValueError(msg) kwargs = {k: v for k, v in raw.items() if k != "kind"} if "pii_type" in kwargs: kwargs["pii_type"] = PiiType(kwargs["pii_type"]) if "letter_case" in kwargs: kwargs["letter_case"] = SpecificIdLetterCase(kwargs["letter_case"]) return cls(**kwargs) @dataclass(frozen=True) class ColumnInfo: field: str type: ColumnType | None = None value_type: str | None = None identifier: bool | None = None primary_key: bool | None = None time_series_time: bool | None = None pseudonymization: PseudonymizationStrategy | None = None drop: bool | None = None
[docs] @dataclass(frozen=True) class LinkMethod(StrEnum): """Available assignment methods to link a child to its parent table after the anonymization.""" LINEAR_SUM_ASSIGNMENT = "linear_sum_assignment" """Assign using the linear sum assignment algorithm. This method is a good privacy and utility trade-off. The algorithm consumes lots of resources. """ MINIMUM_DISTANCE_ASSIGNMENT = "minimum_distance_assignment" """Assign using the minimum distance assignment algorithm. This method assigns the closest child to the parent. It is an acceptable privacy and utility trade-off. This algorithm consumes less resources than the linear sum assignment.""" SENSITIVE_ORIGINAL_ORDER_ASSIGNMENT = "sensitive_original_order_assignment" """Assign the child to the parent using the original order. WARNING!!! This method is a HIGH PRIVACY BREACH as it keeps the original order to assign the child to the parent. This method isn't recommended for privacy reasons but consumes less resources than the other methods.""" TIME_SERIES = "time_series" """Specific assignment method for time series data. It is used to link time series data to the parent table."""
@dataclass(frozen=True)
class TableLinkInfoSpec:
    """Destination part of a table link."""

    table: str
    field: str


@dataclass(frozen=True)
class TableLinkInfo:
    """A link from a field to a field in another table."""

    field: str
    to: TableLinkInfoSpec
    method: LinkMethod


# Unlike the other models in this module, TableInfo is explicitly left
# mutable (frozen=False).
@dataclass(frozen=False)
class TableInfo:
    """Declaration of one table: its name plus optional data, columns and links.

    Only ``name`` is required; every other attribute defaults to ``None``.
    """

    name: str
    data: TableDataInfo | None = None
    # NOTE(review): presumably flags a table whose rows are individuals — confirm.
    individual_level: bool | None = None
    avatars_data: TableDataInfo | None = None
    columns: list[ColumnInfo] | None = None
    links: list[TableLinkInfo] | None = None


@dataclass(frozen=True)
class SchemaSpec:
    """Body of a schema: the list of tables and an optional ``schema_ref``."""

    tables: list[TableInfo]
    schema_ref: str | None = None


@dataclass(frozen=True)
class Schema:
    """Top-level schema model made of ``kind``, ``metadata`` and ``spec``."""

    kind: ModelKind
    metadata: Metadata
    spec: SchemaSpec


def get_schema(name: str, tables: list[TableInfo], schema_ref: str | None = None) -> Schema:
    """Build a :class:`Schema` of kind ``ModelKind.SCHEMA``.

    Args:
        name: Name stored in the schema's metadata.
        tables: Tables placed in the schema spec.
        schema_ref: Optional value stored as ``spec.schema_ref``.

    Returns:
        The assembled :class:`Schema`.
    """
    schema_kind = ModelKind.SCHEMA
    schema_metadata = Metadata(name=name)
    schema_spec = SchemaSpec(tables=tables, schema_ref=schema_ref)
    return Schema(kind=schema_kind, metadata=schema_metadata, spec=schema_spec)