Source code for avatars.constants

import io
import re
from enum import StrEnum
from typing import (
    IO,
    TYPE_CHECKING,
    Any,
    BinaryIO,
    Union,
)

import pandas as pd
from avatar_yaml.models.schema import ColumnType

if TYPE_CHECKING:
    from IPython.display import HTML  # noqa: F401

from avatars.models import JobKind, JobStatus

# Default timeout value (unit is not stated in this module).
DEFAULT_TIMEOUT: int = 5

# Name of the volume referenced by this package.
VOLUME_NAME: str = "input"

# Type alias for any object accepted as an open file handle, and for a list
# of such handles.
FileLike = Union[BinaryIO, IO[Any], io.IOBase]
FileLikes = list[FileLike]

# Job kinds in the order given by the constant's name (execution order).
# How this order is enforced is not visible in this module.
JOB_EXECUTION_ORDER = [
    JobKind.standard,
    JobKind.signal_metrics,
    JobKind.privacy_metrics,
    JobKind.report,
]

# Statuses that denote a job which ended in an error state.
ERROR_STATUSES = [
    JobStatus.parent_error,
    JobStatus.error,
    JobStatus.lost,
    JobStatus.orphaned,
]

# A job is "ready" once it has finished or ended in one of the error statuses.
READY_STATUSES = [JobStatus.finished] + ERROR_STATUSES

# Status groups used by ``aggregate_job_status``. The failure group holds
# exactly the members of ERROR_STATUSES, so it is built from that list.
_FAILURE_STATUSES: frozenset[JobStatus] = frozenset(ERROR_STATUSES)
_PENDING_STATUSES: frozenset[JobStatus] = frozenset((JobStatus.pending, JobStatus.created))
# NOTE(review): ``JobStatus.field_`` looks like a generated member name --
# confirm which status value it represents.
_QUEUED_STATUSES: frozenset[JobStatus] = frozenset((JobStatus.queued, JobStatus.field_))



[docs]
def aggregate_job_status(statuses: list[JobStatus]) -> JobStatus:
    """Collapse several job statuses into the single worst one.

    Status groups are checked in priority order: failure, then pending, then
    queued. The first status in ``statuses`` that belongs to the
    highest-priority group present is returned.

    Parameters
    ----------
    statuses
        Job statuses to aggregate. An empty list yields
        ``JobStatus.finished``.

    Returns
    -------
    JobStatus
        The first status found in the highest-priority group, or
        ``JobStatus.finished`` when no status belongs to any group.

    Notes
    -----
    Any status outside the failure, pending and queued groups is ignored, so
    a list made only of such statuses aggregates to ``JobStatus.finished``.
    NOTE(review): confirm this is intended for every non-finished status that
    ``JobStatus`` defines.
    """
    # Groups are ordered from worst to least bad; the first group with a
    # member present in ``statuses`` decides the result.
    priority_groups = (_FAILURE_STATUSES, _PENDING_STATUSES, _QUEUED_STATUSES)
    for group in priority_groups:
        first_match = next((status for status in statuses if status in group), None)
        if first_match is not None:
            return first_match
    return JobStatus.finished



# Network retries: retry count and interval between retries.
DEFAULT_NETWORK_RETRY_COUNT: int = 20
DEFAULT_NETWORK_RETRY_INTERVAL: int = 5

# Rate-limit retries: maximum number of retries and minimum wait in seconds.
DEFAULT_RATE_LIMIT_MAX_RETRIES: int = 3
DEFAULT_RATE_LIMIT_MIN_WAIT_SECONDS: float = 1.0

# Job creation and polling: delay between consecutive jobs and poll interval.
DEFAULT_DELAY_BETWEEN_CONSECUTIVE_JOBS: float = 0.5
DEFAULT_POLL_INTERVAL: int = 5



[docs]
class Results(StrEnum):
    """Identifiers for the result kinds referenced in this module.

    Members are strings (``StrEnum``), so each one compares equal to its
    value. They are used by ``RESULTS_TO_STORE`` and as the keys of
    ``mapping_result_to_file_name``.
    """

    # Declaration order is the enum's iteration order.
    # A member's name and value do not always match (for example
    # PROJECTIONS_ORIGINAL -> "original_projections", METADATA ->
    # "run_metadata"), so use the member rather than a guessed string.
    ADVICE = "advice"
    SHUFFLED = "shuffled"
    UNSHUFFLED = "unshuffled"
    PRIVACY_METRICS = "privacy_metrics"
    SIGNAL_METRICS = "signal_metrics"
    REPORT_IMAGES = "report_images"
    PROJECTIONS_ORIGINAL = "original_projections"
    PROJECTIONS_AVATARS = "avatar_projections"
    METADATA = "run_metadata"
    REPORT = "report"
    META_PRIVACY_METRIC = "meta_privacy_metric"
    META_SIGNAL_METRIC = "meta_signal_metric"
    FIGURES = "figures"
    FIGURES_METADATA = "figures_metadata"
    PRIVACY_METRICS_SUMMARY = "privacy_metrics_summary"
    SIGNAL_METRICS_SUMMARY = "signal_metrics_summary"




[docs]
class PlotKind(StrEnum):
    """Available plot types for visualization.

    Members are strings (``StrEnum``), so each one compares equal to its
    value. The string literal placed directly after a member describes that
    member.
    """

    CORRELATION = "correlation"
    """A correlation heatmap of the original and avatar data."""

    CORRELATION_DIFFERENCE = "correlation_difference"
    """A heatmap of the difference between the original and avatar data."""

    CONTRIBUTION = "contribution"
    """A bar chart showing the contribution of each feature in the model."""

    PROJECTION_2D = "2d_projection"
    """A 2D projection of the original and avatar data."""

    PROJECTION_3D = "3d_projection"
    """A 3D projection of the original and avatar data."""

    DISTRIBUTION = "distribution"
    """Distributions plot of the original and avatar data, there is a plot for each column."""

    AGGREGATE_STATS = "aggregate_stats"
    """A table containing the mean and std of the original and avatar data
    (of the 10 first columns)."""

    RAW_SERIES = "raw_series"
    """A line plot of the original and avatar time series over time."""

    NORMALIZED_SERIES = "normalized_series"
    """A line plot of the normalized original and avatar time series over time."""

    CLASS_PROJECTION_2D = "class_projection_2d"
    """A 2D projection colored by the target class
    (only available with class balancing augmentation).
    """

    METRICS_SUMMARY = "metrics_summary"
    """A summary table of privacy metrics."""



# Subset of ``Results`` selected for storage (per the constant's name; the
# consumer is not visible in this module -- confirm how the list is used).
# Not included: ADVICE, REPORT_IMAGES, REPORT, META_PRIVACY_METRIC,
# META_SIGNAL_METRIC and FIGURES_METADATA.
RESULTS_TO_STORE = [
    Results.SHUFFLED,
    Results.UNSHUFFLED,
    Results.PRIVACY_METRICS,
    Results.SIGNAL_METRICS,
    Results.PROJECTIONS_ORIGINAL,
    Results.PROJECTIONS_AVATARS,
    Results.METADATA,
    Results.FIGURES,
    Results.PRIVACY_METRICS_SUMMARY,
    Results.SIGNAL_METRICS_SUMMARY,
]

# Union of the value types a result can take.
# ``HTML`` is imported only under ``TYPE_CHECKING``. That is safe here because
# a ``type`` statement evaluates its right-hand side lazily: the name is only
# resolved if ``TypeResults.__value__`` is accessed at runtime.
type TypeResults = dict | pd.DataFrame | str | list[dict[str, Any]] | None | HTML

# Compiled regex patterns mapped to the ``ColumnType`` they stand for.
# Presumably matched against pandas dtype strings (``datetime64[ns, UTC]`` is
# one such string) -- confirm against the consumer, which is not visible here.
#
# NOTE(review): dict iteration follows insertion order, and the generic
# ``datetime`` pattern also matches the tz-aware string targeted by the
# ``datetime64\[ns, UTC\]`` pattern. If the consumer returns on the first
# matching pattern, ``ColumnType.DATETIME_TZ`` can never be selected -- confirm
# the lookup strategy before relying on it or reordering the entries.
MATCHERS: dict[re.Pattern[str], ColumnType] = {
    re.compile(r"float"): ColumnType.NUMERIC,
    re.compile(r"int"): ColumnType.INT,
    re.compile(r"bool"): ColumnType.BOOL,
    re.compile(r"datetime"): ColumnType.DATETIME,
    re.compile(r"datetime64\[ns, UTC\]"): ColumnType.DATETIME_TZ,
    # FIXME: implement bool ColumnType
    # NOTE(review): a ``bool`` pattern is already mapped to ColumnType.BOOL
    # above, so this FIXME may be stale -- confirm and remove.
}

# Column type named as the default; presumably used when no pattern in
# MATCHERS applies -- confirm against the consumer.
DEFAULT_TYPE = ColumnType.CATEGORY

# File name associated with each ``Results`` member.
# Some names carry an extension (".json", ".md") and others do not
# ("shuffled", "unshuffled", "projections.original", "projections.avatars");
# presumably the consumer completes those -- confirm.
# NOTE(review): REPORT_IMAGES, META_PRIVACY_METRIC, META_SIGNAL_METRIC,
# FIGURES and FIGURES_METADATA have no entry here, although FIGURES is listed
# in RESULTS_TO_STORE -- confirm that lookups for those members are handled
# elsewhere.
mapping_result_to_file_name = {
    Results.ADVICE: "advice.json",
    Results.SHUFFLED: "shuffled",
    Results.UNSHUFFLED: "unshuffled",
    Results.PRIVACY_METRICS: "privacy.json",
    Results.SIGNAL_METRICS: "signal.json",
    Results.PROJECTIONS_ORIGINAL: "projections.original",
    Results.PROJECTIONS_AVATARS: "projections.avatars",
    Results.METADATA: "run_metadata.json",
    Results.REPORT: "report.md",
    Results.PRIVACY_METRICS_SUMMARY: "privacy_metrics_summary.json",
    Results.SIGNAL_METRICS_SUMMARY: "signal_metrics_summary.json",
}