# Source code for ahvn.utils.basic.rnd_utils

"""\
Random utilities with stable seeding that don't interfere with global random state.

Two families of helpers live here:

- Seed-based helpers (`stable_rnd`, `stable_rndint`, `stable_shuffle`,
  `stable_rnd_vector`) draw values from Python's `random` module for a given
  seed and are meant to leave the global generator state as they found it.
- Hash-based helpers (`stable_split`, `stable_sample`) pick items according to
  the `md5hash` value of each item (salted with the seed), so the decision for
  one item does not depend on the other items in the sequence.
"""

# Public API: the names exported by a star-import of this module.
__all__ = [
    "stable_rnd",
    "stable_rndint",
    "stable_shuffle",
    "stable_split",
    "stable_sample",
    "stable_rnd_vector",
]

import heapq
import random
from typing import Iterable, List, Any, Tuple, Optional
from .hash_utils import md5hash



# [docs]
def stable_rnd(seed: Optional[int] = 42) -> float:
    """\
    Draw one float from [0.0, 1.0) while leaving the global `random` state untouched.

    The global generator state is snapshotted, optionally re-seeded, sampled once,
    and then restored, so callers relying on the global `random` module observe
    no change.

    Args:
        seed (int, optional): Seed used for the draw. Default is 42.
            If None, no re-seeding happens and the value is simply the next one
            the global generator would have produced (so it depends on whatever
            state the global generator is in). Passing a non-null `seed` is
            strongly recommended to get a reproducible value.

    Returns:
        float: Random float in [0.0, 1.0).
    """
    saved_state = random.getstate()
    try:
        if seed is not None:
            random.seed(seed)
        return random.random()
    finally:
        # Always hand the global generator back exactly as we found it.
        random.setstate(saved_state)




# [docs]
def stable_rndint(min: int, max: int, seed: Optional[int] = 42) -> int:
    """\
    Generate a random integer between min and max (inclusive) without affecting the global random state.

    The global generator state is saved before the draw and restored afterwards,
    including when the draw itself fails (for example when `min > max`), so an
    invalid call can never leave the global generator re-seeded.

    Args:
        min (int): Minimum value (inclusive).
        max (int): Maximum value (inclusive).
        seed (int, optional): Seed for deterministic random. Default is 42.
            If None, no re-seeding happens and the value is drawn from the current
            global generator state (unstable). It is strongly recommended
            to pass in a non-null `seed` value to ensure stability.

    Returns:
        int: Random integer in the specified range.

    Raises:
        ValueError: If the range is empty (`min > max`), as raised by `random.randint`.
    """
    current_state = random.getstate()
    try:
        if seed is not None:
            random.seed(seed)
        return random.randint(min, max)
    finally:
        # Restore even on error: randint raises *after* the generator was re-seeded.
        random.setstate(current_state)




# [docs]
def stable_shuffle(seq: Iterable[Any], inplace: bool = False, seed: Optional[int] = 42) -> List[Any]:
    """\
    Return the elements of `seq` in shuffled order, leaving the global random state untouched.

    Args:
        seq (Iterable[Any]): The elements to shuffle.
        inplace (bool, optional): If True and `seq` is a `list`, that very list is
            shuffled and returned. For any other iterable the flag has no effect
            and a new list is built. Default is False.
        seed (int, optional): Seed for deterministic shuffling. Default is 42.
            If None, no re-seeding happens and the order is derived from the current
            global generator state (unstable). Passing a non-null `seed` is
            strongly recommended to ensure stability.

    Returns:
        List[Any]: The shuffled list. This is `seq` itself when shuffling in place,
        otherwise a new list holding the elements of `seq`.
    """
    shuffle_in_place = inplace and isinstance(seq, list)
    target = seq if shuffle_in_place else list(seq)

    saved_state = random.getstate()
    try:
        if seed is not None:
            random.seed(seed)
        random.shuffle(target)
    finally:
        # Put the global generator back exactly as it was.
        random.setstate(saved_state)
    return target




# [docs]
def stable_split(seq: Iterable[Any], r: float = 0.10, seed: Optional[int] = 42) -> Tuple[List[Any], List[Any]]:
    """\
    Partition a sequence into two lists using a per-item, hash-based decision.

    Each item is assigned on its own: its `md5hash` (salted with `seed`) is
    reduced modulo a fixed constant and compared against a threshold derived
    from `r`. Because the decision for one item never looks at the other items,
    adding or removing items does not move the remaining ones between groups.

    The resulting ratio is only approximately `r`: it depends on where the
    individual hash values happen to fall, and gets closer to `r` as the
    dataset grows. Use `stable_sample` when an exact count is required.

    Args:
        seq (Iterable[Any]): The items to split.
        r (float, optional): Target ratio for the first group (default: 0.10 for 10%).
            When `r == 0` the first group is always empty.
        seed (int, optional): Salt passed to `md5hash`. Default is 42.
            Passing a non-null `seed` is strongly recommended to ensure stability.

    Returns:
        Tuple[List[Any], List[Any]]: A tuple containing (selected_items, remaining_items),
        each in the original iteration order.
    """
    if r == 0:
        # Short-circuit: the `<=` comparison below would still select items whose
        # reduced hash is exactly 0.
        return [], list(seq)

    modulus = 1061109589
    selected: List[Any] = []
    remaining: List[Any] = []
    for element in seq:
        reduced_hash = md5hash(element, salt=seed) % modulus
        bucket = selected if reduced_hash <= (modulus - 1) * r else remaining
        bucket.append(element)
    return selected, remaining




# [docs]
def stable_sample(seq: Iterable[Any], n: int, seed: Optional[int] = 42) -> List[Any]:
    """\
    Sample n elements without replacement in a stable manner using min n of hash values.

    This function creates a stable sample that is resilient to adding/removing items.
    Items are selected based on their hash values, ensuring that the same items are
    consistently selected even when the sequence changes, as long as n remains the same.

    The input is materialized once, so any iterable (including generators and other
    iterables without `len`) is accepted. Duplicate items, or distinct items whose
    reduced hash values collide, are kept as separate candidates, so exactly `n`
    elements are returned whenever `n` is smaller than the number of input items.

    Args:
        seq (Iterable[Any]): The sequence to sample from.
        n (int): Number of elements to sample.
        seed (int, optional): Seed for deterministic sampling, passed to `md5hash`
            as salt. Default is 42. It is strongly recommended to pass in a
            non-null `seed` value to ensure stability.

    Returns:
        List[Any]: A list containing the sampled elements. It is empty when `n <= 0`,
        a copy of all items in their original order when `n` is at least the number
        of items, and otherwise the `n` items with the smallest reduced hash values
        in ascending hash order (ties keep their input order).
    """
    if n <= 0:
        return list()
    # Materialize once: `seq` may be a one-shot iterator or lack `len`.
    items = list(seq)
    if n >= len(items):
        return items
    P = 1061109589
    # Keying on the hash (instead of building a hash -> item dict) keeps duplicates
    # and colliding items as separate candidates; nsmallest evaluates the key once per item.
    return heapq.nsmallest(n, items, key=lambda item: md5hash(item, salt=seed) % P)




# [docs]
def stable_rnd_vector(seed: Optional[int] = 42, dim: int = 384, major_ratio: float = 0.7) -> List[float]:
    """\
    Generate a stable random vector with a major value on a hashed dimension.

    This function creates a deterministic embedding-like vector where:
    - One dimension (determined by hashing the seed) has a major value
    - Other dimensions have small random values
    - The entire vector is normalized via softmax then L2 normalization to unit length

    This is useful for creating mock embeddings in tests where you need
    deterministic but varied vectors.

    The random values are drawn from a private `random.Random` instance seeded
    with the hash of `seed`, so the global `random` state is never touched, not
    even when generation fails part-way (e.g. for an invalid `dim`).

    Args:
        seed (int, optional): Seed value for deterministic generation. Default is 42.
            If None, uses 42 as default to ensure stability.
        dim (int, optional): Dimensionality of the vector. Default is 384.
            Must be positive.
        major_ratio (float, optional): Value assigned to the major dimension
            before normalization. Default is 0.7. The other dimensions receive
            values drawn uniformly from [0.01, 0.1], then the whole vector is
            normalized via softmax + L2.

    Returns:
        List[float]: A normalized vector of length `dim` with unit L2 norm.

    Example:
        >>> vec1 = stable_rnd_vector(seed=123, dim=5)
        >>> vec2 = stable_rnd_vector(seed=123, dim=5)
        >>> vec1 == vec2  # Same seed produces same vector
        True
        >>> vec3 = stable_rnd_vector(seed=456, dim=5)
        >>> vec1 != vec3  # Different seed produces different vector
        True
    """
    import math

    # Use 42 as default seed if None provided
    if seed is None:
        seed = 42

    # Hash the seed to determine the major dimension
    hash_val = md5hash(seed)
    major_dim = hash_val % dim

    # A private generator seeded with the hash yields the same sequence as
    # re-seeding the global one, without ever mutating global state.
    rng = random.Random(hash_val)

    # Generate small random values for all dimensions
    vector = [rng.uniform(0.01, 0.1) for _ in range(dim)]

    # Set the major dimension to a larger value
    vector[major_dim] = major_ratio

    # Apply softmax for initial normalization
    max_val = max(vector)
    exp_vector = [math.exp(x - max_val) for x in vector]  # Subtract max for numerical stability
    exp_sum = sum(exp_vector)
    softmax_vector = [x / exp_sum for x in exp_vector]

    # Apply L2 normalization to unit length
    magnitude = sum(x * x for x in softmax_vector) ** 0.5
    return [x / magnitude for x in softmax_vector]