Source code for real_simple_stats.descriptive_statistics

import math
from collections import Counter
from collections.abc import Sequence

# --- Basic Descriptive Functions ---



[docs]
def is_discrete(values: Sequence[float]) -> bool:
    """Report whether every value in a dataset is a whole number.

    Each value is converted with ``float()`` and tested with
    ``float.is_integer()``, so ``2`` and ``2.0`` both count as whole numbers.

    Args:
        values: Numerical values to inspect

    Returns:
        True when every value is integral (also for an empty input),
        False as soon as one non-integral value is found

    Example:
        >>> is_discrete([1.0, 2.0, 3.0])
        True
        >>> is_discrete([1.5, 2.0, 3.0])
        False
    """
    for item in values:
        # Stop at the first fractional value; the remaining items cannot
        # change the answer.
        if not float(item).is_integer():
            return False
    return True




[docs]
def is_continuous(values: Sequence[float]) -> bool:
    """Report whether a dataset contains at least one non-integer value.

    This is the logical negation of ``is_discrete``.

    Args:
        values: Numerical values to inspect

    Returns:
        True if at least one value is not a whole number, False if every
        value is a whole number

    Example:
        >>> is_continuous([1.5, 2.0, 3.0])
        True
        >>> is_continuous([1.0, 2.0, 3.0])
        False
    """
    all_whole_numbers = is_discrete(values)
    return not all_whole_numbers




[docs]
def five_number_summary(values: Sequence[float]) -> dict[str, float]:
    """Compute min, Q1, median, Q3 and max for a dataset.

    Quartiles are the medians of the lower and upper halves of the sorted
    data; for an odd number of values the middle value is left out of both
    halves. Samples of two or three values use the minimum as Q1 and the
    maximum as Q3, and a single value fills all five slots.

    Args:
        values: Numerical values to summarise

    Returns:
        Dictionary with the keys ``min``, ``Q1``, ``median``, ``Q3`` and
        ``max``, in that order

    Raises:
        ValueError: If ``values`` is empty

    Example:
        >>> five_number_summary([1, 2, 3, 4, 5])
        {'min': 1, 'Q1': 1.5, 'median': 3, 'Q3': 4.5, 'max': 5}
        >>> five_number_summary([5])
        {'min': 5, 'Q1': 5, 'median': 5, 'Q3': 5, 'max': 5}
    """
    if not values:
        raise ValueError("Cannot calculate five-number summary of empty list")

    ordered = sorted(values)
    count = len(ordered)
    smallest, largest = ordered[0], ordered[-1]

    # A lone observation is every statistic at once.
    if count == 1:
        return dict.fromkeys(("min", "Q1", "median", "Q3", "max"), smallest)

    half, is_odd = divmod(count, 2)
    if is_odd:
        center = ordered[half]
    else:
        center = (ordered[half - 1] + ordered[half]) / 2

    if count <= 3:
        # With two or three observations the quartiles are the extremes.
        lower_quartile, upper_quartile = smallest, largest
    else:
        # Odd-sized samples leave the middle element out of both halves.
        lower_quartile = median(ordered[:half])
        upper_quartile = median(ordered[half + 1 :] if is_odd else ordered[half:])

    return {
        "min": smallest,
        "Q1": lower_quartile,
        "median": center,
        "Q3": upper_quartile,
        "max": largest,
    }




[docs]
def median(values: Sequence[float]) -> float:
    """Return the middle value of a dataset.

    The data is sorted first. With an odd number of values the middle
    element is returned unchanged (so its type is preserved); with an even
    number of values the two central elements are averaged.

    Args:
        values: Numerical values

    Returns:
        The median of ``values``

    Raises:
        ValueError: If ``values`` is empty

    Example:
        >>> median([1, 2, 3, 4, 5])
        3
        >>> median([1, 2, 3, 4])
        2.5
    """
    if not values:
        raise ValueError("Cannot calculate median of empty list")

    ordered = list(values)
    ordered.sort()
    half, is_odd = divmod(len(ordered), 2)
    if is_odd:
        return ordered[half]
    # Even count: average the two elements around the centre.
    return (ordered[half - 1] + ordered[half]) / 2




[docs]
def interquartile_range(values: Sequence[float]) -> float:
    """Calculate the interquartile range (Q3 - Q1) of a dataset.

    The quartiles come from ``five_number_summary``, so the same quartile
    convention and the same error behaviour apply.

    Args:
        values: Numerical values

    Returns:
        The difference between the third and first quartiles

    Raises:
        ValueError: If ``values`` is empty (raised by ``five_number_summary``)

    Example:
        >>> interquartile_range([1, 2, 3, 4, 5])
        3.0
    """
    quartiles = five_number_summary(values)
    upper_quartile = quartiles["Q3"]
    lower_quartile = quartiles["Q1"]
    return upper_quartile - lower_quartile




[docs]
def sample_variance(values: Sequence[float]) -> float:
    """Compute the sample variance of a dataset.

    The sum of squared deviations from the arithmetic mean is divided by
    ``n - 1`` (Bessel's correction) rather than ``n``.

    Args:
        values: Numerical values

    Returns:
        The sample variance

    Raises:
        ValueError: If fewer than 2 values are provided

    Example:
        >>> sample_variance([1, 2, 3, 4, 5])
        2.5
    """
    count = len(values)
    if count < 2:
        raise ValueError("Sample variance requires at least 2 values")

    center = sum(values) / count
    squared_deviations = [(item - center) ** 2 for item in values]
    return sum(squared_deviations) / (count - 1)




[docs]
def sample_std_dev(values: Sequence[float]) -> float:
    """Compute the sample standard deviation of a dataset.

    This is the square root of ``sample_variance(values)``.

    Args:
        values: Numerical values

    Returns:
        The sample standard deviation

    Raises:
        ValueError: If fewer than 2 values are provided (raised by
            ``sample_variance``)

    Example:
        >>> sample_std_dev([1, 2, 3, 4, 5])
        1.5811388300841898
    """
    variance = sample_variance(values)
    return math.sqrt(variance)




[docs]
def coefficient_of_variation(values: Sequence[float]) -> float:
    """Calculate the coefficient of variation of a dataset.

    The result is the sample standard deviation divided by the arithmetic
    mean. The mean is used as-is (no absolute value), so a negative mean
    gives a negative result.

    Args:
        values: Numerical values

    Returns:
        ``sample_std_dev(values) / mean(values)``

    Raises:
        ValueError: If the mean is zero, or if ``mean`` or
            ``sample_std_dev`` reject the input (empty input, or fewer
            than 2 values)

    Example:
        >>> round(coefficient_of_variation([1, 2, 3, 4, 5]), 4)
        0.527
    """
    average = mean(values)
    if average != 0:
        return sample_std_dev(values) / average
    # Dividing by a zero mean is undefined, so fail loudly instead.
    raise ValueError("Cannot calculate coefficient of variation when mean is zero")




[docs]
def mean(values: Sequence[float]) -> float:
    """Return the arithmetic mean of a dataset.

    Args:
        values: Numerical values

    Returns:
        The sum of the values divided by how many there are

    Raises:
        ValueError: If ``values`` is empty

    Example:
        >>> mean([1, 2, 3, 4, 5])
        3.0
    """
    if values:
        total = sum(values)
        return total / len(values)
    raise ValueError("Cannot calculate mean of empty list")




[docs]
def draw_frequency_table(
    values: Sequence[str | int],
) -> dict[str | int, int]:
    """Generate a frequency table from a list of categorical or discrete values.

    Args:
        values: List of categorical or discrete values to count

    Returns:
        Dictionary mapping each unique value to its frequency

    Example:
        >>> draw_frequency_table(['A', 'B', 'A', 'C', 'B', 'A'])
        {'A': 3, 'B': 2, 'C': 1}
    """
    return dict(Counter(values))




[docs]
def draw_cumulative_frequency_table(values: Sequence[int]) -> dict[int, int]:
    """Build a cumulative frequency table for discrete values.

    Distinct values are visited in ascending order; each entry holds the
    number of observations less than or equal to that value.

    Args:
        values: Discrete integer values

    Returns:
        Dictionary mapping each distinct value (ascending) to its
        cumulative frequency

    Example:
        >>> draw_cumulative_frequency_table([1, 2, 1, 3, 2, 1])
        {1: 3, 2: 5, 3: 6}
    """
    counts = Counter(values)
    running_total = 0
    # The assignment expression carries the running total from one key to
    # the next while the comprehension walks the keys in sorted order.
    return {
        key: (running_total := running_total + counts[key])
        for key in sorted(counts)
    }




[docs]
def detect_fake_statistics(
    survey_sponsor: str, is_voluntary: bool, correlation_not_causation: bool
) -> list[str]:
    """Flag common red flags in a statistical claim or study.

    Three independent checks are made: whether the sponsor (compared
    case-insensitively) is on a fixed list of sponsors treated as
    self-interested, whether the sample is voluntary-response, and whether
    correlation is being presented as causation.

    Args:
        survey_sponsor: Organization sponsoring the survey/study
        is_voluntary: Whether the survey uses voluntary response sampling
        correlation_not_causation: Whether correlation is being presented
            as causation

    Returns:
        Warning messages for every check that fired, in the order sponsor,
        voluntary response, correlation; empty if none fired

    Example:
        >>> detect_fake_statistics("Diet Pill Company", True, False)
        ['Potential bias: Self-funded study', 'Warning: Voluntary response samples are biased']
    """
    flagged_sponsors = {
        "diet pill company",
        "political campaign",
        "egg company",
    }
    # Each row pairs a condition with the message emitted when it holds.
    checks = (
        (
            survey_sponsor.lower() in flagged_sponsors,
            "Potential bias: Self-funded study",
        ),
        (is_voluntary, "Warning: Voluntary response samples are biased"),
        (
            correlation_not_causation,
            "Warning: Correlation does not imply causation",
        ),
    )
    return [message for triggered, message in checks if triggered]