o
    Eb?K                  	   @   s   d dl Z d dlZd dlmZ d dlmZmZ d dlmZ d dl	m
Z
 ddlmZ ddlmZ G d	d
 d
eZdd Zd$ddZd%ddZdd Zdd Zdd Zdd ZddgZe
deZddd dddd dd!d"d#ZdS )&    N)check_random_state)ndtrndtri)rng_integers)make_dataclass   )ConfidenceInterval)_broadcast_concatenatec                   @   s   e Zd ZdZdddZdS )&BootstrapDegenerateDistributionWarningzt
    Warning generated by `bootstrap` when BCa method is used and
    the bootstrap distribution is degenerate.
    Nc                 C   s   |d u rd}|f| _ d S )NzQThe bootstrap distribution is degenerate; the confidence interval is not defined.)args)selfmsg r   8/usr/lib/python3/dist-packages/scipy/stats/_bootstrap.py__init__   s   z/BootstrapDegenerateDistributionWarning.__init__N)__name__
__module____qualname____doc__r   r   r   r   r   r
      s    r
   c                    s   dd fdd
}|S )zVectorize an n-sample statisticr   axisc                    sN    fdd|D }t |d d t| }fdd}t | |d S )Nc                    s   g | ]}|j   qS r   )shape.0sampler   r   r   
<listcomp>   s    z9_vectorize_statistic.<locals>.stat_nd.<locals>.<listcomp>c                    s   t |  }| S r   )npsplit)zdata)split_indices	statisticr   r   stat_1d!   s   z6_vectorize_statistic.<locals>.stat_nd.<locals>.stat_1dr   )r   Zcumsumr	   Zapply_along_axis)r   r!   Zlengthsr    r$   r#   )r   r"   r   stat_nd   s
   
z%_vectorize_statistic.<locals>.stat_ndr   )r#   r&   r   r%   r   _vectorize_statistic   s   
r'   c           	      c   s    | j d }|p	|}td||D ]C}t||| }tj||ftd}t|dd||| f d t|}t|||f}|| 	||d f}| d|f }|V  qdS )z=Jackknife resample the sample. Only one-sample stats for now.r   r   ZdtypeNFr   .)
r   rangeminr   ZonesboolZfill_diagonalarangebroadcast_toZreshape)	r   batchnbatch_nominalkbatch_actualji	resamplesr   r   r   _jackknife_resample)   s   
 
r6   c                 C   s,   | j d }t|d|||f}| d|f }|S )zBootstrap resample the sample.r   r   .)r   r   )r   n_resamplesrandom_stater/   r4   r5   r   r   r   _bootstrap_resample=   s   
r9   c                 C   s   | j | }| |k j|d| S )zVectorized, simplified `scipy.stats.percentileofscore`.

    Unlike `stats.percentileofscore`, the percentile returned is a fraction
    in [0, 1].
    r   )r   sum)aZscorer   Br   r   r   _percentile_of_scoreH   s   
r=   c                 C   s   | j dd }t||}tj|tjd}t|D ]!\}}t|r/tt	  tj
||< q| | }t||||< q|d S )z9`np.percentile` with different percentile for each slice.Nr   r(   r   )r   r   r-   Z
zeros_likeZfloat64ZndenumerateZisnanwarningswarnr
   nan
percentile)theta_hat_balphar   ZpercentilesindicesZalpha_iZtheta_hat_b_ir   r   r   _percentile_along_axisR   s   
rE   c                 C   s   | d }|||dd }t ||dd}t|}	g }
t||D ]}|
||dd qtj|
dd}
|
jddd}||
 d jdd}d||
 d	 jddd
  }|| }t|}| }|	| }t|	|d||    }|	| }t|	|d||    }||fS )z(Bias-corrected and accelerated interval.r   r   ).Nr   T)r   Zkeepdims         g      ?r   )	r=   r   r6   appendr   concatenateZmeanr:   r   )r!   r#   r   rC   rB   r.   r   	theta_hatrA   Zz0_hatZtheta_hat_iZjackknife_sampleZtheta_hat_dotZnumZdenZa_hatZz_alphaZz_1alphaZnum1Zalpha_1Znum2Zalpha_2r   r   r   _bca_intervale   s&   rL   c
              
   C   s  |dvrt d|st|}t|}
||
krt dd}zt| }W n ty-   t dw |dkr6t dg }| D ]}t|}|j|
 dkrLt dt||
d	}|	| q:|dvrat d
|r|d jd	 }|dd D ]}|jd	 |krd}t |qpd	||fdd}t
|g}t|}t|}||ks|dkrt d|du r|}nt|}||ks|dkrt dh d}| }||vrt d| d}|s|dkr|dkrt |t|	}	|||||
|||||	f
S )z5Input validation and standardization for `bootstrap`.>   FTz'`vectorized` must be `True` or `False`.z`axis` must be an integer.r   z%`data` must be a sequence of samples.z(`data` must contain at least one sample.r   zIeach sample in `data` must contain two or more observations along `axis`.r   z#`paired` must be `True` or `False`.NzIWhen `paired is True`, all samples must have the same length along `axis`c                    s     fdd|D }||d|iS )Nc                    s   g | ]}|d  f qS ).r   r   r4   r   r   r      s    z4_bootstrap_iv.<locals>.statistic.<locals>.<listcomp>r   r   )r4   r   r!   Zunpaired_statisticr   rM   r   r#      s   z _bootstrap_iv.<locals>.statisticz)`n_resamples` must be a positive integer.z+`batch` must be a positive integer or None.>   bcabasicrA   z`method` must be in z;`method = 'BCa' is only available for one-sample statisticsrN   )
ValueErrorr'   intlen	TypeErrorr   Z
atleast_1dr   ZmoveaxisrI   r,   floatlowerr   )r!   r#   
vectorizedpairedr   confidence_levelr7   r.   methodr8   Zaxis_intZ	n_samplesZdata_ivr   r/   messageZconfidence_level_floatZn_resamples_intZbatch_ivmethodsr   r   r   _bootstrap_iv   sl   

r\   confidence_intervalstandard_errorBootstrapResultTFgffffff?i'  ZBCa)rV   rW   r   rX   r7   r.   rY   r8   c                C   sn  t | |||||||||	
}
|
dd \} }}}}|
dd \}}}}}	g }|p(|}td||D ]&}t||| }g }| D ]}t|||	d}|| q<|||ddi q/tj|dd}d| d	 }|d
krtt| |d|||d}t}n
|d| f}dd }|||d d }|||d d }|dkr|| ddi}d	| | d	| | }}t	t
||tj|ddddS )a%  
    Compute a two-sided bootstrap confidence interval of a statistic.

    When `method` is ``'percentile'``, a bootstrap confidence interval is
    computed according to the following procedure.

    1. Resample the data: for each sample in `data` and for each of
       `n_resamples`, take a random sample of the original sample
       (with replacement) of the same size as the original sample.

    2. Compute the bootstrap distribution of the statistic: for each set of
       resamples, compute the test statistic.

    3. Determine the confidence interval: find the interval of the bootstrap
       distribution that is

       - symmetric about the median and
       - contains `confidence_level` of the resampled statistic values.

    While the ``'percentile'`` method is the most intuitive, it is rarely
    used in practice. Two more common methods are available, ``'basic'``
    ('reverse percentile') and ``'BCa'`` ('bias-corrected and accelerated');
    they differ in how step 3 is performed.

    If the samples in `data` are  taken at random from their respective
    distributions :math:`n` times, the confidence interval returned by
    `bootstrap` will contain the true value of the statistic for those
    distributions approximately `confidence_level`:math:`\, \times \, n` times.

    Parameters
    ----------
    data : sequence of array-like
         Each element of data is a sample from an underlying distribution.
    statistic : callable
        Statistic for which the confidence interval is to be calculated.
        `statistic` must be a callable that accepts ``len(data)`` samples
        as separate arguments and returns the resulting statistic.
        If `vectorized` is set ``True``,
        `statistic` must also accept a keyword argument `axis` and be
        vectorized to compute the statistic along the provided `axis`.
    vectorized : bool, default: ``True``
        If `vectorized` is set ``False``, `statistic` will not be passed
        keyword argument `axis`, and is assumed to calculate the statistic
        only for 1D samples.
    paired : bool, default: ``False``
        Whether the statistic treats corresponding elements of the samples
        in `data` as paired.
    axis : int, default: ``0``
        The axis of the samples in `data` along which the `statistic` is
        calculated.
    confidence_level : float, default: ``0.95``
        The confidence level of the confidence interval.
    n_resamples : int, default: ``9999``
        The number of resamples performed to form the bootstrap distribution
        of the statistic.
    batch : int, optional
        The number of resamples to process in each vectorized call to
        `statistic`. Memory usage is O(`batch`*``n``), where ``n`` is the
        sample size. Default is ``None``, in which case ``batch = n_resamples``
        (or ``batch = max(n_resamples, n)`` for ``method='BCa'``).
    method : {'percentile', 'basic', 'bca'}, default: ``'BCa'``
        Whether to return the 'percentile' bootstrap confidence interval
        (``'percentile'``), the 'reverse' or the bias-corrected and accelerated
        bootstrap confidence interval (``'BCa'``).
        Note that only ``'percentile'`` and ``'basic'`` support multi-sample
        statistics at this time.
    random_state : {None, int, `numpy.random.Generator`,
                    `numpy.random.RandomState`}, optional

        Pseudorandom number generator state used to generate resamples.

        If `random_state` is ``None`` (or `np.random`), the
        `numpy.random.RandomState` singleton is used.
        If `random_state` is an int, a new ``RandomState`` instance is used,
        seeded with `random_state`.
        If `random_state` is already a ``Generator`` or ``RandomState``
        instance then that instance is used.

    Returns
    -------
    res : BootstrapResult
        An object with attributes:

        confidence_interval : ConfidenceInterval
            The bootstrap confidence interval as an instance of
            `collections.namedtuple` with attributes `low` and `high`.
        standard_error : float or ndarray
            The bootstrap standard error, that is, the sample standard
            deviation of the bootstrap distribution

    Notes
    -----
    Elements of the confidence interval may be NaN for ``method='BCa'`` if
    the bootstrap distribution is degenerate (e.g. all elements are identical).
    In this case, consider using another `method` or inspecting `data` for
    indications that other analysis may be more appropriate (e.g. all
    observations are identical).

    References
    ----------
    .. [1] B. Efron and R. J. Tibshirani, An Introduction to the Bootstrap,
       Chapman & Hall/CRC, Boca Raton, FL, USA (1993)
    .. [2] Nathaniel E. Helwig, "Bootstrap Confidence Intervals",
       http://users.stat.umn.edu/~helwig/notes/bootci-Notes.pdf
    .. [3] Bootstrapping (statistics), Wikipedia,
       https://en.wikipedia.org/wiki/Bootstrapping_%28statistics%29

    Examples
    --------
    Suppose we have sampled data from an unknown distribution.

    >>> import numpy as np
    >>> rng = np.random.default_rng()
    >>> from scipy.stats import norm
    >>> dist = norm(loc=2, scale=4)  # our "unknown" distribution
    >>> data = dist.rvs(size=100, random_state=rng)

    We are interested int the standard deviation of the distribution.

    >>> std_true = dist.std()      # the true value of the statistic
    >>> print(std_true)
    4.0
    >>> std_sample = np.std(data)  # the sample statistic
    >>> print(std_sample)
    3.9460644295563863

    We can calculate a 90% confidence interval of the statistic using
    `bootstrap`.

    >>> from scipy.stats import bootstrap
    >>> data = (data,)  # samples must be in a sequence
    >>> res = bootstrap(data, np.std, confidence_level=0.9,
    ...                 random_state=rng)
    >>> print(res.confidence_interval)
    ConfidenceInterval(low=3.57655333533867, high=4.382043696342881)

    If we sample from the distribution 1000 times and form a bootstrap
    confidence interval for each sample, the confidence interval
    contains the true value of the statistic approximately 900 times.

    >>> n_trials = 1000
    >>> ci_contains_true_std = 0
    >>> for i in range(n_trials):
    ...    data = (dist.rvs(size=100, random_state=rng),)
    ...    ci = bootstrap(data, np.std, confidence_level=0.9, n_resamples=1000,
    ...                   random_state=rng).confidence_interval
    ...    if ci[0] < std_true < ci[1]:
    ...        ci_contains_true_std += 1
    >>> print(ci_contains_true_std)
    875

    Rather than writing a loop, we can also determine the confidence intervals
    for all 1000 samples at once.

    >>> data = (dist.rvs(size=(n_trials, 100), random_state=rng),)
    >>> res = bootstrap(data, np.std, axis=-1, confidence_level=0.9,
    ...                 n_resamples=1000, random_state=rng)
    >>> ci_l, ci_u = res.confidence_interval

    Here, `ci_l` and `ci_u` contain the confidence interval for each of the
    ``n_trials = 1000`` samples.

    >>> print(ci_l[995:])
    [3.77729695 3.75090233 3.45829131 3.34078217 3.48072829]
    >>> print(ci_u[995:])
    [4.88316666 4.86924034 4.32032996 4.2822427  4.59360598]

    And again, approximately 90% contain the true value, ``std_true = 4``.

    >>> print(np.sum((ci_l < std_true) & (std_true < ci_u)))
    900

    `bootstrap` can also be used to estimate confidence intervals of
    multi-sample statistics, including those calculated by hypothesis
    tests. `scipy.stats.mood` perform's Mood's test for equal scale parameters,
    and it returns two outputs: a statistic, and a p-value. To get a
    confidence interval for the test statistic, we first wrap
    `scipy.stats.mood` in a function that accepts two sample arguments,
    accepts an `axis` keyword argument, and returns only the statistic.

    >>> from scipy.stats import mood
    >>> def my_statistic(sample1, sample2, axis):
    ...     statistic, _ = mood(sample1, sample2, axis=-1)
    ...     return statistic

    Here, we use the 'percentile' method with the default 95% confidence level.

    >>> sample1 = norm.rvs(scale=1, size=100, random_state=rng)
    >>> sample2 = norm.rvs(scale=2, size=100, random_state=rng)
    >>> data = (sample1, sample2)
    >>> res = bootstrap(data, my_statistic, method='basic', random_state=rng)
    >>> print(mood(sample1, sample2)[0])  # element 0 is the statistic
    -5.521109549096542
    >>> print(res.confidence_interval)
    ConfidenceInterval(low=-7.255994487314675, high=-4.016202624747605)

    The bootstrap estimate of the standard error is also available.

    >>> print(res.standard_error)
    0.8344963846318795

    Paired-sample statistics work, too. For example, consider the Pearson
    correlation coefficient.

    >>> from scipy.stats import pearsonr
    >>> n = 100
    >>> x = np.linspace(0, 10, n)
    >>> y = x + rng.uniform(size=n)
    >>> print(pearsonr(x, y)[0])  # element 0 is the statistic
    0.9962357936065914

    We wrap `pearsonr` so that it returns only the statistic.

    >>> def my_statistic(x, y):
    ...     return pearsonr(x, y)[0]

    We call `bootstrap` using ``paired=True``.
    Also, since ``my_statistic`` isn't vectorized to calculate the statistic
    along a given axis, we pass in ``vectorized=False``.

    >>> res = bootstrap((x, y), my_statistic, vectorized=False, paired=True,
    ...                 random_state=rng)
    >>> print(res.confidence_interval)
    ConfidenceInterval(low=0.9950085825848624, high=0.9971212407917498)

    N   r   )r7   r8   r   r   r   r   rH   rN   )r   rC   rB   r.   c                 S   s   t j| |ddS )Nr   )r;   qr   )r   rA   )r;   ra   r   r   r   percentile_fun  s   z!bootstrap.<locals>.percentile_fund   rO   )Zddofr   )r]   r^   )r\   r)   r*   r9   rI   r   rJ   rL   rE   r_   r   Zstd)r!   r#   rV   rW   r   rX   r7   r.   rY   r8   r   rB   r0   r1   r2   Zresampled_datar   ZresamplerC   intervalrb   Zci_lZci_urK   r   r   r   	bootstrap   sF    g

re   r   )NN)r>   Znumpyr   Zscipy._lib._utilr   Zscipy.specialr   r   r   Zdataclassesr   Z_commonr   Z_axis_nan_policyr	   RuntimeWarningr
   r'   r6   r9   r=   rE   rL   r\   Zfieldsr_   re   r   r   r   r   <module>   s,    


P
