Source code for icclim._core.generic.threshold.percentile

"""Percentile based threshold module."""

from __future__ import annotations

from typing import TYPE_CHECKING, Any, Callable

import xarray as xr
from xarray import DataArray, Dataset
from xclim.core.bootstrapping import percentile_bootstrap
from xclim.core.calendar import build_climatology_bounds, percentile_doy, resample_doy
from xclim.core.units import convert_units_to
from xclim.core.utils import calc_perc

from icclim._core.constants import (
    DEFAULT_DOY_WINDOW,
    DOY_COORDINATE,
    DOY_PERCENTILE_UNIT,
    PERIOD_PERCENTILE_UNIT,
    UNITS_KEY,
)
from icclim._core.generic.threshold.threshold_templates import (
    EN_THRESHOLD_TEMPLATE,
    PercentileTemplateConfig,
    ThresholdMetadata,
)
from icclim._core.input_parsing import (
    PercentileDataArray,
    build_reference_da,
    get_dataarray_from_dataset,
    is_dataset_path,
    read_clim_bounds,
    standardize_percentile_dim_name,
)
from icclim._core.model.quantile_interpolation import (
    QuantileInterpolation,
    QuantileInterpolationRegistry,
)

if TYPE_CHECKING:
    from collections.abc import Sequence
    from datetime import datetime

    import jinja2
    import pint

    from icclim._core.model.operator import Operator
    from icclim.frequency import Frequency

from typing import TYPE_CHECKING

from icclim._core.model.threshold import Threshold, ThresholdValueType


[docs] class PercentileThreshold(Threshold): """ Percentile based threshold (e.g. "<= 10 doy_per"). The percentiles to be computed are expected to be either: * "doy percentiles" (unit: "doy_per"). They are usually used for temperatures indices such as the ECAD :ref:`tx90p <ecad_functions_api>`. These percentiles are computed per day of year (doy) and by aggregating values on the time axis ranged by ``reference_period``, using the ``doy_window_width`` parameter to control the time axis window of aggregation. The resulting `value` is a DataArray with a "dayofyear" dimension ranging from 0 to 365 with one value per day of the year. * "period percentiles" (unit: "period_per"). They are usually used for liquide precipitation indices such as the ECAD :ref:`r75p <ecad_functions_api>` or even :ref:`r75ptot <ecad_functions_api>`. These percentiles are computed per grid cell on the period ranged by ``reference_period``. The resulting ``value`` is a DataArray with per grid cell values and no time axis. ``is_ready`` becomes True when `prepare` method has been called, the actual percentiles are then computed and accessible in ``value`` property. Once ``is_ready`` is True, ``unit`` property can be set and will attempt a pint unit conversion similar to what is done on ``BasicThreshold``. Before that, setting unit has no effect. """ reference_period: Sequence[str] doy_window_width: int only_leap_years: bool interpolation: QuantileInterpolation initial_value: list[float] | None is_doy_per_threshold: bool _prepared_value: PercentileDataArray _initial_unit: str | None @property
[docs] def unit(self) -> str | None: """The unit of the threshold.""" if self.is_ready: return self._prepared_value.attrs[UNITS_KEY] return self._initial_unit
@unit.setter def unit(self, unit: str | xr.DataArray | pint.Quantity | pint.Unit) -> None: if self.is_ready: if self.value.attrs.get(UNITS_KEY, None) is not None and unit is not None: self._prepared_value = convert_units_to( self._prepared_value, unit, context="hydro", ) self.value.attrs[UNITS_KEY] = unit @property
[docs] def value(self) -> PercentileDataArray: """ The computed percentile threshold. Raises ------ RuntimeError If the threshold is not ready (prepare has not been called). """ if self.is_ready: return self._prepared_value msg = ( "Property `value` is not ready. For PercentileDataArray," " you must call `.prepare` first and fill `studied_data`" " parameter in order to prepare `value`." ) raise RuntimeError(msg)
def __init__( self, operator: str | Operator, value: DataArray | float | Sequence[float], unit: str | None = None, doy_window_width: int = DEFAULT_DOY_WINDOW, only_leap_years: bool = False, interpolation: QuantileInterpolation | str = QuantileInterpolationRegistry.MEDIAN_UNBIASED, reference_period: Sequence[datetime | str] | None = None, threshold_min_value: pint.Quantity | None = None, initial_query: str | None = None, threshold_var_name: str | None = None, **kwargs, # noqa: ARG002 ) -> None: if is_dataset_path(value) or isinstance(value, Dataset): value, is_doy_per_threshold = _build_per_thresh_from_dataset( value=value, unit=unit, threshold_var_name=threshold_var_name, reference_period=reference_period, ) else: is_doy_per_threshold = unit == DOY_PERCENTILE_UNIT if isinstance(value, DataArray): self.prepare = None self._prepared_value = PercentileDataArray.from_da(value) self.is_ready = True self._initial_unit = None self.initial_value = None self.unit = self._prepared_value.attrs[UNITS_KEY] else: self.is_ready = False self._initial_unit = unit if not isinstance(value, list): value = [value] self.initial_value = [float(x) for x in value] self.unit = unit self.operator = operator self.threshold_var_name = threshold_var_name self.initial_query = initial_query self.threshold_min_value = threshold_min_value self.reference_period = reference_period self.doy_window_width = doy_window_width self.only_leap_years = only_leap_years self.interpolation = QuantileInterpolationRegistry.lookup(interpolation) self.unit = unit self.is_doy_per_threshold = is_doy_per_threshold
[docs] def prepare(self, studied_data: DataArray) -> None: """ Prepare the data for calculating percentiles. Parameters ---------- studied_data : DataArray The input data to be prepared. Raises ------ NotImplementedError If the percentile unit is unknown. Returns ------- None """ if self._initial_unit == DOY_PERCENTILE_UNIT: prepared_data = _build_doy_per( studied_data=studied_data, per_val=self.initial_value, reference_period=self.reference_period, interpolation=self.interpolation, only_leap_years=self.only_leap_years, doy_window_width=self.doy_window_width, percentile_min_value=self.threshold_min_value, ) elif self._initial_unit == PERIOD_PERCENTILE_UNIT: prepared_data = _build_period_per( studied_data=studied_data, per_val=self.initial_value, reference_period=self.reference_period, interpolation=self.interpolation, only_leap_years=self.only_leap_years, percentile_min_value=self.threshold_min_value, ) else: msg = f"Unknown percentile unit '{self._initial_unit}'." raise NotImplementedError(msg) self._prepared_value = prepared_data.chunk("auto") self.is_ready = True
def __eq__(self, other: object) -> bool: """ Compare two PercentileThreshold objects for equality. Parameters ---------- other : object The object to compare with. Returns ------- bool True if the objects are equal, False otherwise. """ return ( isinstance(other, PercentileThreshold) and self.initial_query == other.initial_query and self.doy_window_width == other.doy_window_width and self.only_leap_years == other.only_leap_years and self.interpolation == other.interpolation and self.reference_period == other.reference_period and self.unit == other.unit and self.threshold_min_value == other.threshold_min_value ) # noinspection PyMethodOverriding # (reason: with * and **kwargs we can have a different signature while still # being liskov proof)
[docs] def format_metadata( self, *, jinja_scope: dict[str, Any], jinja_env: jinja2.Environment, src_freq: Frequency, must_run_bootstrap: bool = False, **kwargs, # noqa: ARG002 ) -> ThresholdMetadata: """ Generate the metadata for the threshold. These metadata are used to fill the generic template. Parameters ---------- jinja_scope : dict The jinja scope, it contains the variables to be used in the jinja template. jinja_env : jinja2.Environment The jinja environment, it contains the jinja rendering engine. src_freq : Frequency The frequency of the source data. must_run_bootstrap : bool, optional Whether to run bootstrap, by default False. Returns ------- ThresholdMetadata The metadata for the threshold. """ per_coord = self.value.coords["percentiles"] templates = self._get_metadata_templates(per_coord) climatology_bounds: list[str] = self.value.attrs.get("climatology_bounds") conf: PercentileTemplateConfig = { "climatology_bounds": climatology_bounds, "doy_window_width": self.doy_window_width, "src_freq": src_freq, "operator": self.operator, "unit": self.unit, "per_coord": per_coord.values, "threshold_min_value": self.threshold_min_value, "must_run_bootstrap": must_run_bootstrap, } conf.update(jinja_scope) return { k: jinja_env.from_string(v, globals=conf).render() for k, v in templates.items() }
[docs] def compute( self, comparison_data: xr.DataArray, override_op: Callable[[DataArray, DataArray], DataArray] | None = None, **kwargs, ) -> DataArray: """ Compute the percentile threshold. Parameters ---------- comparison_data : xr.DataArray The data array to compare with the threshold. override_op : Callable[[DataArray, DataArray], DataArray] | None, optional An optional override operator to use instead of the default operator. **kwargs Additional keyword arguments. The `freq` parameter is used to specify the frequency of the data. The `bootstrap` parameter is used to specify whether to run bootstrap. Returns ------- DataArray The computed percentile threshold. Raises ------ RuntimeError If the PercentileThreshold is not ready. You must first call `.prepare` with a `studied_data` parameter in order to prepare the threshold for computation. """ op = override_op if override_op is not None else self.operator if self.is_ready: return self._per_compute( comparison_data, self.value, op, self.is_doy_per_threshold, kwargs.get("freq", None), kwargs.get("bootstrap", False), ) msg = ( "This PercentileThreshold is not ready. You must first call `.prepare`" " with a `studied_data` parameter in order to prepare the threshold" " for computation." ) raise RuntimeError(msg)
def _get_metadata_templates(self, per_coord: DataArray) -> ThresholdMetadata: if self.is_doy_per_threshold: if per_coord.size == 1: return EN_THRESHOLD_TEMPLATE["single_doy_percentile"] return EN_THRESHOLD_TEMPLATE["multiple_doy_percentiles"] if per_coord.size == 1: return EN_THRESHOLD_TEMPLATE["single_period_percentile"] return EN_THRESHOLD_TEMPLATE["multiple_period_percentiles"] @percentile_bootstrap def _per_compute( self, comparison_data: xr.DataArray, per: xr.DataArray, op: Callable[[DataArray, DataArray], DataArray], is_doy_per_threshold: bool, freq: str, # noqa: ARG002 used by @percentile_bootstrap bootstrap: bool, # noqa: ARG002 used by @percentile_bootstrap ) -> DataArray: if self.threshold_min_value is not None: # there is only a threshold_min_value when we are computing > or >= thresh = self.threshold_min_value thresh = convert_units_to(thresh, per, context="hydro") per = per.where(per > thresh, thresh) if is_doy_per_threshold: threshold_value = resample_doy(per, comparison_data) else: threshold_value = per return op(comparison_data, threshold_value)
def _compute_per( per_val: float, alpha: float, beta: float, study: DataArray ) -> PercentileDataArray: computed_per = xr.apply_ufunc( calc_perc, study, input_core_dims=[["time"]], output_core_dims=[["percentiles"]], keep_attrs=True, kwargs={ "percentiles": per_val, "alpha": alpha, "beta": beta, "copy": True, }, dask="parallelized", output_dtypes=[study.dtype], dask_gufunc_kwargs={"output_sizes": {"percentiles": 1}, "allow_rechunk": True}, ) computed_per = computed_per.assign_coords( percentiles=xr.DataArray(per_val, dims=("percentiles",)), ) return PercentileDataArray.from_da( source=computed_per, climatology_bounds=build_climatology_bounds(study), ) def _build_period_per( studied_data: DataArray, per_val: Sequence[float], reference_period: Sequence[str], interpolation: QuantileInterpolation, only_leap_years: bool, percentile_min_value: pint.Quantity | None, ) -> PercentileDataArray: reference = build_reference_da( studied_data, reference_period, only_leap_years, percentile_min_value=percentile_min_value, ) return _compute_per(per_val, interpolation.alpha, interpolation.beta, reference) def _build_doy_per( studied_data: DataArray, per_val: Sequence[float], reference_period: Sequence[str], interpolation: QuantileInterpolation, only_leap_years: bool, doy_window_width: int, percentile_min_value: pint.Quantity | None, ) -> PercentileDataArray: reference = build_reference_da( studied_data, reference_period, only_leap_years, percentile_min_value, ) return percentile_doy( arr=reference, window=doy_window_width, per=per_val, alpha=interpolation.alpha, beta=interpolation.beta, ).compute() # "optimization" (diminish dask scheduler workload) def _build_per_thresh_from_dataset( value: ThresholdValueType, unit: str | None, threshold_var_name: str, reference_period: Sequence[datetime | str], ) -> tuple[DataArray, bool]: thresh_da = get_dataarray_from_dataset(threshold_var_name, value) built_value = PercentileDataArray.from_da( standardize_percentile_dim_name(thresh_da), read_clim_bounds(reference_period, thresh_da), ) if unit is not None: if built_value.attrs.get(UNITS_KEY, None) is not None: built_value = convert_units_to(built_value, unit, context="hydro") built_value.attrs[UNITS_KEY] = unit return built_value, DOY_COORDINATE in built_value.coords