Source code for icclim._core.climate_variable

"""
Contain the ClimateVariable class and its related functions.

A climate variable is a structure that contains all the pre-processed input data
needed to compute a climate index.
A climate index may require one or more climate variables to be computed.
"""

from __future__ import annotations

from dataclasses import dataclass
from typing import TYPE_CHECKING, Any

import numpy as np
import xarray

from icclim._core.constants import REFERENCE_PERIOD_INDEX, UNITS_KEY
from icclim._core.generic.threshold.percentile import PercentileThreshold
from icclim._core.input_parsing import (
    DEFAULT_INPUT_FREQUENCY,
    build_reference_da,
    build_studied_data,
    guess_standard_variable,
    read_dataset,
)
from icclim.exception import InvalidIcclimArgumentError
from icclim.frequency import Frequency, FrequencyRegistry
from icclim.threshold.factory import build_threshold

if TYPE_CHECKING:
    from collections.abc import Sequence
    from datetime import datetime

    import jinja2
    from xarray.core.dataarray import DataArray

    from icclim._core.model.global_metadata import GlobalMetadata
    from icclim._core.model.icclim_types import InFileBaseType
    from icclim._core.model.in_file_dictionary import InFileDictionary
    from icclim._core.model.standard_index import StandardIndex
    from icclim._core.model.standard_variable import StandardVariable
    from icclim._core.model.threshold import Threshold


@dataclass
class ClimateVariable:
    """
    Represent a climate variable used to compute a climate index.

    It groups together the input variable (studied_data), its associated metadata
    (standard_var) if any, and the threshold it must be compared to.

    Attributes
    ----------
    name: str
        Name of the variable.
    standard_var: StandardVariable | None
        CF metadata bound to the standard variable used for this ClimateVariable,
        or None when no standard variable is associated.
    studied_data: DataArray
        The variable studied.
    global_metadata: GlobalMetadata
        Metadata carried over from the source dataset (the builders of this module
        fill it with the ``history``, ``source`` and ``time_encoding`` entries).
    source_frequency: Frequency
        Time frequency of the studied data.
    threshold: Threshold | None
        Threshold for this variable, if any.
    is_reference: bool
        Whether this variable is a reference variable (e.g. the base period subset
        used by indices such as anomaly). Defaults to False.
    """

    name: str
    standard_var: StandardVariable | None
    studied_data: DataArray
    global_metadata: GlobalMetadata
    source_frequency: Frequency
    threshold: Threshold | None = None
    is_reference: bool = False

    def build_indicator_metadata(
        self,
        src_freq: Frequency,
        must_run_bootstrap: bool,
        jinja_scope: dict[str, Any],
        jinja_env: jinja2.Environment,
    ) -> dict[str, str | dict]:
        """
        Build the metadata for the indicator that will be computed with this variable.

        Parameters
        ----------
        src_freq: Frequency
            The frequency of the source data.
        must_run_bootstrap: bool
            Whether the bootstrap method must be run.
        jinja_scope: dict
            The scope to use for jinja templating.
        jinja_env: jinja2.Environment
            The environment to use for jinja templating.

        Returns
        -------
        dict of str, str | dict
            The metadata for the indicator. It always has a ``"threshold"`` entry,
            which is an empty dict when the variable has no threshold.
        """
        # The "threshold" key is always present so that consumers can rely on it.
        metadata: dict[str, str | dict] = {"threshold": {}}
        if self.standard_var is None:
            # No CF metadata available: fall back on generic placeholders.
            metadata.update(
                {
                    "standard_name": "unknown_variable",
                    "long_name": "unknown variable",
                    "short_name": "input",
                },
            )
        else:
            metadata.update(self.standard_var.get_metadata())
        if self.threshold is not None:
            metadata.update(
                {
                    "threshold": self.threshold.format_metadata(
                        src_freq=src_freq,
                        must_run_bootstrap=must_run_bootstrap,
                        jinja_scope=jinja_scope,
                        jinja_env=jinja_env,
                    ),
                },
            )
        return metadata





[docs]
def build_climate_vars(
    climate_vars_dict: dict[str, InFileDictionary],
    ignore_Feb29th: bool,  # noqa: N803
    time_range: Sequence[datetime | str] | None,
    base_period: Sequence[str] | None,
    standard_index: StandardIndex | None,
    is_compared_to_reference: bool,
) -> list[ClimateVariable]:
    """
    Turn a dictionary of input files into the list of ClimateVariable of an index.

    Parameters
    ----------
    climate_vars_dict: dict of str, InFileDictionary
        The input files, keyed by variable name.
    ignore_Feb29th: bool
        Whether to ignore February 29th.
    time_range: Sequence of datetime | str | None
        The time range to consider.
    base_period: Sequence of str | None
        The base period, used to build the reference variable of indices such as
        anomaly.
    standard_index: StandardIndex | None
        The standard index to compute, or None for a generic index.
    is_compared_to_reference: bool
        Forwarded to the helpers deciding whether a reference variable must be
        appended to the result.

    Returns
    -------
    list of ClimateVariable
        One variable per entry of ``climate_vars_dict``, in the same order,
        followed by the reference variable when one is needed.

    Raises
    ------
    InvalidIcclimArgumentError
        If ``standard_index`` needs more input variables than were provided.
    """
    has_standard_index = standard_index is not None
    if has_standard_index:
        expected_count = len(standard_index.input_variables)
        if expected_count > len(climate_vars_dict):
            msg = (
                f"Index {standard_index.short_name} needs {expected_count} variables."
                " Please provide them with an xarray.Dataset, netCDF file(s) or a"
                " zarr store."
            )
            raise InvalidIcclimArgumentError(msg)

    # Inputs are matched positionally with the input variables of the index.
    climate_vars = [
        build_climate_var(
            var_name,
            var_data,
            ignore_Feb29th,
            time_range,
            standard_var=(
                standard_index.input_variables[position]
                if has_standard_index
                else None
            ),
        )
        for position, (var_name, var_data) in enumerate(climate_vars_dict.items())
    ]

    needs_reference = _standard_index_needs_ref(
        standard_index,
        is_compared_to_reference,
    ) or _generic_index_needs_ref(standard_index, is_compared_to_reference)
    if not needs_reference:
        return climate_vars

    # The reference variable is derived from the first input of the index.
    reference_standard_var = (
        standard_index.input_variables[0] if has_standard_index else None
    )
    climate_vars.append(
        _build_reference_variable(
            base_period,
            climate_vars_dict,
            standard_var=reference_standard_var,
        )
    )
    return climate_vars




[docs]
def build_climate_var(
    climate_var_name: str,
    climate_var_data: InFileDictionary | InFileBaseType,
    ignore_Feb29th: bool,  # noqa: N803
    time_range: Sequence[datetime | str] | None,
    standard_var: StandardVariable | None,
) -> ClimateVariable:
    """
    Build a single ClimateVariable from its raw input.

    Parameters
    ----------
    climate_var_name : str
        The name of the climate variable; it is also the name looked up in the
        dataset that is read.
    climate_var_data : InFileDictionary | InFileBaseType
        The input of the variable. When it is a dictionary, its ``"study"`` entry
        is the data to read and its optional ``"thresholds"`` entry is the
        threshold. Anything else is read directly as the study data.
    ignore_Feb29th : bool
        Flag forwarded to the construction of the studied data.
    time_range : Sequence[datetime | str] | None
        The time range forwarded to the construction of the studied data.
    standard_var : StandardVariable | None
        The standard variable of this input. If None, it is guessed from the
        data that was read.

    Returns
    -------
    ClimateVariable
        The built ClimateVariable object.

    Notes
    -----
    The studied data is built with the default units of the standard variable
    when there is one. When a threshold is provided, it is built from the
    unprocessed data and given the units of the studied data.

    The ``history`` and ``source`` attributes and the time encoding of the
    dataset are kept as global metadata. The source frequency is inferred from
    the time axis of the studied data, with a default fallback.

    Examples
    --------
    >>> climate_var_name = "tas"
    >>> climate_var_data = {"study": "/path/to/data.nc", "thresholds": ">= 27 degC"}
    >>> ignore_Feb29th = False
    >>> time_range = ["2000-01-01", "2010-12-31"]
    >>> standard_var = StandardVariableRegistry.TAS
    >>> climate_var = build_climate_var(
    ...     climate_var_name, climate_var_data, ignore_Feb29th, time_range, standard_var
    ... )
    """
    # Separate the data to read from the optional threshold definition.
    if isinstance(climate_var_data, dict):
        study_source = climate_var_data["study"]
        raw_threshold = climate_var_data.get("thresholds", None)
    else:
        study_source = climate_var_data
        raw_threshold = None
    dataset = read_dataset(study_source, standard_var, climate_var_name)

    if standard_var is None:
        standard_var = guess_standard_variable(dataset[climate_var_name])

    studied_data = build_studied_data(
        dataset[climate_var_name],
        time_range,
        ignore_Feb29th,
        standard_var.default_units if standard_var else None,
    )

    if raw_threshold is None:
        threshold = None
    else:
        threshold = _build_threshold(
            climate_var_thresh=raw_threshold,
            original_data=dataset[climate_var_name],
            conversion_unit=studied_data.attrs[UNITS_KEY],
        )

    global_metadata = {
        "history": dataset.attrs.get("history", None),
        "source": dataset.attrs.get("source", None),
        "time_encoding": dataset.time.encoding,
    }
    inferred_frequency = xarray.infer_freq(studied_data.time)
    source_frequency = FrequencyRegistry.lookup(
        inferred_frequency or DEFAULT_INPUT_FREQUENCY,
    )
    return ClimateVariable(
        name=climate_var_name,
        standard_var=standard_var,
        studied_data=studied_data,
        threshold=threshold,
        global_metadata=global_metadata,
        source_frequency=source_frequency,
    )




[docs]
def must_run_bootstrap(da: DataArray, threshold: Threshold | None) -> bool:
    """
    Tell whether the bootstrap method has to be run.

    Parameters
    ----------
    da : DataArray
        The studied data.
    threshold : Threshold | None
        The threshold that contains the reference period.

    Returns
    -------
    bool
        Whether to run the bootstrap method.

    Notes
    -----
    Bootstrapping only applies to day-of-year percentile thresholds. Even then it
    is skipped unless the years shared by the studied data `da` and the reference
    period number more than one and fewer than all the studied years.
    """
    # TODO @bzah: Don't run bootstrap when not on extreme percentile
    #       (run only below 20? 10? and above 80? 90?)
    # https://github.com/cerfacs-globc/icclim/issues/289
    if threshold is None:
        return False
    if not isinstance(threshold, PercentileThreshold):
        return False
    if not threshold.is_doy_per_threshold:
        return False

    reference = threshold.value
    study_year_count = len(np.unique(da.indexes.get("time").year))
    # Restrict the studied data to the reference period to count shared years.
    data_in_reference = da.sel(time=_get_ref_period_slice(reference))
    overlapping_year_count = len(np.unique(data_in_reference.indexes.get("time").year))
    return 1 < overlapping_year_count < study_year_count



def _standard_index_needs_ref(
    standard_index: StandardIndex | None, is_compared_to_reference: bool
) -> bool:
    """
    Tell whether a standard index requires a reference variable.

    Parameters
    ----------
    standard_index : StandardIndex | None
        The standard index to compute, if any.
    is_compared_to_reference : bool
        Whether the index is compared to a reference.

    Returns
    -------
    bool
        True only when ``standard_index`` is set, has qualifiers that include
        ``REFERENCE_PERIOD_INDEX``, and ``is_compared_to_reference`` is truthy.
    """
    if not standard_index or not standard_index.qualifiers:
        return False
    # bool() so the annotated return type holds: a bare `and` chain returns its
    # last evaluated operand (possibly None or an empty container), not a boolean.
    return bool(
        REFERENCE_PERIOD_INDEX in standard_index.qualifiers
        and is_compared_to_reference
    )


def _generic_index_needs_ref(
    standard_index: StandardIndex, is_compared_to_reference: bool
) -> bool:
    """
    Tell whether a generic index requires a reference variable.

    An index is generic when there is no standard index (``standard_index`` is
    None); it then needs a reference exactly when ``is_compared_to_reference``
    says so.
    """
    if standard_index is not None:
        return False
    return is_compared_to_reference



[docs]
def _build_reference_variable(
    reference_period: Sequence[str] | None,
    in_files: dict[str, InFileDictionary],
    standard_var: StandardVariable | None,
) -> ClimateVariable:
    """
    Add a secondary variable for indices such as anomaly.

    This kind of indices require exactly two variables, but the second variable can
    just be a subset of the first one.

    Parameters
    ----------
    reference_period : Sequence[str] | None
        The period used to build the reference data. It is mandatory here.
    in_files : dict of str, InFileDictionary
        The input files, keyed by variable name. Only the first entry is used.
    standard_var : StandardVariable | None
        The standard variable attached to the reference variable, if any.

    Returns
    -------
    ClimateVariable
        A variable named ``<first variable name>_reference``, flagged with
        ``is_reference=True`` and without threshold.

    Raises
    ------
    InvalidIcclimArgumentError
        If ``reference_period`` is None.
    """
    if reference_period is None:
        msg = "Can't build a reference variable without a `base_period_time_range`"
        raise InvalidIcclimArgumentError(msg)
    var_name, var_input = next(iter(in_files.items()))
    # The check must be done on the entry itself, not on the enclosing mapping:
    # an entry is either a dictionary holding the data under "study" or the raw
    # input, exactly as handled by `build_climate_var`.
    study_source = var_input["study"] if isinstance(var_input, dict) else var_input
    study_ds = read_dataset(
        study_source,
        standard_var=standard_var,
        var_name=var_name,
    )
    studied_data = build_reference_da(
        study_ds[var_name],
        reference_period,
        only_leap_years=False,
        percentile_min_value=None,
    )
    return ClimateVariable(
        name=var_name + "_reference",
        standard_var=standard_var,
        studied_data=studied_data,
        threshold=None,
        global_metadata={
            "history": study_ds.attrs.get("history", None),
            "source": study_ds.attrs.get("source", None),
            "time_encoding": study_ds.time.encoding,
        },
        source_frequency=FrequencyRegistry.lookup(
            xarray.infer_freq(studied_data.time) or DEFAULT_INPUT_FREQUENCY,
        ),
        is_reference=True,
    )



def _build_threshold(
    climate_var_thresh: str | Threshold,
    original_data: DataArray,
    conversion_unit: str,
) -> Threshold:
    """
    Return a threshold prepared on the given data, with its unit set.

    A string is first turned into a threshold with `build_threshold`; any other
    value is used as is. The threshold is prepared with ``original_data`` only if
    it has a ``prepare`` callable and is not ready yet. Its ``unit`` is then set
    to ``conversion_unit``.
    """
    threshold = (
        build_threshold(climate_var_thresh)
        if isinstance(climate_var_thresh, str)
        else climate_var_thresh
    )
    needs_preparation = threshold.prepare is not None and not threshold.is_ready
    if needs_preparation:
        threshold.prepare(original_data)
    threshold.unit = conversion_unit
    return threshold


def _get_ref_period_slice(da: DataArray) -> slice:
    """
    Build the slice delimiting the reference period of ``da``.

    Parameters
    ----------
    da : DataArray
        The reference data.

    Returns
    -------
    slice
        A slice built from the ``climatology_bounds`` attribute when it is set.
        Otherwise a slice going from the first to the last timestamp of
        ``da.time``, both formatted as ``%Y-%m-%d``.
    """
    if (bds := da.attrs.get("climatology_bounds", None)) is not None:
        return slice(*bds)
    # Select the first and last timestamps by position. A strided selection with
    # a step of `len(da.time) - 1` would have a zero step, and thus fail, when
    # there is a single timestamp.
    first_and_last = da.time[[0, -1]]
    return slice(*first_and_last.dt.strftime("%Y-%m-%d").to_numpy())