diff --git a/changelog/622.breaking.md b/changelog/622.breaking.md new file mode 100644 index 000000000..ba5c6dcc3 --- /dev/null +++ b/changelog/622.breaking.md @@ -0,0 +1,28 @@ +Renamed the "ignore datasets" configuration to "grey list" +and decoupled fetching from configuration loading. + +The `default_ignore_datasets.yaml` file at the repository root is now `config/default_grey_list.yaml`, +the `Config.ignore_datasets_file` field is now `Config.grey_list_file`, +and the `REF_IGNORE_DATASETS_FILE` environment variable is now `REF_GREY_LIST_FILE`. + +A new `Config.grey_list_url` field (also overridable via `REF_GREY_LIST_URL`) controls where the grey list is fetched from. +Set it to an empty value to disable fetching, +which is useful for offline or air-gapped HPC environments. +The grey list location is now independent of the fetch lifecycle, +so users can pin it to a writable location (e.g. a Kubernetes volume mount) +and still have the solver refresh it. + +`Config.default()` no longer performs network I/O as a side effect. +A missing grey list file is treated as an empty list rather than raising an error, +so disabling fetching does not require pre-seeding the file. + +Loading a configuration that still uses the deprecated `ignore_datasets_file` key, +or running with the `REF_IGNORE_DATASETS_FILE` environment variable set, +now raises a hard error with migration instructions instead of silently falling back to defaults. + +Grey list refresh is now fail-safe: +a network failure with no cached file raises `GreyListRefreshError` +rather than creating an empty placeholder that would silently disable grey list protections for the next 6 hours. +If a cached file exists it is reused. + +See the new "Grey list" section in `docs/configuration.md` for full details. diff --git a/default_ignore_datasets.yaml b/config/default_grey_list.yaml similarity index 80% rename from default_ignore_datasets.yaml rename to config/default_grey_list.yaml index 441b131d5..396a007e1 100644 --- a/default_ignore_datasets.yaml +++ b/config/default_grey_list.yaml @@ -1,6 +1,6 @@ -# This file is used to specify datasets that should be ignored by default. +# Grey list: datasets that should be excluded from diagnostics by default. # -# It can be used to ignore datasets that are known to have issues and keep +# It can be used to skip datasets that are known to have issues and keep # diagnostics that use multiple datasets from running. # # The format used in this configuration file is: @@ -14,9 +14,10 @@ # instances that will be prepended to the constraints of the data requirements # of the diagnostics that the facets are listed under. # -# If no `ignore_datasets_file` is specified in the REF configuration, this file -# will be downloaded from GitHub and used. If the local copy of this file is -# older than 6 hours, it will be updated. +# `Config.grey_list_file` controls the location this file lives at and +# `Config.grey_list_url` controls where the solver refreshes it from +# (set the URL to None to disable fetching). The default cache is refreshed +# at most every `DEFAULT_GREY_LIST_MAX_AGE` (6 hours). # esmvaltool: sea-ice-sensitivity: diff --git a/docs/configuration.md b/docs/configuration.md index e2bed7a8f..7b0a3a56b 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -95,6 +95,71 @@ If this is set, then the sample data won't be updated. Path where the test output is stored. This is used to store the output of the tests that are run in the test suite for later inspection. +## Grey list + +The *grey list* is a YAML file that lists facets which should be excluded from specific diagnostics +— for example, datasets that are known to crash or produce invalid output. +The datasets in the grey list are filtered before solving for the relevant diagnostic. + +The file format is: + +```yaml +provider: + diagnostic: + source_type: + - facet: value + - other_facet: [other_value1, other_value2] +``` + +Two configuration values control how the grey list is loaded; +both can be set in your `ref.toml` or via environment variables. + +### `grey_list_file` / `REF_GREY_LIST_FILE` + +Path to the grey list file on disk. +Defaults to `grey_list.yaml` inside your `REF_CONFIGURATION` directory +(alongside `ref.toml`, the database, etc.). +This location must be writable by the user +as the grey list may be updated periodically. + +### `grey_list_url` / `REF_GREY_LIST_URL` + +URL the solver fetches the grey list from. +Defaults to `config/default_grey_list.yaml` on the `main` branch +of the `Climate-REF/climate-ref` GitHub repository. +Override this to point at a fork or internal mirror. + +The download is **lazy and explicit**: +it only runs once at the start of a solve (`ExecutionSolver.build_from_db`), +and only when the on-disk copy is missing or older than 6 hours. +Read-only commands like `ref providers list` or `ref datasets list` never touch the network. + +If a refresh fails and no cached file exists, +the solver raises `GreyListRefreshError` rather than running with an empty grey list. +If a cached file exists it is reused. + +### Offline / air-gapped use (HPC) + +To run completely offline — for example on an HPC compute node with no outbound network — +set the URL to an empty value: + +```bash +export REF_GREY_LIST_URL= +``` + +or in `ref.toml`: + +```toml +grey_list_url = "" +``` + +When fetching is disabled the solver simply uses whatever file is at `grey_list_file`. +A missing file is treated as an empty grey list, +so you do not have to seed the file by hand. +If you want to apply a specific grey list, +either copy `config/default_grey_list.yaml` from the repository to your `grey_list_file` location ahead of time, +or fetch it once before disabling the URL on the compute nodes. + ## Configuration Options diff --git a/packages/climate-ref-core/src/climate_ref_core/providers.py b/packages/climate-ref-core/src/climate_ref_core/providers.py index dd62e70f4..281794d34 100644 --- a/packages/climate-ref-core/src/climate_ref_core/providers.py +++ b/packages/climate-ref-core/src/climate_ref_core/providers.py @@ -79,29 +79,33 @@ def configure(self, config: Any) -> None: config : A configuration. """ - logger.debug( - f"Configuring provider {self.slug} using ignore_datasets_file {config.ignore_datasets_file}" - ) - # The format of the configuration file is: + logger.debug(f"Configuring provider {self.slug} using grey_list_file {config.grey_list_file}") + # The format of the grey list file is: # provider: # diagnostic: # source_type: # - facet: value # - other_facet: [other_value1, other_value2] - ignore_datasets_all = yaml.safe_load(config.ignore_datasets_file.read_text(encoding="utf-8")) or {} - ignore_datasets = ignore_datasets_all.get(self.slug, {}) - if unknown_slugs := {slug for slug in ignore_datasets} - {d.slug for d in self.diagnostics()}: + # A missing file is treated as an empty grey list + # so offline/air-gapped users that disable fetching with `grey_list_url=""` + # can run without having to seed the file themselves. + if not config.grey_list_file.exists(): + grey_list_all: dict[str, Any] = {} + else: + grey_list_all = yaml.safe_load(config.grey_list_file.read_text(encoding="utf-8")) or {} + grey_list = grey_list_all.get(self.slug, {}) + if unknown_slugs := {slug for slug in grey_list} - {d.slug for d in self.diagnostics()}: logger.warning( - f"Unknown diagnostics found in {config.ignore_datasets_file} " + f"Unknown diagnostics found in {config.grey_list_file} " f"for provider {self.slug}: {', '.join(sorted(unknown_slugs))}" ) known_source_types = {s.value for s in iter(SourceDatasetType)} for diagnostic in self.diagnostics(): - if diagnostic.slug in ignore_datasets: - if unknown_source_types := set(ignore_datasets[diagnostic.slug]) - known_source_types: + if diagnostic.slug in grey_list: + if unknown_source_types := set(grey_list[diagnostic.slug]) - known_source_types: logger.warning( - f"Unknown source types found in {config.ignore_datasets_file} for " + f"Unknown source types found in {config.grey_list_file} for " f"diagnostic '{diagnostic.slug}' by provider {self.slug}: " f"{', '.join(sorted(unknown_source_types))}" ) @@ -114,7 +118,7 @@ def configure(self, config: Any) -> None: data_requirement, constraints=tuple( IgnoreFacets(facets) - for facets in ignore_datasets[diagnostic.slug].get( + for facets in grey_list[diagnostic.slug].get( data_requirement.source_type.value, [] ) ) diff --git a/packages/climate-ref-core/tests/unit/test_providers.py b/packages/climate-ref-core/tests/unit/test_providers.py index e7e57b9c7..bef179578 100644 --- a/packages/climate-ref-core/tests/unit/test_providers.py +++ b/packages/climate-ref-core/tests/unit/test_providers.py @@ -26,8 +26,8 @@ def mock_config(tmp_path, mocker): """Use a mock config to avoid depending on `climate_ref.config.Config`.""" config = mocker.Mock() config.paths.software = tmp_path / "software" - config.ignore_datasets_file = tmp_path / "ignore_datasets.yaml" - config.ignore_datasets_file.touch() + config.grey_list_file = tmp_path / "grey_list.yaml" + config.grey_list_file.touch() return config @@ -70,7 +70,7 @@ def test_provider_fixture(self, provider): assert isinstance(result, Diagnostic) def test_configure(self, provider, mock_config): - mock_config.ignore_datasets_file.write_text( + mock_config.grey_list_file.write_text( textwrap.dedent( """ mock_provider: @@ -85,8 +85,15 @@ def test_configure(self, provider, mock_config): expected_constraint = IgnoreFacets(facets={"source_id": ("A",)}) assert provider.diagnostics()[0].data_requirements[0][0].constraints[0] == expected_constraint + def test_configure_missing_grey_list_file(self, provider, mock_config): + # Offline/air-gapped users may run with grey_list_url="" and no file + # seeded yet; missing file should be treated as an empty grey list, + # not raise FileNotFoundError. + mock_config.grey_list_file.unlink() + provider.configure(mock_config) + def test_configure_unknown_diagnostic(self, provider, mock_config, caplog): - mock_config.ignore_datasets_file.write_text( + mock_config.grey_list_file.write_text( textwrap.dedent( """ mock_provider: @@ -100,13 +107,13 @@ def test_configure_unknown_diagnostic(self, provider, mock_config, caplog): with caplog.at_level(logging.WARNING): provider.configure(mock_config) expected_msg = ( - f"Unknown diagnostics found in {mock_config.ignore_datasets_file} " + f"Unknown diagnostics found in {mock_config.grey_list_file} " "for provider mock_provider: invalid_diagnostic" ) assert expected_msg in caplog.text def test_configure_unknown_source_type(self, provider, mock_config, caplog): - mock_config.ignore_datasets_file.write_text( + mock_config.grey_list_file.write_text( textwrap.dedent( """ mock_provider: @@ -120,7 +127,7 @@ def test_configure_unknown_source_type(self, provider, mock_config, caplog): with caplog.at_level(logging.WARNING): provider.configure(mock_config) expected_msg = ( - f"Unknown source types found in {mock_config.ignore_datasets_file} " + f"Unknown source types found in {mock_config.grey_list_file} " "for diagnostic 'mock' by provider mock_provider: invalid_source_type" ) assert expected_msg in caplog.text diff --git a/packages/climate-ref-pmp/tests/unit/test_provider.py b/packages/climate-ref-pmp/tests/unit/test_provider.py index bc692c23f..c9bcbf8d6 100644 --- a/packages/climate-ref-pmp/tests/unit/test_provider.py +++ b/packages/climate-ref-pmp/tests/unit/test_provider.py @@ -90,8 +90,8 @@ def test_configure_sets_env_vars(self, mocker, tmp_path): test_provider = PMPDiagnosticProvider("PMP-Test", "1.0") mock_config = mocker.Mock() mock_config.paths.software = tmp_path / "software" - mock_config.ignore_datasets_file = tmp_path / "ignore.yaml" - mock_config.ignore_datasets_file.touch() + mock_config.grey_list_file = tmp_path / "ignore.yaml" + mock_config.grey_list_file.touch() mocker.patch.object(test_provider, "get_conda_exe", return_value=Path("/path/to/conda")) diff --git a/packages/climate-ref/src/climate_ref/config.py b/packages/climate-ref/src/climate_ref/config.py index 4ce5cc641..7788f51b4 100644 --- a/packages/climate-ref/src/climate_ref/config.py +++ b/packages/climate-ref/src/climate_ref/config.py @@ -20,7 +20,6 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, Literal -import platformdirs import requests import tomlkit from attr import Factory @@ -328,6 +327,30 @@ def default_providers() -> list[DiagnosticProviderConfig]: ] +# TODO(1.0.0): drop the deprecated ignore_datasets_file / REF_IGNORE_DATASETS_FILE +# migration shim below (constants, _check_deprecated_grey_list_config, and the +# associated tests). +_DEPRECATED_IGNORE_DATASETS_KEY = "ignore_datasets_file" +_DEPRECATED_IGNORE_DATASETS_ENV = f"{env_prefix}_IGNORE_DATASETS_FILE" +_GREY_LIST_MIGRATION_HINT = ( + "`ignore_datasets_file` / `REF_IGNORE_DATASETS_FILE` has been renamed to " + "`grey_list_file` / `REF_GREY_LIST_FILE`. Update your configuration accordingly. " + "Set `REF_GREY_LIST_URL=` (empty) to disable automatic refresh if you previously " + "managed the file manually." +) + + +def _check_deprecated_grey_list_config(config_file: str | Path, doc: dict[str, Any]) -> None: + """Hard-fail when deprecated grey list configuration is detected.""" + problems: list[str] = [] + if _DEPRECATED_IGNORE_DATASETS_KEY in doc: + problems.append(f"`{_DEPRECATED_IGNORE_DATASETS_KEY}` found in configuration file {config_file}") + if os.environ.get(_DEPRECATED_IGNORE_DATASETS_ENV) is not None: + problems.append(f"`{_DEPRECATED_IGNORE_DATASETS_ENV}` environment variable is set") + if problems: + raise ValueError("; ".join(problems) + ". " + _GREY_LIST_MIGRATION_HINT) + + def _load_config(config_file: str | Path, doc: dict[str, Any]) -> "Config": # Try loading the configuration with strict validation try: @@ -342,44 +365,85 @@ def _load_config(config_file: str | Path, doc: dict[str, Any]) -> "Config": return _converter_defaults_relaxed.structure(doc, Config) -DEFAULT_IGNORE_DATASETS_MAX_AGE = datetime.timedelta(hours=6) -DEFAULT_IGNORE_DATASETS_URL = ( - "https://raw.githubusercontent.com/Climate-REF/climate-ref/refs/heads/main/default_ignore_datasets.yaml" +class GreyListRefreshError(RuntimeError): + """Raised when the grey list cannot be refreshed and no cached copy is available.""" + + +DEFAULT_GREY_LIST_MAX_AGE = datetime.timedelta(hours=6) +DEFAULT_GREY_LIST_URL = ( + "https://raw.githubusercontent.com/Climate-REF/climate-ref/refs/heads/main/config/default_grey_list.yaml" ) -def _get_default_ignore_datasets_file() -> Path: +def _default_grey_list_path() -> Path: + """Return the default location of the grey list file (no I/O, no network). + + Lives under `REF_CONFIGURATION` (alongside `ref.toml`, the database, etc.) + so it inherits the same writable directory the user already has set up, + rather than the user cache directory + which is often read-only on container/HPC deployments. """ - Get the path to the ignore datasets file + return env.path("REF_CONFIGURATION").resolve() / "grey_list.yaml" + + +def refresh_grey_list_file( + path: Path, + url: str, + *, + max_age: datetime.timedelta = DEFAULT_GREY_LIST_MAX_AGE, +) -> None: """ - cache_dir = platformdirs.user_cache_path("climate_ref") - cache_dir.mkdir(parents=True, exist_ok=True) - ignore_datasets_file = cache_dir / "default_ignore_datasets.yaml" + Download the grey list file to ``path`` from ``url`` if it is missing or stale. + + If the file already exists and was modified within ``max_age``, + no network request is made. + + On network failure: + if a cached file already exists at ``path`` it is left untouched + (and used as the last-known-good copy). + If no cached file exists a `GreyListRefreshError` is raised + so the solver fails loudly rather than silently running without grey list protections. - download = True - if ignore_datasets_file.exists(): - # Only update if the ignore datasets file is older than `DEFAULT_IGNORE_DATASETS_MAX_AGE`. - modification_time = datetime.datetime.fromtimestamp(ignore_datasets_file.stat().st_mtime) + Parameters + ---------- + path + Destination path for the grey list file. + Parent directories are created as needed + so that user-supplied paths (e.g. a writable k8s volume) work. + url + URL to fetch the grey list from. + max_age + Maximum age of an existing file before it is considered stale. + + Raises + ------ + GreyListRefreshError + If the download fails and no cached file exists at ``path``. + """ + path.parent.mkdir(parents=True, exist_ok=True) + + if path.exists(): + modification_time = datetime.datetime.fromtimestamp(path.stat().st_mtime) age = datetime.datetime.now() - modification_time - if age < DEFAULT_IGNORE_DATASETS_MAX_AGE: - download = False - - if download: - logger.info( - f"Downloading default ignore datasets file from {DEFAULT_IGNORE_DATASETS_URL} " - f"to {ignore_datasets_file}" - ) - try: - response = requests.get(DEFAULT_IGNORE_DATASETS_URL, timeout=120) - response.raise_for_status() - except requests.RequestException as exc: - logger.warning(f"Failed to download default ignore datasets file: {exc}") - ignore_datasets_file.touch(exist_ok=True) - else: - with ignore_datasets_file.open(mode="wb") as file: - file.write(response.content) + if age < max_age: + return - return ignore_datasets_file + logger.info(f"Downloading grey list from {url} to {path}") + try: + response = requests.get(url, timeout=120) + response.raise_for_status() + except requests.RequestException as exc: + if path.exists(): + logger.warning(f"Failed to refresh grey list from {url}; using cached copy at {path}: {exc}") + return + raise GreyListRefreshError( + f"Failed to download grey list from {url} and no cached copy exists at {path}. " + "Set `REF_GREY_LIST_URL=` (empty) to disable automatic refresh and use a " + f"manually managed file. Original error: {exc}" + ) from exc + else: + with path.open(mode="wb") as file: + file.write(response.content) @define(auto_attribs=True) @@ -424,11 +488,15 @@ class Config: - `complete`: Use the complete parser, which parses the dataset based on all available metadata. """ - ignore_datasets_file: Path = field(factory=_get_default_ignore_datasets_file) + grey_list_file: Path = env_field( # noqa: RUF009 + "GREY_LIST_FILE", + factory=_default_grey_list_path, + converter=Path, + ) """ - Path to the file containing the ignore datasets + Path to the grey list file. - This file is a YAML file that contains a list of facets to ignore per diagnostic. + The grey list is a YAML file that lists facets to exclude per diagnostic. The format is: ```yaml @@ -439,9 +507,18 @@ class Config: - another_facet: [another_value1, another_value2] ``` - If this is not specified, a default ignore datasets file will be used. - The default file is downloaded from the Climate-REF GitHub repository - if it does not exist or is older than 6 hours. + Defaults to a path under the configuration directory. + This location must be writable + as the solver may refresh the file from `grey_list_url` if it is missing or stale. + """ + + grey_list_url: str = env_field("GREY_LIST_URL", default=DEFAULT_GREY_LIST_URL) + """ + URL to fetch the grey list from at solve time. + + Set to an empty string (e.g. `REF_GREY_LIST_URL=`) to disable fetching entirely; + in that case the solver will use whatever file already exists at `grey_list_file`. + Override the URL to point at a fork or mirror. """ paths: PathConfig = Factory(PathConfig) @@ -478,6 +555,8 @@ def load(cls, config_file: Path, allow_missing: bool = True) -> "Config": doc = TOMLDocument() raw = None + _check_deprecated_grey_list_config(config_file, doc) + try: config = _load_config(config_file, doc) except Exception as exc: diff --git a/packages/climate-ref/src/climate_ref/conftest_plugin.py b/packages/climate-ref/src/climate_ref/conftest_plugin.py index 2e97677e8..1d94ad144 100644 --- a/packages/climate-ref/src/climate_ref/conftest_plugin.py +++ b/packages/climate-ref/src/climate_ref/conftest_plugin.py @@ -251,12 +251,13 @@ def data_catalog( @pytest.fixture(scope="session") def solve_config() -> Config: - """Session-scoped Config that uses the local default_ignore_datasets.yaml""" + """Session-scoped Config that uses the in-tree grey list and never fetches.""" cfg = Config.default() - local_ignore_file = Path(__file__).parents[4] / "default_ignore_datasets.yaml" - if not local_ignore_file.is_file(): - raise ValueError(f"Could not find ignore file at {local_ignore_file}") - cfg.ignore_datasets_file = local_ignore_file + local_grey_list = Path(__file__).parents[4] / "config" / "default_grey_list.yaml" + if not local_grey_list.is_file(): + raise ValueError(f"Could not find grey list file at {local_grey_list}") + cfg.grey_list_file = local_grey_list + cfg.grey_list_url = "" return cfg @@ -274,6 +275,10 @@ def config(tmp_path: Path, monkeypatch: pytest.MonkeyPatch, request: pytest.Fixt cfg.paths.software = software_path cfg.diagnostic_providers = [DiagnosticProviderConfig(provider="climate_ref_example")] cfg.executor.executor = "climate_ref.executor.SynchronousExecutor" + # Isolate tests from the user's grey list cache and disable fetching. + cfg.grey_list_file = tmp_path / "grey_list.yaml" + cfg.grey_list_file.touch() + cfg.grey_list_url = "" cfg.save() return cfg diff --git a/packages/climate-ref/src/climate_ref/solver.py b/packages/climate-ref/src/climate_ref/solver.py index 74f4eeb53..c3e832974 100644 --- a/packages/climate-ref/src/climate_ref/solver.py +++ b/packages/climate-ref/src/climate_ref/solver.py @@ -13,7 +13,7 @@ from attrs import define, frozen from loguru import logger -from climate_ref.config import Config +from climate_ref.config import Config, refresh_grey_list_file from climate_ref.data_catalog import DataCatalog from climate_ref.database import Database from climate_ref.datasets import ( @@ -470,6 +470,9 @@ def build_from_db(config: Config, db: Database) -> "ExecutionSolver": : A new ExecutionSolver instance """ + if config.grey_list_url: + refresh_grey_list_file(config.grey_list_file, config.grey_list_url) + return ExecutionSolver( provider_registry=ProviderRegistry.build_from_config(config, db), data_catalog={ diff --git a/packages/climate-ref/tests/unit/test_config.py b/packages/climate-ref/tests/unit/test_config.py index aeb7abf4b..aec872723 100644 --- a/packages/climate-ref/tests/unit/test_config.py +++ b/packages/climate-ref/tests/unit/test_config.py @@ -4,7 +4,6 @@ from datetime import timedelta from pathlib import Path -import platformdirs import pytest import requests from attr import evolve @@ -12,10 +11,12 @@ import climate_ref.config from climate_ref.config import ( + DEFAULT_GREY_LIST_URL, DEFAULT_LOG_FORMAT, Config, + GreyListRefreshError, PathConfig, - _get_default_ignore_datasets_file, + refresh_grey_list_file, transform_error, ) from climate_ref_core.exceptions import InvalidExecutorException @@ -153,9 +154,8 @@ def test_defaults(self, monkeypatch, mocker): without_defaults = cfg.dump(defaults=False) assert without_defaults == { - "ignore_datasets_file": str( - platformdirs.user_cache_path("climate_ref") / "default_ignore_datasets.yaml" - ), + "grey_list_file": f"{default_path}/grey_list.yaml", + "grey_list_url": DEFAULT_GREY_LIST_URL, "log_level": "INFO", "log_format": DEFAULT_LOG_FORMAT, "cmip6_parser": "complete", @@ -165,9 +165,8 @@ def test_defaults(self, monkeypatch, mocker): ], } assert with_defaults == { - "ignore_datasets_file": str( - platformdirs.user_cache_path("climate_ref") / "default_ignore_datasets.yaml" - ), + "grey_list_file": f"{default_path}/grey_list.yaml", + "grey_list_url": DEFAULT_GREY_LIST_URL, "log_level": "INFO", "log_format": DEFAULT_LOG_FORMAT, "cmip6_parser": "complete", @@ -200,6 +199,8 @@ def test_from_env_variables(self, monkeypatch, config): monkeypatch.setenv("REF_LOG_ROOT", "/my/test/logs") monkeypatch.setenv("REF_RESULTS_ROOT", "/my/test/executions") monkeypatch.setenv("REF_CMIP6_PARSER", "drs") + monkeypatch.setenv("REF_GREY_LIST_FILE", "/my/test/grey_list.yaml") + monkeypatch.setenv("REF_GREY_LIST_URL", "") config_new = config.refresh() @@ -209,6 +210,11 @@ def test_from_env_variables(self, monkeypatch, config): assert config_new.paths.log == Path("/my/test/logs") assert config_new.paths.results == Path("/my/test/executions") assert config_new.cmip6_parser == "drs" + # Env overrides for grey_list_file must be coerced to Path so callers + # can use .read_text(), .exists(), etc. + assert config_new.grey_list_file == Path("/my/test/grey_list.yaml") + assert isinstance(config_new.grey_list_file, Path) + assert config_new.grey_list_url == "" def test_custom_env_variable(self, monkeypatch, tmp_path, config): monkeypatch.setenv("ABC", "/my") @@ -270,51 +276,92 @@ def test_transform_error(): @pytest.mark.parametrize("status", ["fresh", "stale", "missing"]) -def test_get_default_ignore_datasets_file(mocker, tmp_path, status): - mocker.patch.object(climate_ref.config.platformdirs, "user_cache_path", return_value=tmp_path) +def test_refresh_grey_list_file(mocker, tmp_path, status): mocker.patch.object( climate_ref.config.requests, "get", return_value=mocker.MagicMock(status_code=200, content=b"downloaded"), ) - expected_path = tmp_path / "default_ignore_datasets.yaml" + target = tmp_path / "nested" / "grey_list.yaml" if status != "missing": - expected_path.write_text("existing", encoding="utf-8") - if status == "stale": - mocker.patch.object(climate_ref.config, "DEFAULT_IGNORE_DATASETS_MAX_AGE", timedelta(seconds=-1)) + target.parent.mkdir(parents=True, exist_ok=True) + target.write_text("existing", encoding="utf-8") + max_age = timedelta(seconds=-1) if status == "stale" else timedelta(hours=6) - path = climate_ref.config._get_default_ignore_datasets_file() + refresh_grey_list_file(target, "https://example.invalid/grey_list.yaml", max_age=max_age) - assert path == tmp_path / "default_ignore_datasets.yaml" + assert target.parent.exists() if status == "fresh": - assert path.read_text(encoding="utf-8") == "existing" + assert target.read_text(encoding="utf-8") == "existing" else: - assert path.read_text(encoding="utf-8") == "downloaded" + assert target.read_text(encoding="utf-8") == "downloaded" -def test_get_default_ignore_datasets_file_fail(mocker, tmp_path): - mocker.patch.object(climate_ref.config.platformdirs, "user_cache_path", return_value=tmp_path) +def test_refresh_grey_list_file_fail_no_cache(mocker, tmp_path): + """A download failure with no cached copy must raise rather than create an empty placeholder.""" result = mocker.MagicMock(status_code=404, content=b"{}") result.raise_for_status.side_effect = requests.RequestException mocker.patch.object(climate_ref.config.requests, "get", return_value=result) - path = _get_default_ignore_datasets_file() - assert path == tmp_path / "default_ignore_datasets.yaml" - assert path.parent.exists() - assert path.read_text(encoding="utf-8") == "" + target = tmp_path / "grey_list.yaml" + with pytest.raises(GreyListRefreshError, match="no cached copy"): + refresh_grey_list_file(target, "https://example.invalid/grey_list.yaml") + + assert not target.exists() -def test_get_default_ignore_datasets_file_network_error(mocker, tmp_path): - """Test that network errors during requests.get() are handled gracefully.""" - mocker.patch.object(climate_ref.config.platformdirs, "user_cache_path", return_value=tmp_path) - # Simulate network error (e.g., no network access in offline/HPC environment) +def test_refresh_grey_list_file_network_error_no_cache(mocker, tmp_path): + """Network errors with no cached copy must raise (fail-safe).""" mocker.patch.object( climate_ref.config.requests, "get", side_effect=requests.exceptions.ConnectionError("Network unreachable"), ) - path = _get_default_ignore_datasets_file() - assert path == tmp_path / "default_ignore_datasets.yaml" - assert path.parent.exists() - assert path.read_text(encoding="utf-8") == "" + target = tmp_path / "grey_list.yaml" + with pytest.raises(GreyListRefreshError): + refresh_grey_list_file(target, "https://example.invalid/grey_list.yaml") + + assert not target.exists() + + +def test_refresh_grey_list_file_fail_uses_stale_cache(mocker, tmp_path): + """A download failure must preserve and reuse an existing cached copy.""" + result = mocker.MagicMock(status_code=500, content=b"{}") + result.raise_for_status.side_effect = requests.RequestException + mocker.patch.object(climate_ref.config.requests, "get", return_value=result) + + target = tmp_path / "grey_list.yaml" + target.write_text("cached", encoding="utf-8") + + refresh_grey_list_file(target, "https://example.invalid/grey_list.yaml", max_age=timedelta(seconds=-1)) + + assert target.read_text(encoding="utf-8") == "cached" + + +def test_config_default_does_not_fetch(mocker): + """`Config.default()` must not perform any network I/O.""" + get_mock = mocker.patch.object(climate_ref.config.requests, "get") + + Config.default() + + get_mock.assert_not_called() + + +def test_load_rejects_deprecated_ignore_datasets_file(tmp_path, monkeypatch): + """Old `ignore_datasets_file` toml key must hard-fail with a migration hint.""" + monkeypatch.delenv("REF_IGNORE_DATASETS_FILE", raising=False) + config_file = tmp_path / "ref.toml" + config_file.write_text('ignore_datasets_file = "old.yaml"\n', encoding="utf-8") + + with pytest.raises(ValueError, match="ignore_datasets_file"): + Config.load(config_file) + + +def test_load_rejects_deprecated_ignore_datasets_env(tmp_path, monkeypatch): + """`REF_IGNORE_DATASETS_FILE` env var must hard-fail with a migration hint.""" + monkeypatch.setenv("REF_IGNORE_DATASETS_FILE", str(tmp_path / "old.yaml")) + config_file = tmp_path / "ref.toml" + + with pytest.raises(ValueError, match="REF_IGNORE_DATASETS_FILE"): + Config.load(config_file) diff --git a/packages/climate-ref/tests/unit/test_provider_registry.py b/packages/climate-ref/tests/unit/test_provider_registry.py index a48db081d..9589de397 100644 --- a/packages/climate-ref/tests/unit/test_provider_registry.py +++ b/packages/climate-ref/tests/unit/test_provider_registry.py @@ -17,6 +17,17 @@ def test_create(self, config, mocker): assert mock_import.call_count == 1 mock_register.assert_called_once_with(db, mock_import.return_value) + def test_build_from_config_does_not_fetch_grey_list(self, config, mocker): + """Read-only commands (providers/datasets list) must not hit the network.""" + db = mocker.MagicMock() + mocker.patch("climate_ref.provider_registry.import_provider") + mocker.patch("climate_ref.provider_registry._register_provider") + get_mock = mocker.patch("climate_ref.config.requests.get") + + ProviderRegistry.build_from_config(config, db) + + get_mock.assert_not_called() + def test_get_provider_found(self, mocker): """Test get() returns provider when found.""" mock_provider = mocker.MagicMock() diff --git a/packages/climate-ref/tests/unit/test_solver.py b/packages/climate-ref/tests/unit/test_solver.py index 96c2709a0..39c5621f8 100644 --- a/packages/climate-ref/tests/unit/test_solver.py +++ b/packages/climate-ref/tests/unit/test_solver.py @@ -83,6 +83,22 @@ def test_solver_build_from_db(self, solver): assert isinstance(solver.data_catalog[SourceDatasetType.CMIP6], DataCatalog) assert len(solver.data_catalog[SourceDatasetType.CMIP6].to_frame()) + def test_build_from_db_refreshes_grey_list(self, db_seeded, config, mocker): + refresh = mocker.patch("climate_ref.solver.refresh_grey_list_file") + config.grey_list_url = "https://example.invalid/grey_list.yaml" + + ExecutionSolver.build_from_db(config, db_seeded) + + refresh.assert_called_once_with(config.grey_list_file, config.grey_list_url) + + def test_build_from_db_skips_refresh_when_url_empty(self, db_seeded, config, mocker): + refresh = mocker.patch("climate_ref.solver.refresh_grey_list_file") + config.grey_list_url = "" + + ExecutionSolver.build_from_db(config, db_seeded) + + refresh.assert_not_called() + class TestExtractCoveredDatasetsWithDataCatalog: """Test that extract_covered_datasets triggers finalisation when given a DataCatalog."""