From ecaf2420b371f93a14a2c933da6fc990e9a4a78d Mon Sep 17 00:00:00 2001 From: Jared Lewis Date: Sun, 12 Apr 2026 20:46:32 +1000 Subject: [PATCH 1/7] refactor(config)!: rename ignore datasets to grey list and decouple fetching MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The grey list is now described by two independent fields: - `Config.grey_list_file` controls where the file lives (override e.g. for k8s pods where the user cache is read-only). - `Config.grey_list_url` controls where it is fetched from. Set to "" to disable fetching for offline/air-gapped environments. `Config.default()` no longer performs network I/O as a side effect; the download moves to an explicit `refresh_grey_list_file` call inside `ProviderRegistry.build_from_config`. This also fixes #590 — solver regression tests now read the in-tree `default_grey_list.yaml` instead of silently picking up the cached main-branch copy. BREAKING CHANGE: `default_ignore_datasets.yaml`, `Config.ignore_datasets_file`, and `REF_IGNORE_DATASETS_*` env vars are renamed to their `grey_list` equivalents. --- changelog/590.breaking.md | 1 + ...re_datasets.yaml => default_grey_list.yaml | 11 ++- .../src/climate_ref_core/providers.py | 22 ++--- .../tests/unit/test_providers.py | 14 +-- .../tests/unit/test_provider.py | 4 +- .../climate-ref/src/climate_ref/config.py | 96 ++++++++++++------- .../src/climate_ref/conftest_plugin.py | 15 ++- .../src/climate_ref/provider_registry.py | 5 +- .../climate-ref/tests/unit/test_config.py | 68 +++++++------ 9 files changed, 137 insertions(+), 99 deletions(-) create mode 100644 changelog/590.breaking.md rename default_ignore_datasets.yaml => default_grey_list.yaml (80%) diff --git a/changelog/590.breaking.md b/changelog/590.breaking.md new file mode 100644 index 000000000..689671594 --- /dev/null +++ b/changelog/590.breaking.md @@ -0,0 +1 @@ +Renamed the "ignore datasets" configuration to "grey list" and decoupled fetching from configuration loading. The `default_ignore_datasets.yaml` file is now `default_grey_list.yaml`, the `Config.ignore_datasets_file` field is now `Config.grey_list_file`, and the `REF_IGNORE_DATASETS_*` environment variables are now `REF_GREY_LIST_*`. A new `Config.grey_list_url` field controls where the grey list is fetched from at solve time; set it (or the `REF_GREY_LIST_URL` environment variable) to an empty value to disable fetching, which is useful for offline or air-gapped environments. The grey list location is now independent of the fetch lifecycle so users can pin it to a writable location (e.g. a Kubernetes volume mount) and still have the solver refresh it. `Config.default()` no longer performs network I/O as a side effect; the download now happens explicitly during solve setup. diff --git a/default_ignore_datasets.yaml b/default_grey_list.yaml similarity index 80% rename from default_ignore_datasets.yaml rename to default_grey_list.yaml index 441b131d5..396a007e1 100644 --- a/default_ignore_datasets.yaml +++ b/default_grey_list.yaml @@ -1,6 +1,6 @@ -# This file is used to specify datasets that should be ignored by default. +# Grey list: datasets that should be excluded from diagnostics by default. # -# It can be used to ignore datasets that are known to have issues and keep +# It can be used to skip datasets that are known to have issues and keep # diagnostics that use multiple datasets from running. # # The format used in this configuration file is: @@ -14,9 +14,10 @@ # instances that will be prepended to the constraints of the data requirements # of the diagnostics that the facets are listed under. # -# If no `ignore_datasets_file` is specified in the REF configuration, this file -# will be downloaded from GitHub and used. If the local copy of this file is -# older than 6 hours, it will be updated. +# `Config.grey_list_file` controls the location this file lives at and +# `Config.grey_list_url` controls where the solver refreshes it from +# (set the URL to None to disable fetching). The default cache is refreshed +# at most every `DEFAULT_GREY_LIST_MAX_AGE` (6 hours). # esmvaltool: sea-ice-sensitivity: diff --git a/packages/climate-ref-core/src/climate_ref_core/providers.py b/packages/climate-ref-core/src/climate_ref_core/providers.py index dd62e70f4..7e991c1cc 100644 --- a/packages/climate-ref-core/src/climate_ref_core/providers.py +++ b/packages/climate-ref-core/src/climate_ref_core/providers.py @@ -79,29 +79,27 @@ def configure(self, config: Any) -> None: config : A configuration. """ - logger.debug( - f"Configuring provider {self.slug} using ignore_datasets_file {config.ignore_datasets_file}" - ) - # The format of the configuration file is: + logger.debug(f"Configuring provider {self.slug} using grey_list_file {config.grey_list_file}") + # The format of the grey list file is: # provider: # diagnostic: # source_type: # - facet: value # - other_facet: [other_value1, other_value2] - ignore_datasets_all = yaml.safe_load(config.ignore_datasets_file.read_text(encoding="utf-8")) or {} - ignore_datasets = ignore_datasets_all.get(self.slug, {}) - if unknown_slugs := {slug for slug in ignore_datasets} - {d.slug for d in self.diagnostics()}: + grey_list_all = yaml.safe_load(config.grey_list_file.read_text(encoding="utf-8")) or {} + grey_list = grey_list_all.get(self.slug, {}) + if unknown_slugs := {slug for slug in grey_list} - {d.slug for d in self.diagnostics()}: logger.warning( - f"Unknown diagnostics found in {config.ignore_datasets_file} " + f"Unknown diagnostics found in {config.grey_list_file} " f"for provider {self.slug}: {', '.join(sorted(unknown_slugs))}" ) known_source_types = {s.value for s in iter(SourceDatasetType)} for diagnostic in self.diagnostics(): - if diagnostic.slug in ignore_datasets: - if unknown_source_types := set(ignore_datasets[diagnostic.slug]) - known_source_types: + if diagnostic.slug in grey_list: + if unknown_source_types := set(grey_list[diagnostic.slug]) - known_source_types: logger.warning( - f"Unknown source types found in {config.ignore_datasets_file} for " + f"Unknown source types found in {config.grey_list_file} for " f"diagnostic '{diagnostic.slug}' by provider {self.slug}: " f"{', '.join(sorted(unknown_source_types))}" ) @@ -114,7 +112,7 @@ def configure(self, config: Any) -> None: data_requirement, constraints=tuple( IgnoreFacets(facets) - for facets in ignore_datasets[diagnostic.slug].get( + for facets in grey_list[diagnostic.slug].get( data_requirement.source_type.value, [] ) ) diff --git a/packages/climate-ref-core/tests/unit/test_providers.py b/packages/climate-ref-core/tests/unit/test_providers.py index e7e57b9c7..7873ec0b5 100644 --- a/packages/climate-ref-core/tests/unit/test_providers.py +++ b/packages/climate-ref-core/tests/unit/test_providers.py @@ -26,8 +26,8 @@ def mock_config(tmp_path, mocker): """Use a mock config to avoid depending on `climate_ref.config.Config`.""" config = mocker.Mock() config.paths.software = tmp_path / "software" - config.ignore_datasets_file = tmp_path / "ignore_datasets.yaml" - config.ignore_datasets_file.touch() + config.grey_list_file = tmp_path / "grey_list.yaml" + config.grey_list_file.touch() return config @@ -70,7 +70,7 @@ def test_provider_fixture(self, provider): assert isinstance(result, Diagnostic) def test_configure(self, provider, mock_config): - mock_config.ignore_datasets_file.write_text( + mock_config.grey_list_file.write_text( textwrap.dedent( """ mock_provider: @@ -86,7 +86,7 @@ def test_configure(self, provider, mock_config): assert provider.diagnostics()[0].data_requirements[0][0].constraints[0] == expected_constraint def test_configure_unknown_diagnostic(self, provider, mock_config, caplog): - mock_config.ignore_datasets_file.write_text( + mock_config.grey_list_file.write_text( textwrap.dedent( """ mock_provider: @@ -100,13 +100,13 @@ def test_configure_unknown_diagnostic(self, provider, mock_config, caplog): with caplog.at_level(logging.WARNING): provider.configure(mock_config) expected_msg = ( - f"Unknown diagnostics found in {mock_config.ignore_datasets_file} " + f"Unknown diagnostics found in {mock_config.grey_list_file} " "for provider mock_provider: invalid_diagnostic" ) assert expected_msg in caplog.text def test_configure_unknown_source_type(self, provider, mock_config, caplog): - mock_config.ignore_datasets_file.write_text( + mock_config.grey_list_file.write_text( textwrap.dedent( """ mock_provider: @@ -120,7 +120,7 @@ def test_configure_unknown_source_type(self, provider, mock_config, caplog): with caplog.at_level(logging.WARNING): provider.configure(mock_config) expected_msg = ( - f"Unknown source types found in {mock_config.ignore_datasets_file} " + f"Unknown source types found in {mock_config.grey_list_file} " "for diagnostic 'mock' by provider mock_provider: invalid_source_type" ) assert expected_msg in caplog.text diff --git a/packages/climate-ref-pmp/tests/unit/test_provider.py b/packages/climate-ref-pmp/tests/unit/test_provider.py index bc692c23f..c9bcbf8d6 100644 --- a/packages/climate-ref-pmp/tests/unit/test_provider.py +++ b/packages/climate-ref-pmp/tests/unit/test_provider.py @@ -90,8 +90,8 @@ def test_configure_sets_env_vars(self, mocker, tmp_path): test_provider = PMPDiagnosticProvider("PMP-Test", "1.0") mock_config = mocker.Mock() mock_config.paths.software = tmp_path / "software" - mock_config.ignore_datasets_file = tmp_path / "ignore.yaml" - mock_config.ignore_datasets_file.touch() + mock_config.grey_list_file = tmp_path / "ignore.yaml" + mock_config.grey_list_file.touch() mocker.patch.object(test_provider, "get_conda_exe", return_value=Path("/path/to/conda")) diff --git a/packages/climate-ref/src/climate_ref/config.py b/packages/climate-ref/src/climate_ref/config.py index 4ce5cc641..4ca21d56b 100644 --- a/packages/climate-ref/src/climate_ref/config.py +++ b/packages/climate-ref/src/climate_ref/config.py @@ -342,44 +342,58 @@ def _load_config(config_file: str | Path, doc: dict[str, Any]) -> "Config": return _converter_defaults_relaxed.structure(doc, Config) -DEFAULT_IGNORE_DATASETS_MAX_AGE = datetime.timedelta(hours=6) -DEFAULT_IGNORE_DATASETS_URL = ( - "https://raw.githubusercontent.com/Climate-REF/climate-ref/refs/heads/main/default_ignore_datasets.yaml" +DEFAULT_GREY_LIST_MAX_AGE = datetime.timedelta(hours=6) +DEFAULT_GREY_LIST_URL = ( + "https://raw.githubusercontent.com/Climate-REF/climate-ref/refs/heads/main/default_grey_list.yaml" ) -def _get_default_ignore_datasets_file() -> Path: +def _default_grey_list_path() -> Path: + """Return the default location of the grey list file (no I/O, no network).""" + return platformdirs.user_cache_path("climate_ref") / "default_grey_list.yaml" + + +def refresh_grey_list_file( + path: Path, + url: str, + *, + max_age: datetime.timedelta = DEFAULT_GREY_LIST_MAX_AGE, +) -> None: """ - Get the path to the ignore datasets file + Download the grey list file to ``path`` from ``url`` if it is missing or stale. + + If the file already exists and was modified within ``max_age``, no network + request is made. On network failure an empty file is created so that + downstream YAML parsing does not blow up; an existing file is left untouched. + + Parameters + ---------- + path + Destination path for the grey list file. Parent directories are created + as needed so that user-supplied paths (e.g. a writable k8s volume) work. + url + URL to fetch the grey list from. + max_age + Maximum age of an existing file before it is considered stale. """ - cache_dir = platformdirs.user_cache_path("climate_ref") - cache_dir.mkdir(parents=True, exist_ok=True) - ignore_datasets_file = cache_dir / "default_ignore_datasets.yaml" + path.parent.mkdir(parents=True, exist_ok=True) - download = True - if ignore_datasets_file.exists(): - # Only update if the ignore datasets file is older than `DEFAULT_IGNORE_DATASETS_MAX_AGE`. - modification_time = datetime.datetime.fromtimestamp(ignore_datasets_file.stat().st_mtime) + if path.exists(): + modification_time = datetime.datetime.fromtimestamp(path.stat().st_mtime) age = datetime.datetime.now() - modification_time - if age < DEFAULT_IGNORE_DATASETS_MAX_AGE: - download = False - - if download: - logger.info( - f"Downloading default ignore datasets file from {DEFAULT_IGNORE_DATASETS_URL} " - f"to {ignore_datasets_file}" - ) - try: - response = requests.get(DEFAULT_IGNORE_DATASETS_URL, timeout=120) - response.raise_for_status() - except requests.RequestException as exc: - logger.warning(f"Failed to download default ignore datasets file: {exc}") - ignore_datasets_file.touch(exist_ok=True) - else: - with ignore_datasets_file.open(mode="wb") as file: - file.write(response.content) + if age < max_age: + return - return ignore_datasets_file + logger.info(f"Downloading grey list from {url} to {path}") + try: + response = requests.get(url, timeout=120) + response.raise_for_status() + except requests.RequestException as exc: + logger.warning(f"Failed to download grey list: {exc}") + path.touch(exist_ok=True) + else: + with path.open(mode="wb") as file: + file.write(response.content) @define(auto_attribs=True) @@ -424,11 +438,11 @@ class Config: - `complete`: Use the complete parser, which parses the dataset based on all available metadata. """ - ignore_datasets_file: Path = field(factory=_get_default_ignore_datasets_file) + grey_list_file: Path = env_field("GREY_LIST_FILE", factory=_default_grey_list_path) # noqa: RUF009 """ - Path to the file containing the ignore datasets + Path to the grey list file. - This file is a YAML file that contains a list of facets to ignore per diagnostic. + The grey list is a YAML file that lists facets to exclude per diagnostic. The format is: ```yaml @@ -439,9 +453,19 @@ class Config: - another_facet: [another_value1, another_value2] ``` - If this is not specified, a default ignore datasets file will be used. - The default file is downloaded from the Climate-REF GitHub repository - if it does not exist or is older than 6 hours. + Defaults to a path under the user cache directory. Override this to put the + file somewhere writable in environments where the user cache is read-only + (e.g. a Kubernetes volume mount). The location is decoupled from fetching: + the solver will refresh the file at this location from `grey_list_url`. + """ + + grey_list_url: str = env_field("GREY_LIST_URL", default=DEFAULT_GREY_LIST_URL) + """ + URL to fetch the grey list from at solve time. + + Set to an empty string (e.g. `REF_GREY_LIST_URL=`) to disable fetching + entirely; in that case the solver will use whatever file already exists at + `grey_list_file`. Override the URL to point at a fork or mirror. """ paths: PathConfig = Factory(PathConfig) diff --git a/packages/climate-ref/src/climate_ref/conftest_plugin.py b/packages/climate-ref/src/climate_ref/conftest_plugin.py index 2e97677e8..5400f1118 100644 --- a/packages/climate-ref/src/climate_ref/conftest_plugin.py +++ b/packages/climate-ref/src/climate_ref/conftest_plugin.py @@ -251,12 +251,13 @@ def data_catalog( @pytest.fixture(scope="session") def solve_config() -> Config: - """Session-scoped Config that uses the local default_ignore_datasets.yaml""" + """Session-scoped Config that uses the in-tree default_grey_list.yaml and never fetches.""" cfg = Config.default() - local_ignore_file = Path(__file__).parents[4] / "default_ignore_datasets.yaml" - if not local_ignore_file.is_file(): - raise ValueError(f"Could not find ignore file at {local_ignore_file}") - cfg.ignore_datasets_file = local_ignore_file + local_grey_list = Path(__file__).parents[4] / "default_grey_list.yaml" + if not local_grey_list.is_file(): + raise ValueError(f"Could not find grey list file at {local_grey_list}") + cfg.grey_list_file = local_grey_list + cfg.grey_list_url = "" return cfg @@ -274,6 +275,10 @@ def config(tmp_path: Path, monkeypatch: pytest.MonkeyPatch, request: pytest.Fixt cfg.paths.software = software_path cfg.diagnostic_providers = [DiagnosticProviderConfig(provider="climate_ref_example")] cfg.executor.executor = "climate_ref.executor.SynchronousExecutor" + # Isolate tests from the user's grey list cache and disable fetching. + cfg.grey_list_file = tmp_path / "grey_list.yaml" + cfg.grey_list_file.touch() + cfg.grey_list_url = "" cfg.save() return cfg diff --git a/packages/climate-ref/src/climate_ref/provider_registry.py b/packages/climate-ref/src/climate_ref/provider_registry.py index 9b98766a1..a8eb31f6f 100644 --- a/packages/climate-ref/src/climate_ref/provider_registry.py +++ b/packages/climate-ref/src/climate_ref/provider_registry.py @@ -12,7 +12,7 @@ from attrs import field, frozen from loguru import logger -from climate_ref.config import Config +from climate_ref.config import Config, refresh_grey_list_file from climate_ref.database import Database from climate_ref.models import Diagnostic as DiagnosticModel from climate_ref.models import Provider as ProviderModel @@ -134,6 +134,9 @@ def build_from_config(config: Config, db: Database) -> "ProviderRegistry": : A new ProviderRegistry instance """ + if config.grey_list_url: + refresh_grey_list_file(config.grey_list_file, config.grey_list_url) + providers = [] for provider_info in config.diagnostic_providers: provider = import_provider(provider_info.provider) diff --git a/packages/climate-ref/tests/unit/test_config.py b/packages/climate-ref/tests/unit/test_config.py index aeb7abf4b..ffe8b7a10 100644 --- a/packages/climate-ref/tests/unit/test_config.py +++ b/packages/climate-ref/tests/unit/test_config.py @@ -12,10 +12,11 @@ import climate_ref.config from climate_ref.config import ( + DEFAULT_GREY_LIST_URL, DEFAULT_LOG_FORMAT, Config, PathConfig, - _get_default_ignore_datasets_file, + refresh_grey_list_file, transform_error, ) from climate_ref_core.exceptions import InvalidExecutorException @@ -153,9 +154,8 @@ def test_defaults(self, monkeypatch, mocker): without_defaults = cfg.dump(defaults=False) assert without_defaults == { - "ignore_datasets_file": str( - platformdirs.user_cache_path("climate_ref") / "default_ignore_datasets.yaml" - ), + "grey_list_file": str(platformdirs.user_cache_path("climate_ref") / "default_grey_list.yaml"), + "grey_list_url": DEFAULT_GREY_LIST_URL, "log_level": "INFO", "log_format": DEFAULT_LOG_FORMAT, "cmip6_parser": "complete", @@ -165,9 +165,8 @@ def test_defaults(self, monkeypatch, mocker): ], } assert with_defaults == { - "ignore_datasets_file": str( - platformdirs.user_cache_path("climate_ref") / "default_ignore_datasets.yaml" - ), + "grey_list_file": str(platformdirs.user_cache_path("climate_ref") / "default_grey_list.yaml"), + "grey_list_url": DEFAULT_GREY_LIST_URL, "log_level": "INFO", "log_format": DEFAULT_LOG_FORMAT, "cmip6_parser": "complete", @@ -270,51 +269,58 @@ def test_transform_error(): @pytest.mark.parametrize("status", ["fresh", "stale", "missing"]) -def test_get_default_ignore_datasets_file(mocker, tmp_path, status): - mocker.patch.object(climate_ref.config.platformdirs, "user_cache_path", return_value=tmp_path) +def test_refresh_grey_list_file(mocker, tmp_path, status): mocker.patch.object( climate_ref.config.requests, "get", return_value=mocker.MagicMock(status_code=200, content=b"downloaded"), ) - expected_path = tmp_path / "default_ignore_datasets.yaml" + target = tmp_path / "nested" / "grey_list.yaml" if status != "missing": - expected_path.write_text("existing", encoding="utf-8") - if status == "stale": - mocker.patch.object(climate_ref.config, "DEFAULT_IGNORE_DATASETS_MAX_AGE", timedelta(seconds=-1)) + target.parent.mkdir(parents=True, exist_ok=True) + target.write_text("existing", encoding="utf-8") + max_age = timedelta(seconds=-1) if status == "stale" else timedelta(hours=6) - path = climate_ref.config._get_default_ignore_datasets_file() + refresh_grey_list_file(target, "https://example.invalid/grey_list.yaml", max_age=max_age) - assert path == tmp_path / "default_ignore_datasets.yaml" + assert target.parent.exists() if status == "fresh": - assert path.read_text(encoding="utf-8") == "existing" + assert target.read_text(encoding="utf-8") == "existing" else: - assert path.read_text(encoding="utf-8") == "downloaded" + assert target.read_text(encoding="utf-8") == "downloaded" -def test_get_default_ignore_datasets_file_fail(mocker, tmp_path): - mocker.patch.object(climate_ref.config.platformdirs, "user_cache_path", return_value=tmp_path) +def test_refresh_grey_list_file_fail(mocker, tmp_path): result = mocker.MagicMock(status_code=404, content=b"{}") result.raise_for_status.side_effect = requests.RequestException mocker.patch.object(climate_ref.config.requests, "get", return_value=result) - path = _get_default_ignore_datasets_file() - assert path == tmp_path / "default_ignore_datasets.yaml" - assert path.parent.exists() - assert path.read_text(encoding="utf-8") == "" + target = tmp_path / "grey_list.yaml" + refresh_grey_list_file(target, "https://example.invalid/grey_list.yaml") + assert target.exists() + assert target.read_text(encoding="utf-8") == "" -def test_get_default_ignore_datasets_file_network_error(mocker, tmp_path): - """Test that network errors during requests.get() are handled gracefully.""" - mocker.patch.object(climate_ref.config.platformdirs, "user_cache_path", return_value=tmp_path) - # Simulate network error (e.g., no network access in offline/HPC environment) + +def test_refresh_grey_list_file_network_error(mocker, tmp_path): + """Network errors (e.g. offline/HPC environment) are handled gracefully.""" mocker.patch.object( climate_ref.config.requests, "get", side_effect=requests.exceptions.ConnectionError("Network unreachable"), ) - path = _get_default_ignore_datasets_file() - assert path == tmp_path / "default_ignore_datasets.yaml" - assert path.parent.exists() - assert path.read_text(encoding="utf-8") == "" + target = tmp_path / "grey_list.yaml" + refresh_grey_list_file(target, "https://example.invalid/grey_list.yaml") + + assert target.exists() + assert target.read_text(encoding="utf-8") == "" + + +def test_config_default_does_not_fetch(mocker): + """`Config.default()` must not perform any network I/O.""" + get_mock = mocker.patch.object(climate_ref.config.requests, "get") + + Config.default() + + get_mock.assert_not_called() From ac709c903d80e945c1d30e9b98e48859c19f1237 Mon Sep 17 00:00:00 2001 From: Jared Lewis Date: Sun, 12 Apr 2026 21:34:23 +1000 Subject: [PATCH 2/7] fix(grey-list): preserve Path conversion, tolerate missing file, fetch only on solve Three fixes from the Codex review of the prior commit, plus user-facing documentation: - `Config.grey_list_file` now uses `converter=Path`, so `REF_GREY_LIST_FILE` overrides actually produce a `Path` instead of a bare string and downstream `read_text()` / `exists()` calls keep working. - `DiagnosticProvider.configure` treats a missing grey list file as an empty list rather than raising `FileNotFoundError`. This unblocks offline runs where the user disables fetching with `grey_list_url=""` before the file has been seeded. - The `refresh_grey_list_file` call moves from `ProviderRegistry.build_from_config` to `ExecutionSolver.build_from_db`, so read-only commands like `ref providers list` and `ref datasets list` no longer touch the network. Fetching only happens at solve time. Also adds a "Grey list" section to `docs/configuration.md` covering the two configuration values, the offline / air-gapped HPC workflow, and how the lazy fetch behaves. --- docs/configuration.md | 66 +++++++++++++++++++ .../src/climate_ref_core/providers.py | 8 ++- .../tests/unit/test_providers.py | 7 ++ .../climate-ref/src/climate_ref/config.py | 6 +- .../src/climate_ref/provider_registry.py | 5 +- .../climate-ref/src/climate_ref/solver.py | 5 +- .../climate-ref/tests/unit/test_config.py | 7 ++ .../tests/unit/test_provider_registry.py | 11 ++++ .../climate-ref/tests/unit/test_solver.py | 16 +++++ 9 files changed, 124 insertions(+), 7 deletions(-) diff --git a/docs/configuration.md b/docs/configuration.md index e2bed7a8f..2ca468dc7 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -95,6 +95,72 @@ If this is set, then the sample data won't be updated. Path where the test output is stored. This is used to store the output of the tests that are run in the test suite for later inspection. +## Grey list + +The *grey list* is a YAML file that lists facets which should be excluded from +specific diagnostics — for example, datasets that are known to crash or +produce invalid output. Each provider's `configure()` step prepends grey-list +entries as `IgnoreFacets` constraints onto the relevant data requirements, so +matching datasets are silently filtered out before solving. + +The file format is: + +```yaml +provider: + diagnostic: + source_type: + - facet: value + - other_facet: [other_value1, other_value2] +``` + +Two configuration values control how the grey list is loaded; both can be set +in your `ref.toml` or via environment variables. + +### `grey_list_file` / `REF_GREY_LIST_FILE` + +Path to the grey list file on disk. Defaults to a location under the user +cache directory (the same locations listed under +[`REF_DATASET_CACHE_DIR`](#ref_dataset_cache_dir)). + +Override this when the default cache location is not writable — for example +on Kubernetes pods with a read-only home directory, or on HPC compute nodes +where you want the file to live on a shared filesystem so every job sees the +same list. The location is decoupled from fetching: when a URL is configured, +the solver will refresh the file at this path regardless of where it lives. + +### `grey_list_url` / `REF_GREY_LIST_URL` + +URL the solver fetches the grey list from. Defaults to the `default_grey_list.yaml` +on the `main` branch of the `Climate-REF/climate-ref` GitHub repository. +Override this to point at a fork or internal mirror. + +The download is **lazy and explicit**: it only runs once at the start of a +solve (`ExecutionSolver.build_from_db`), and only when the on-disk copy is +missing or older than 6 hours. Read-only commands like `ref providers list` +or `ref datasets list` never touch the network. + +### Offline / air-gapped use (HPC) + +To run completely offline — for example on an HPC compute node with no +outbound network — set the URL to an empty value: + +```bash +export REF_GREY_LIST_URL= +``` + +or in `ref.toml`: + +```toml +grey_list_url = "" +``` + +When fetching is disabled the solver simply uses whatever file is at +`grey_list_file`. A missing file is treated as an empty grey list, so you do +not have to seed the file by hand; if you want to apply a specific grey list, +either copy `default_grey_list.yaml` from the repository to your +`grey_list_file` location ahead of time, or fetch it once on a login node +before disabling the URL on the compute nodes. + ## Configuration Options diff --git a/packages/climate-ref-core/src/climate_ref_core/providers.py b/packages/climate-ref-core/src/climate_ref_core/providers.py index 7e991c1cc..b239c7ce2 100644 --- a/packages/climate-ref-core/src/climate_ref_core/providers.py +++ b/packages/climate-ref-core/src/climate_ref_core/providers.py @@ -86,7 +86,13 @@ def configure(self, config: Any) -> None: # source_type: # - facet: value # - other_facet: [other_value1, other_value2] - grey_list_all = yaml.safe_load(config.grey_list_file.read_text(encoding="utf-8")) or {} + # A missing file is treated as an empty grey list so offline/air-gapped + # users that disable fetching with `grey_list_url=""` can run without + # having to seed the file themselves. + if not config.grey_list_file.exists(): + grey_list_all: dict[str, Any] = {} + else: + grey_list_all = yaml.safe_load(config.grey_list_file.read_text(encoding="utf-8")) or {} grey_list = grey_list_all.get(self.slug, {}) if unknown_slugs := {slug for slug in grey_list} - {d.slug for d in self.diagnostics()}: logger.warning( diff --git a/packages/climate-ref-core/tests/unit/test_providers.py b/packages/climate-ref-core/tests/unit/test_providers.py index 7873ec0b5..bef179578 100644 --- a/packages/climate-ref-core/tests/unit/test_providers.py +++ b/packages/climate-ref-core/tests/unit/test_providers.py @@ -85,6 +85,13 @@ def test_configure(self, provider, mock_config): expected_constraint = IgnoreFacets(facets={"source_id": ("A",)}) assert provider.diagnostics()[0].data_requirements[0][0].constraints[0] == expected_constraint + def test_configure_missing_grey_list_file(self, provider, mock_config): + # Offline/air-gapped users may run with grey_list_url="" and no file + # seeded yet; missing file should be treated as an empty grey list, + # not raise FileNotFoundError. + mock_config.grey_list_file.unlink() + provider.configure(mock_config) + def test_configure_unknown_diagnostic(self, provider, mock_config, caplog): mock_config.grey_list_file.write_text( textwrap.dedent( diff --git a/packages/climate-ref/src/climate_ref/config.py b/packages/climate-ref/src/climate_ref/config.py index 4ca21d56b..0cd7ebe85 100644 --- a/packages/climate-ref/src/climate_ref/config.py +++ b/packages/climate-ref/src/climate_ref/config.py @@ -438,7 +438,11 @@ class Config: - `complete`: Use the complete parser, which parses the dataset based on all available metadata. """ - grey_list_file: Path = env_field("GREY_LIST_FILE", factory=_default_grey_list_path) # noqa: RUF009 + grey_list_file: Path = env_field( # noqa: RUF009 + "GREY_LIST_FILE", + factory=_default_grey_list_path, + converter=Path, + ) """ Path to the grey list file. diff --git a/packages/climate-ref/src/climate_ref/provider_registry.py b/packages/climate-ref/src/climate_ref/provider_registry.py index a8eb31f6f..9b98766a1 100644 --- a/packages/climate-ref/src/climate_ref/provider_registry.py +++ b/packages/climate-ref/src/climate_ref/provider_registry.py @@ -12,7 +12,7 @@ from attrs import field, frozen from loguru import logger -from climate_ref.config import Config, refresh_grey_list_file +from climate_ref.config import Config from climate_ref.database import Database from climate_ref.models import Diagnostic as DiagnosticModel from climate_ref.models import Provider as ProviderModel @@ -134,9 +134,6 @@ def build_from_config(config: Config, db: Database) -> "ProviderRegistry": : A new ProviderRegistry instance """ - if config.grey_list_url: - refresh_grey_list_file(config.grey_list_file, config.grey_list_url) - providers = [] for provider_info in config.diagnostic_providers: provider = import_provider(provider_info.provider) diff --git a/packages/climate-ref/src/climate_ref/solver.py b/packages/climate-ref/src/climate_ref/solver.py index 74f4eeb53..c3e832974 100644 --- a/packages/climate-ref/src/climate_ref/solver.py +++ b/packages/climate-ref/src/climate_ref/solver.py @@ -13,7 +13,7 @@ from attrs import define, frozen from loguru import logger -from climate_ref.config import Config +from climate_ref.config import Config, refresh_grey_list_file from climate_ref.data_catalog import DataCatalog from climate_ref.database import Database from climate_ref.datasets import ( @@ -470,6 +470,9 @@ def build_from_db(config: Config, db: Database) -> "ExecutionSolver": : A new ExecutionSolver instance """ + if config.grey_list_url: + refresh_grey_list_file(config.grey_list_file, config.grey_list_url) + return ExecutionSolver( provider_registry=ProviderRegistry.build_from_config(config, db), data_catalog={ diff --git a/packages/climate-ref/tests/unit/test_config.py b/packages/climate-ref/tests/unit/test_config.py index ffe8b7a10..db75d2656 100644 --- a/packages/climate-ref/tests/unit/test_config.py +++ b/packages/climate-ref/tests/unit/test_config.py @@ -199,6 +199,8 @@ def test_from_env_variables(self, monkeypatch, config): monkeypatch.setenv("REF_LOG_ROOT", "/my/test/logs") monkeypatch.setenv("REF_RESULTS_ROOT", "/my/test/executions") monkeypatch.setenv("REF_CMIP6_PARSER", "drs") + monkeypatch.setenv("REF_GREY_LIST_FILE", "/my/test/grey_list.yaml") + monkeypatch.setenv("REF_GREY_LIST_URL", "") config_new = config.refresh() @@ -208,6 +210,11 @@ def test_from_env_variables(self, monkeypatch, config): assert config_new.paths.log == Path("/my/test/logs") assert config_new.paths.results == Path("/my/test/executions") assert config_new.cmip6_parser == "drs" + # Env overrides for grey_list_file must be coerced to Path so callers + # can use .read_text(), .exists(), etc. + assert config_new.grey_list_file == Path("/my/test/grey_list.yaml") + assert isinstance(config_new.grey_list_file, Path) + assert config_new.grey_list_url == "" def test_custom_env_variable(self, monkeypatch, tmp_path, config): monkeypatch.setenv("ABC", "/my") diff --git a/packages/climate-ref/tests/unit/test_provider_registry.py b/packages/climate-ref/tests/unit/test_provider_registry.py index a48db081d..9589de397 100644 --- a/packages/climate-ref/tests/unit/test_provider_registry.py +++ b/packages/climate-ref/tests/unit/test_provider_registry.py @@ -17,6 +17,17 @@ def test_create(self, config, mocker): assert mock_import.call_count == 1 mock_register.assert_called_once_with(db, mock_import.return_value) + def test_build_from_config_does_not_fetch_grey_list(self, config, mocker): + """Read-only commands (providers/datasets list) must not hit the network.""" + db = mocker.MagicMock() + mocker.patch("climate_ref.provider_registry.import_provider") + mocker.patch("climate_ref.provider_registry._register_provider") + get_mock = mocker.patch("climate_ref.config.requests.get") + + ProviderRegistry.build_from_config(config, db) + + get_mock.assert_not_called() + def test_get_provider_found(self, mocker): """Test get() returns provider when found.""" mock_provider = mocker.MagicMock() diff --git a/packages/climate-ref/tests/unit/test_solver.py b/packages/climate-ref/tests/unit/test_solver.py index 96c2709a0..39c5621f8 100644 --- a/packages/climate-ref/tests/unit/test_solver.py +++ b/packages/climate-ref/tests/unit/test_solver.py @@ -83,6 +83,22 @@ def test_solver_build_from_db(self, solver): assert isinstance(solver.data_catalog[SourceDatasetType.CMIP6], DataCatalog) assert len(solver.data_catalog[SourceDatasetType.CMIP6].to_frame()) + def test_build_from_db_refreshes_grey_list(self, db_seeded, config, mocker): + refresh = mocker.patch("climate_ref.solver.refresh_grey_list_file") + config.grey_list_url = "https://example.invalid/grey_list.yaml" + + ExecutionSolver.build_from_db(config, db_seeded) + + refresh.assert_called_once_with(config.grey_list_file, config.grey_list_url) + + def test_build_from_db_skips_refresh_when_url_empty(self, db_seeded, config, mocker): + refresh = mocker.patch("climate_ref.solver.refresh_grey_list_file") + config.grey_list_url = "" + + ExecutionSolver.build_from_db(config, db_seeded) + + refresh.assert_not_called() + class TestExtractCoveredDatasetsWithDataCatalog: """Test that extract_covered_datasets triggers finalisation when given a DataCatalog.""" From d8534994d9bd217513e5639d9156d03e066ac9db Mon Sep 17 00:00:00 2001 From: Jared Lewis Date: Sun, 12 Apr 2026 21:37:46 +1000 Subject: [PATCH 3/7] chore: remove changelog --- changelog/590.breaking.md | 1 - 1 file changed, 1 deletion(-) delete mode 100644 changelog/590.breaking.md diff --git a/changelog/590.breaking.md b/changelog/590.breaking.md deleted file mode 100644 index 689671594..000000000 --- a/changelog/590.breaking.md +++ /dev/null @@ -1 +0,0 @@ -Renamed the "ignore datasets" configuration to "grey list" and decoupled fetching from configuration loading. The `default_ignore_datasets.yaml` file is now `default_grey_list.yaml`, the `Config.ignore_datasets_file` field is now `Config.grey_list_file`, and the `REF_IGNORE_DATASETS_*` environment variables are now `REF_GREY_LIST_*`. A new `Config.grey_list_url` field controls where the grey list is fetched from at solve time; set it (or the `REF_GREY_LIST_URL` environment variable) to an empty value to disable fetching, which is useful for offline or air-gapped environments. The grey list location is now independent of the fetch lifecycle so users can pin it to a writable location (e.g. a Kubernetes volume mount) and still have the solver refresh it. `Config.default()` no longer performs network I/O as a side effect; the download now happens explicitly during solve setup. From ef2b35c115faec3b1947f23fc23d661ea4fde15d Mon Sep 17 00:00:00 2001 From: Jared Lewis Date: Sun, 12 Apr 2026 21:38:46 +1000 Subject: [PATCH 4/7] docs: add changelog for #622 --- changelog/622.breaking.md | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 changelog/622.breaking.md diff --git a/changelog/622.breaking.md b/changelog/622.breaking.md new file mode 100644 index 000000000..134c56b54 --- /dev/null +++ b/changelog/622.breaking.md @@ -0,0 +1,9 @@ +Renamed the "ignore datasets" configuration to "grey list" and decoupled fetching from configuration loading. + +The `default_ignore_datasets.yaml` file is now `default_grey_list.yaml`, the `Config.ignore_datasets_file` field is now `Config.grey_list_file`, and the `REF_IGNORE_DATASETS_*` environment variables are now `REF_GREY_LIST_*`. + +A new `Config.grey_list_url` field (also overridable via `REF_GREY_LIST_URL`) controls where the grey list is fetched from. Set it to an empty value to disable fetching, which is useful for offline or air-gapped HPC environments. The grey list location is now independent of the fetch lifecycle, so users can pin it to a writable location (e.g. a Kubernetes volume mount) and still have the solver refresh it. + +`Config.default()` no longer performs network I/O as a side effect — the download now happens explicitly during solve setup, so read-only commands like `ref providers list` no longer touch the network. A missing grey list file is treated as an empty list rather than raising an error, so disabling fetching does not require pre-seeding the file. + +See the new "Grey list" section in `docs/configuration.md` for full details. From 42a7e0c6c19f568d90ca054b692ef5231a417cce Mon Sep 17 00:00:00 2001 From: Jared Lewis Date: Sun, 12 Apr 2026 22:13:26 +1000 Subject: [PATCH 5/7] chore(grey-list): move shipped file to config/ and default cache to REF_CONFIGURATION - Relocate `default_grey_list.yaml` from the repo root to `config/default_grey_list.yaml` so the repo root stays uncluttered; this is also a step toward eventually owning the official version in the climate-ref-aft repo. - Default `Config.grey_list_file` now resolves to `${REF_CONFIGURATION}/grey_list.yaml` instead of the user cache directory. The configuration dir is reliably writable on container/HPC deployments where `~/.cache` often is not. - Update the default fetch URL, the test fixture path, and the docs accordingly. --- changelog/622.breaking.md | 5 ++- .../default_grey_list.yaml | 0 docs/configuration.md | 43 +++++++++---------- .../climate-ref/src/climate_ref/config.py | 20 +++++---- .../src/climate_ref/conftest_plugin.py | 4 +- .../climate-ref/tests/unit/test_config.py | 5 +-- 6 files changed, 40 insertions(+), 37 deletions(-) rename default_grey_list.yaml => config/default_grey_list.yaml (100%) diff --git a/changelog/622.breaking.md b/changelog/622.breaking.md index 134c56b54..bd1459ea2 100644 --- a/changelog/622.breaking.md +++ b/changelog/622.breaking.md @@ -1,9 +1,10 @@ Renamed the "ignore datasets" configuration to "grey list" and decoupled fetching from configuration loading. -The `default_ignore_datasets.yaml` file is now `default_grey_list.yaml`, the `Config.ignore_datasets_file` field is now `Config.grey_list_file`, and the `REF_IGNORE_DATASETS_*` environment variables are now `REF_GREY_LIST_*`. +The `default_ignore_datasets.yaml` file at the repository root is now `config/default_grey_list.yaml`, the `Config.ignore_datasets_file` field is now `Config.grey_list_file`, and the `REF_IGNORE_DATASETS_FILE` environment variables are now `REF_GREY_LIST_FILE`. A new `Config.grey_list_url` field (also overridable via `REF_GREY_LIST_URL`) controls where the grey list is fetched from. Set it to an empty value to disable fetching, which is useful for offline or air-gapped HPC environments. The grey list location is now independent of the fetch lifecycle, so users can pin it to a writable location (e.g. a Kubernetes volume mount) and still have the solver refresh it. -`Config.default()` no longer performs network I/O as a side effect — the download now happens explicitly during solve setup, so read-only commands like `ref providers list` no longer touch the network. A missing grey list file is treated as an empty list rather than raising an error, so disabling fetching does not require pre-seeding the file. +`Config.default()` no longer performs network I/O as a side effect. +A missing grey list file is treated as an empty list rather than raising an error, so disabling fetching does not require pre-seeding the file. See the new "Grey list" section in `docs/configuration.md` for full details. diff --git a/default_grey_list.yaml b/config/default_grey_list.yaml similarity index 100% rename from default_grey_list.yaml rename to config/default_grey_list.yaml diff --git a/docs/configuration.md b/docs/configuration.md index 2ca468dc7..6576c907a 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -98,10 +98,8 @@ This is used to store the output of the tests that are run in the test suite for ## Grey list The *grey list* is a YAML file that lists facets which should be excluded from -specific diagnostics — for example, datasets that are known to crash or -produce invalid output. Each provider's `configure()` step prepends grey-list -entries as `IgnoreFacets` constraints onto the relevant data requirements, so -matching datasets are silently filtered out before solving. +specific diagnostics — for example, datasets that are known to crash or produce invalid output. +The datasets in the grey list are filtered before solving for the relevant diagnostic. The file format is: @@ -118,25 +116,24 @@ in your `ref.toml` or via environment variables. ### `grey_list_file` / `REF_GREY_LIST_FILE` -Path to the grey list file on disk. Defaults to a location under the user -cache directory (the same locations listed under -[`REF_DATASET_CACHE_DIR`](#ref_dataset_cache_dir)). - -Override this when the default cache location is not writable — for example -on Kubernetes pods with a read-only home directory, or on HPC compute nodes -where you want the file to live on a shared filesystem so every job sees the -same list. The location is decoupled from fetching: when a URL is configured, -the solver will refresh the file at this path regardless of where it lives. +Path to the grey list file on disk. +Defaults to `grey_list.yaml` inside your `REF_CONFIGURATION` directory (alongside `ref.toml`, the database, +etc.). This location must be writable by the user as the grey +list may be updated periodically. ### `grey_list_url` / `REF_GREY_LIST_URL` -URL the solver fetches the grey list from. Defaults to the `default_grey_list.yaml` -on the `main` branch of the `Climate-REF/climate-ref` GitHub repository. -Override this to point at a fork or internal mirror. +URL the solver fetches the grey list from. +Defaults to +`config/default_grey_list.yaml` on the `main` branch of the +`Climate-REF/climate-ref` GitHub repository. +Override this to point at a +fork or internal mirror. The download is **lazy and explicit**: it only runs once at the start of a solve (`ExecutionSolver.build_from_db`), and only when the on-disk copy is -missing or older than 6 hours. Read-only commands like `ref providers list` +missing or older than 6 hours. +Read-only commands like `ref providers list` or `ref datasets list` never touch the network. ### Offline / air-gapped use (HPC) @@ -155,11 +152,13 @@ grey_list_url = "" ``` When fetching is disabled the solver simply uses whatever file is at -`grey_list_file`. A missing file is treated as an empty grey list, so you do -not have to seed the file by hand; if you want to apply a specific grey list, -either copy `default_grey_list.yaml` from the repository to your -`grey_list_file` location ahead of time, or fetch it once on a login node -before disabling the URL on the compute nodes. +`grey_list_file`. +A missing file is treated as an empty grey list, so you +do not have to seed the file by hand; +if you want to apply a specific +grey list, either copy `config/default_grey_list.yaml` from the repository +to your `grey_list_file` location ahead of time, +or fetch it once before disabling the URL on the compute nodes. ## Configuration Options diff --git a/packages/climate-ref/src/climate_ref/config.py b/packages/climate-ref/src/climate_ref/config.py index 0cd7ebe85..664d85ce3 100644 --- a/packages/climate-ref/src/climate_ref/config.py +++ b/packages/climate-ref/src/climate_ref/config.py @@ -20,7 +20,6 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, Literal -import platformdirs import requests import tomlkit from attr import Factory @@ -344,13 +343,19 @@ def _load_config(config_file: str | Path, doc: dict[str, Any]) -> "Config": DEFAULT_GREY_LIST_MAX_AGE = datetime.timedelta(hours=6) DEFAULT_GREY_LIST_URL = ( - "https://raw.githubusercontent.com/Climate-REF/climate-ref/refs/heads/main/default_grey_list.yaml" + "https://raw.githubusercontent.com/Climate-REF/climate-ref/refs/heads/main/config/default_grey_list.yaml" ) def _default_grey_list_path() -> Path: - """Return the default location of the grey list file (no I/O, no network).""" - return platformdirs.user_cache_path("climate_ref") / "default_grey_list.yaml" + """Return the default location of the grey list file (no I/O, no network). + + Lives under `REF_CONFIGURATION` (alongside `ref.toml`, the database, etc.) + so it inherits the same writable directory the user already has set up, + rather than the user cache directory which is often read-only on + container/HPC deployments. + """ + return env.path("REF_CONFIGURATION").resolve() / "grey_list.yaml" def refresh_grey_list_file( @@ -457,10 +462,9 @@ class Config: - another_facet: [another_value1, another_value2] ``` - Defaults to a path under the user cache directory. Override this to put the - file somewhere writable in environments where the user cache is read-only - (e.g. a Kubernetes volume mount). The location is decoupled from fetching: - the solver will refresh the file at this location from `grey_list_url`. + Defaults to a path under the configuration directory. + This location must be writable as the solver may refresh the file from `grey_list_url` + if it is missing or stale. """ grey_list_url: str = env_field("GREY_LIST_URL", default=DEFAULT_GREY_LIST_URL) diff --git a/packages/climate-ref/src/climate_ref/conftest_plugin.py b/packages/climate-ref/src/climate_ref/conftest_plugin.py index 5400f1118..1d94ad144 100644 --- a/packages/climate-ref/src/climate_ref/conftest_plugin.py +++ b/packages/climate-ref/src/climate_ref/conftest_plugin.py @@ -251,9 +251,9 @@ def data_catalog( @pytest.fixture(scope="session") def solve_config() -> Config: - """Session-scoped Config that uses the in-tree default_grey_list.yaml and never fetches.""" + """Session-scoped Config that uses the in-tree grey list and never fetches.""" cfg = Config.default() - local_grey_list = Path(__file__).parents[4] / "default_grey_list.yaml" + local_grey_list = Path(__file__).parents[4] / "config" / "default_grey_list.yaml" if not local_grey_list.is_file(): raise ValueError(f"Could not find grey list file at {local_grey_list}") cfg.grey_list_file = local_grey_list diff --git a/packages/climate-ref/tests/unit/test_config.py b/packages/climate-ref/tests/unit/test_config.py index db75d2656..fca044b84 100644 --- a/packages/climate-ref/tests/unit/test_config.py +++ b/packages/climate-ref/tests/unit/test_config.py @@ -4,7 +4,6 @@ from datetime import timedelta from pathlib import Path -import platformdirs import pytest import requests from attr import evolve @@ -154,7 +153,7 @@ def test_defaults(self, monkeypatch, mocker): without_defaults = cfg.dump(defaults=False) assert without_defaults == { - "grey_list_file": str(platformdirs.user_cache_path("climate_ref") / "default_grey_list.yaml"), + "grey_list_file": f"{default_path}/grey_list.yaml", "grey_list_url": DEFAULT_GREY_LIST_URL, "log_level": "INFO", "log_format": DEFAULT_LOG_FORMAT, @@ -165,7 +164,7 @@ def test_defaults(self, monkeypatch, mocker): ], } assert with_defaults == { - "grey_list_file": str(platformdirs.user_cache_path("climate_ref") / "default_grey_list.yaml"), + "grey_list_file": f"{default_path}/grey_list.yaml", "grey_list_url": DEFAULT_GREY_LIST_URL, "log_level": "INFO", "log_format": DEFAULT_LOG_FORMAT, From 0abd3fb9c295c461a5429f19203b7ddca3878cae Mon Sep 17 00:00:00 2001 From: Jared Lewis Date: Mon, 13 Apr 2026 11:53:32 +1000 Subject: [PATCH 6/7] docs: address PR review comments - Preserve existing grey list file mtime on download failure - Fix grammar in changelog breaking change note --- changelog/622.breaking.md | 2 +- packages/climate-ref/src/climate_ref/config.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/changelog/622.breaking.md b/changelog/622.breaking.md index bd1459ea2..f69cc73f6 100644 --- a/changelog/622.breaking.md +++ b/changelog/622.breaking.md @@ -1,6 +1,6 @@ Renamed the "ignore datasets" configuration to "grey list" and decoupled fetching from configuration loading. -The `default_ignore_datasets.yaml` file at the repository root is now `config/default_grey_list.yaml`, the `Config.ignore_datasets_file` field is now `Config.grey_list_file`, and the `REF_IGNORE_DATASETS_FILE` environment variables are now `REF_GREY_LIST_FILE`. +The `default_ignore_datasets.yaml` file at the repository root is now `config/default_grey_list.yaml`, the `Config.ignore_datasets_file` field is now `Config.grey_list_file`, and the `REF_IGNORE_DATASETS_FILE` environment variable is now `REF_GREY_LIST_FILE`. A new `Config.grey_list_url` field (also overridable via `REF_GREY_LIST_URL`) controls where the grey list is fetched from. Set it to an empty value to disable fetching, which is useful for offline or air-gapped HPC environments. The grey list location is now independent of the fetch lifecycle, so users can pin it to a writable location (e.g. a Kubernetes volume mount) and still have the solver refresh it. diff --git a/packages/climate-ref/src/climate_ref/config.py b/packages/climate-ref/src/climate_ref/config.py index 664d85ce3..16039b986 100644 --- a/packages/climate-ref/src/climate_ref/config.py +++ b/packages/climate-ref/src/climate_ref/config.py @@ -395,7 +395,8 @@ def refresh_grey_list_file( response.raise_for_status() except requests.RequestException as exc: logger.warning(f"Failed to download grey list: {exc}") - path.touch(exist_ok=True) + if not path.exists(): + path.touch() else: with path.open(mode="wb") as file: file.write(response.content) From 98e600b7ccee48224272a57efd63cc920842e87d Mon Sep 17 00:00:00 2001 From: Jared Lewis Date: Tue, 14 Apr 2026 20:22:38 +1000 Subject: [PATCH 7/7] fix(grey-list): hard-fail on deprecated config and make refresh fail-safe Loading a config that still uses `ignore_datasets_file` or `REF_IGNORE_DATASETS_FILE` now raises a `ValueError` with migration instructions instead of silently dropping the value and falling back to the new defaults. Marked as a TODO to remove the shim before 1.0.0. Grey list refresh no longer creates an empty placeholder when the download fails: a cached file is reused if present, otherwise a new `GreyListRefreshError` is raised so a transient network failure cannot silently disable grey list protections for 6 hours. Also rewrap the new prose (changelog, docs section, docstrings) to follow semantic line breaks. --- changelog/622.breaking.md | 26 ++++++- docs/configuration.md | 54 ++++++------- .../src/climate_ref_core/providers.py | 6 +- .../climate-ref/src/climate_ref/config.py | 76 +++++++++++++++---- .../climate-ref/tests/unit/test_config.py | 53 ++++++++++--- 5 files changed, 157 insertions(+), 58 deletions(-) diff --git a/changelog/622.breaking.md b/changelog/622.breaking.md index f69cc73f6..ba5c6dcc3 100644 --- a/changelog/622.breaking.md +++ b/changelog/622.breaking.md @@ -1,10 +1,28 @@ -Renamed the "ignore datasets" configuration to "grey list" and decoupled fetching from configuration loading. +Renamed the "ignore datasets" configuration to "grey list" +and decoupled fetching from configuration loading. -The `default_ignore_datasets.yaml` file at the repository root is now `config/default_grey_list.yaml`, the `Config.ignore_datasets_file` field is now `Config.grey_list_file`, and the `REF_IGNORE_DATASETS_FILE` environment variable is now `REF_GREY_LIST_FILE`. +The `default_ignore_datasets.yaml` file at the repository root is now `config/default_grey_list.yaml`, +the `Config.ignore_datasets_file` field is now `Config.grey_list_file`, +and the `REF_IGNORE_DATASETS_FILE` environment variable is now `REF_GREY_LIST_FILE`. -A new `Config.grey_list_url` field (also overridable via `REF_GREY_LIST_URL`) controls where the grey list is fetched from. Set it to an empty value to disable fetching, which is useful for offline or air-gapped HPC environments. The grey list location is now independent of the fetch lifecycle, so users can pin it to a writable location (e.g. a Kubernetes volume mount) and still have the solver refresh it. +A new `Config.grey_list_url` field (also overridable via `REF_GREY_LIST_URL`) controls where the grey list is fetched from. +Set it to an empty value to disable fetching, +which is useful for offline or air-gapped HPC environments. +The grey list location is now independent of the fetch lifecycle, +so users can pin it to a writable location (e.g. a Kubernetes volume mount) +and still have the solver refresh it. `Config.default()` no longer performs network I/O as a side effect. -A missing grey list file is treated as an empty list rather than raising an error, so disabling fetching does not require pre-seeding the file. +A missing grey list file is treated as an empty list rather than raising an error, +so disabling fetching does not require pre-seeding the file. + +Loading a configuration that still uses the deprecated `ignore_datasets_file` key, +or running with the `REF_IGNORE_DATASETS_FILE` environment variable set, +now raises a hard error with migration instructions instead of silently falling back to defaults. + +Grey list refresh is now fail-safe: +a network failure with no cached file raises `GreyListRefreshError` +rather than creating an empty placeholder that would silently disable grey list protections for the next 6 hours. +If a cached file exists it is reused. See the new "Grey list" section in `docs/configuration.md` for full details. diff --git a/docs/configuration.md b/docs/configuration.md index 6576c907a..7b0a3a56b 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -97,8 +97,8 @@ This is used to store the output of the tests that are run in the test suite for ## Grey list -The *grey list* is a YAML file that lists facets which should be excluded from -specific diagnostics — for example, datasets that are known to crash or produce invalid output. +The *grey list* is a YAML file that lists facets which should be excluded from specific diagnostics +— for example, datasets that are known to crash or produce invalid output. The datasets in the grey list are filtered before solving for the relevant diagnostic. The file format is: @@ -111,35 +111,37 @@ provider: - other_facet: [other_value1, other_value2] ``` -Two configuration values control how the grey list is loaded; both can be set -in your `ref.toml` or via environment variables. +Two configuration values control how the grey list is loaded; +both can be set in your `ref.toml` or via environment variables. ### `grey_list_file` / `REF_GREY_LIST_FILE` Path to the grey list file on disk. -Defaults to `grey_list.yaml` inside your `REF_CONFIGURATION` directory (alongside `ref.toml`, the database, -etc.). This location must be writable by the user as the grey -list may be updated periodically. +Defaults to `grey_list.yaml` inside your `REF_CONFIGURATION` directory +(alongside `ref.toml`, the database, etc.). +This location must be writable by the user +as the grey list may be updated periodically. ### `grey_list_url` / `REF_GREY_LIST_URL` URL the solver fetches the grey list from. -Defaults to -`config/default_grey_list.yaml` on the `main` branch of the -`Climate-REF/climate-ref` GitHub repository. -Override this to point at a -fork or internal mirror. - -The download is **lazy and explicit**: it only runs once at the start of a -solve (`ExecutionSolver.build_from_db`), and only when the on-disk copy is -missing or older than 6 hours. -Read-only commands like `ref providers list` -or `ref datasets list` never touch the network. +Defaults to `config/default_grey_list.yaml` on the `main` branch +of the `Climate-REF/climate-ref` GitHub repository. +Override this to point at a fork or internal mirror. + +The download is **lazy and explicit**: +it only runs once at the start of a solve (`ExecutionSolver.build_from_db`), +and only when the on-disk copy is missing or older than 6 hours. +Read-only commands like `ref providers list` or `ref datasets list` never touch the network. + +If a refresh fails and no cached file exists, +the solver raises `GreyListRefreshError` rather than running with an empty grey list. +If a cached file exists it is reused. ### Offline / air-gapped use (HPC) -To run completely offline — for example on an HPC compute node with no -outbound network — set the URL to an empty value: +To run completely offline — for example on an HPC compute node with no outbound network — +set the URL to an empty value: ```bash export REF_GREY_LIST_URL= @@ -151,13 +153,11 @@ or in `ref.toml`: grey_list_url = "" ``` -When fetching is disabled the solver simply uses whatever file is at -`grey_list_file`. -A missing file is treated as an empty grey list, so you -do not have to seed the file by hand; -if you want to apply a specific -grey list, either copy `config/default_grey_list.yaml` from the repository -to your `grey_list_file` location ahead of time, +When fetching is disabled the solver simply uses whatever file is at `grey_list_file`. +A missing file is treated as an empty grey list, +so you do not have to seed the file by hand. +If you want to apply a specific grey list, +either copy `config/default_grey_list.yaml` from the repository to your `grey_list_file` location ahead of time, or fetch it once before disabling the URL on the compute nodes. ## Configuration Options diff --git a/packages/climate-ref-core/src/climate_ref_core/providers.py b/packages/climate-ref-core/src/climate_ref_core/providers.py index b239c7ce2..281794d34 100644 --- a/packages/climate-ref-core/src/climate_ref_core/providers.py +++ b/packages/climate-ref-core/src/climate_ref_core/providers.py @@ -86,9 +86,9 @@ def configure(self, config: Any) -> None: # source_type: # - facet: value # - other_facet: [other_value1, other_value2] - # A missing file is treated as an empty grey list so offline/air-gapped - # users that disable fetching with `grey_list_url=""` can run without - # having to seed the file themselves. + # A missing file is treated as an empty grey list + # so offline/air-gapped users that disable fetching with `grey_list_url=""` + # can run without having to seed the file themselves. if not config.grey_list_file.exists(): grey_list_all: dict[str, Any] = {} else: diff --git a/packages/climate-ref/src/climate_ref/config.py b/packages/climate-ref/src/climate_ref/config.py index 16039b986..7788f51b4 100644 --- a/packages/climate-ref/src/climate_ref/config.py +++ b/packages/climate-ref/src/climate_ref/config.py @@ -327,6 +327,30 @@ def default_providers() -> list[DiagnosticProviderConfig]: ] +# TODO(1.0.0): drop the deprecated ignore_datasets_file / REF_IGNORE_DATASETS_FILE +# migration shim below (constants, _check_deprecated_grey_list_config, and the +# associated tests). +_DEPRECATED_IGNORE_DATASETS_KEY = "ignore_datasets_file" +_DEPRECATED_IGNORE_DATASETS_ENV = f"{env_prefix}_IGNORE_DATASETS_FILE" +_GREY_LIST_MIGRATION_HINT = ( + "`ignore_datasets_file` / `REF_IGNORE_DATASETS_FILE` has been renamed to " + "`grey_list_file` / `REF_GREY_LIST_FILE`. Update your configuration accordingly. " + "Set `REF_GREY_LIST_URL=` (empty) to disable automatic refresh if you previously " + "managed the file manually." +) + + +def _check_deprecated_grey_list_config(config_file: str | Path, doc: dict[str, Any]) -> None: + """Hard-fail when deprecated grey list configuration is detected.""" + problems: list[str] = [] + if _DEPRECATED_IGNORE_DATASETS_KEY in doc: + problems.append(f"`{_DEPRECATED_IGNORE_DATASETS_KEY}` found in configuration file {config_file}") + if os.environ.get(_DEPRECATED_IGNORE_DATASETS_ENV) is not None: + problems.append(f"`{_DEPRECATED_IGNORE_DATASETS_ENV}` environment variable is set") + if problems: + raise ValueError("; ".join(problems) + ". " + _GREY_LIST_MIGRATION_HINT) + + def _load_config(config_file: str | Path, doc: dict[str, Any]) -> "Config": # Try loading the configuration with strict validation try: @@ -341,6 +365,10 @@ def _load_config(config_file: str | Path, doc: dict[str, Any]) -> "Config": return _converter_defaults_relaxed.structure(doc, Config) +class GreyListRefreshError(RuntimeError): + """Raised when the grey list cannot be refreshed and no cached copy is available.""" + + DEFAULT_GREY_LIST_MAX_AGE = datetime.timedelta(hours=6) DEFAULT_GREY_LIST_URL = ( "https://raw.githubusercontent.com/Climate-REF/climate-ref/refs/heads/main/config/default_grey_list.yaml" @@ -352,8 +380,8 @@ def _default_grey_list_path() -> Path: Lives under `REF_CONFIGURATION` (alongside `ref.toml`, the database, etc.) so it inherits the same writable directory the user already has set up, - rather than the user cache directory which is often read-only on - container/HPC deployments. + rather than the user cache directory + which is often read-only on container/HPC deployments. """ return env.path("REF_CONFIGURATION").resolve() / "grey_list.yaml" @@ -367,19 +395,30 @@ def refresh_grey_list_file( """ Download the grey list file to ``path`` from ``url`` if it is missing or stale. - If the file already exists and was modified within ``max_age``, no network - request is made. On network failure an empty file is created so that - downstream YAML parsing does not blow up; an existing file is left untouched. + If the file already exists and was modified within ``max_age``, + no network request is made. + + On network failure: + if a cached file already exists at ``path`` it is left untouched + (and used as the last-known-good copy). + If no cached file exists a `GreyListRefreshError` is raised + so the solver fails loudly rather than silently running without grey list protections. Parameters ---------- path - Destination path for the grey list file. Parent directories are created - as needed so that user-supplied paths (e.g. a writable k8s volume) work. + Destination path for the grey list file. + Parent directories are created as needed + so that user-supplied paths (e.g. a writable k8s volume) work. url URL to fetch the grey list from. max_age Maximum age of an existing file before it is considered stale. + + Raises + ------ + GreyListRefreshError + If the download fails and no cached file exists at ``path``. """ path.parent.mkdir(parents=True, exist_ok=True) @@ -394,9 +433,14 @@ def refresh_grey_list_file( response = requests.get(url, timeout=120) response.raise_for_status() except requests.RequestException as exc: - logger.warning(f"Failed to download grey list: {exc}") - if not path.exists(): - path.touch() + if path.exists(): + logger.warning(f"Failed to refresh grey list from {url}; using cached copy at {path}: {exc}") + return + raise GreyListRefreshError( + f"Failed to download grey list from {url} and no cached copy exists at {path}. " + "Set `REF_GREY_LIST_URL=` (empty) to disable automatic refresh and use a " + f"manually managed file. Original error: {exc}" + ) from exc else: with path.open(mode="wb") as file: file.write(response.content) @@ -464,17 +508,17 @@ class Config: ``` Defaults to a path under the configuration directory. - This location must be writable as the solver may refresh the file from `grey_list_url` - if it is missing or stale. + This location must be writable + as the solver may refresh the file from `grey_list_url` if it is missing or stale. """ grey_list_url: str = env_field("GREY_LIST_URL", default=DEFAULT_GREY_LIST_URL) """ URL to fetch the grey list from at solve time. - Set to an empty string (e.g. `REF_GREY_LIST_URL=`) to disable fetching - entirely; in that case the solver will use whatever file already exists at - `grey_list_file`. Override the URL to point at a fork or mirror. + Set to an empty string (e.g. `REF_GREY_LIST_URL=`) to disable fetching entirely; + in that case the solver will use whatever file already exists at `grey_list_file`. + Override the URL to point at a fork or mirror. """ paths: PathConfig = Factory(PathConfig) @@ -511,6 +555,8 @@ def load(cls, config_file: Path, allow_missing: bool = True) -> "Config": doc = TOMLDocument() raw = None + _check_deprecated_grey_list_config(config_file, doc) + try: config = _load_config(config_file, doc) except Exception as exc: diff --git a/packages/climate-ref/tests/unit/test_config.py b/packages/climate-ref/tests/unit/test_config.py index fca044b84..aec872723 100644 --- a/packages/climate-ref/tests/unit/test_config.py +++ b/packages/climate-ref/tests/unit/test_config.py @@ -14,6 +14,7 @@ DEFAULT_GREY_LIST_URL, DEFAULT_LOG_FORMAT, Config, + GreyListRefreshError, PathConfig, refresh_grey_list_file, transform_error, @@ -296,20 +297,21 @@ def test_refresh_grey_list_file(mocker, tmp_path, status): assert target.read_text(encoding="utf-8") == "downloaded" -def test_refresh_grey_list_file_fail(mocker, tmp_path): +def test_refresh_grey_list_file_fail_no_cache(mocker, tmp_path): + """A download failure with no cached copy must raise rather than create an empty placeholder.""" result = mocker.MagicMock(status_code=404, content=b"{}") result.raise_for_status.side_effect = requests.RequestException mocker.patch.object(climate_ref.config.requests, "get", return_value=result) target = tmp_path / "grey_list.yaml" - refresh_grey_list_file(target, "https://example.invalid/grey_list.yaml") + with pytest.raises(GreyListRefreshError, match="no cached copy"): + refresh_grey_list_file(target, "https://example.invalid/grey_list.yaml") - assert target.exists() - assert target.read_text(encoding="utf-8") == "" + assert not target.exists() -def test_refresh_grey_list_file_network_error(mocker, tmp_path): - """Network errors (e.g. offline/HPC environment) are handled gracefully.""" +def test_refresh_grey_list_file_network_error_no_cache(mocker, tmp_path): + """Network errors with no cached copy must raise (fail-safe).""" mocker.patch.object( climate_ref.config.requests, "get", @@ -317,10 +319,24 @@ def test_refresh_grey_list_file_network_error(mocker, tmp_path): ) target = tmp_path / "grey_list.yaml" - refresh_grey_list_file(target, "https://example.invalid/grey_list.yaml") + with pytest.raises(GreyListRefreshError): + refresh_grey_list_file(target, "https://example.invalid/grey_list.yaml") - assert target.exists() - assert target.read_text(encoding="utf-8") == "" + assert not target.exists() + + +def test_refresh_grey_list_file_fail_uses_stale_cache(mocker, tmp_path): + """A download failure must preserve and reuse an existing cached copy.""" + result = mocker.MagicMock(status_code=500, content=b"{}") + result.raise_for_status.side_effect = requests.RequestException + mocker.patch.object(climate_ref.config.requests, "get", return_value=result) + + target = tmp_path / "grey_list.yaml" + target.write_text("cached", encoding="utf-8") + + refresh_grey_list_file(target, "https://example.invalid/grey_list.yaml", max_age=timedelta(seconds=-1)) + + assert target.read_text(encoding="utf-8") == "cached" def test_config_default_does_not_fetch(mocker): @@ -330,3 +346,22 @@ def test_config_default_does_not_fetch(mocker): Config.default() get_mock.assert_not_called() + + +def test_load_rejects_deprecated_ignore_datasets_file(tmp_path, monkeypatch): + """Old `ignore_datasets_file` toml key must hard-fail with a migration hint.""" + monkeypatch.delenv("REF_IGNORE_DATASETS_FILE", raising=False) + config_file = tmp_path / "ref.toml" + config_file.write_text('ignore_datasets_file = "old.yaml"\n', encoding="utf-8") + + with pytest.raises(ValueError, match="ignore_datasets_file"): + Config.load(config_file) + + +def test_load_rejects_deprecated_ignore_datasets_env(tmp_path, monkeypatch): + """`REF_IGNORE_DATASETS_FILE` env var must hard-fail with a migration hint.""" + monkeypatch.setenv("REF_IGNORE_DATASETS_FILE", str(tmp_path / "old.yaml")) + config_file = tmp_path / "ref.toml" + + with pytest.raises(ValueError, match="REF_IGNORE_DATASETS_FILE"): + Config.load(config_file)