feat: better metric strategy (#1473)

Nexisato · SAKURA-CAT · web-flow · commit 5b6a7560d0b5 · 2026-03-19T16:23:34.000+08:00
* feat: support overwrite metric
* fix: overwrite existing if explicitly set step
* feat: support overwrite under any circumstance
* fix: step auto increment
* Support metric overwrite and preserve epochs

  Allow duplicated steps to overwrite existing metric entries while preserving the original epoch mapping. Key changes: add metric_overwrite flag to MetricInfo, maintain per-key _step_epochs and _step_summary_values in SwanLabKey, rebuild summaries on overwrite, and update in-memory collections to replace existing step entries. LocalRunCallback now rewrites log slice files when overwriting (helper _rewrite_metric_file), and several docstrings/log levels updated to reflect overwrite semantics. Tests updated to assert overwrite behavior and epoch preservation.
* fix: memory reduce
* chore: update warning
* refactor: rebuild by step
* feat: add parallel run support and update run_id validation (#1491)
* Add parallel run support and update run_id validation

  Introduce a parallel run option and related env var, propagate slug usage, and relax run_id validation:
  - Add a `parallel` parameter to SwanLabInitializer (and SWANLAB_RUN_PARALLEL env) to enable shared parallel runs; when enabled it forces mode='cloud', resume='allow', and generates an id if missing.
  - Load `parallel` from config/env and validate it during initialization; minor warning/formatting tweaks.
  - Add ExperimentInfo.slug property and use it in Client.web_exp_url to prefer exp.slug over exp_id when available.
  - Update run_id validation: allow lengths 1–64 and disallow characters '/ \ # ? % :', with corresponding updates to tests.
  - Add missing import (random) required for id generation.

  Tests updated to reflect new run_id rules and additional valid/invalid cases.
* Update sdk.py
* Update sdk.py
* feat: support overwrite metric
* fix: overwrite existing if explicitly set step
* feat: support overwrite under any circumstance
* fix: step auto increment
* Support metric overwrite and preserve epochs

  Allow duplicated steps to overwrite existing metric entries while preserving the original epoch mapping. Key changes: add metric_overwrite flag to MetricInfo, maintain per-key _step_epochs and _step_summary_values in SwanLabKey, rebuild summaries on overwrite, and update in-memory collections to replace existing step entries. LocalRunCallback now rewrites log slice files when overwriting (helper _rewrite_metric_file), and several docstrings/log levels updated to reflect overwrite semantics. Tests updated to assert overwrite behavior and epoch preservation.
* fix: memory reduce
* chore: update warning
* refactor: rebuild by step

---------

Co-authored-by: Kang Li <79990647+SAKURA-CAT@users.noreply.github.com>
diff --git a/swanlab/data/callbacker/local.py b/swanlab/data/callbacker/local.py
@@ -35,6 +35,7 @@
 
 import json
 import os
+import tempfile
 from datetime import datetime
 from typing import Tuple, Optional, TextIO
 from swanlab.toolkit import RuntimeInfo, MetricInfo
@@ -130,11 +131,50 @@ def on_metric_create(self, metric_info: MetricInfo, *args, **kwargs):
         os.makedirs(os.path.dirname(metric_info.summary_file_path), exist_ok=True)
         with open(metric_info.summary_file_path, "w+", encoding="utf-8") as f:
             f.write(json.dumps(metric_info.metric_summary, ensure_ascii=False))
-        with open(metric_info.metric_file_path, "a", encoding="utf-8") as f:
-            f.write(json.dumps(metric_info.metric, ensure_ascii=False) + "\n")
+        if metric_info.metric_overwrite:
+            self._rewrite_metric_file(metric_info)
+        else:
+            with open(metric_info.metric_file_path, "a", encoding="utf-8") as f:
+                f.write(json.dumps(metric_info.metric, ensure_ascii=False) + "\n")
         # ---------------------------------- 保存媒体字节流数据 ----------------------------------
         self.porter.trace_metric(metric_info)
 
+    @staticmethod
+    def _rewrite_metric_file(metric_info: MetricInfo) -> None:
+        """
+        Replace the record of an overwritten step inside its metric slice file.
+
+        The slice file is streamed line by line into a temporary file created in the
+        same directory. The first record whose "index" equals ``metric_info.metric_step``
+        is swapped for the new record, further records with that index are dropped, and
+        the new record is appended when no match was found. The temporary file then takes
+        the place of the slice file via ``os.replace``. If the slice file does not exist
+        yet, it is created with the new record as its only line.
+
+        :param metric_info: metric whose ``metric`` payload overwrites the entry stored
+            for ``metric_step`` in ``metric_file_path``
+        :raises OSError: if reading, writing or replacing the file fails; the temporary
+            file is removed before the error propagates
+        """
+        serialized = json.dumps(metric_info.metric, ensure_ascii=False) + "\n"
+        metric_path = metric_info.metric_file_path
+
+        try:
+            f_in = open(metric_path, "r", encoding="utf-8")
+        except FileNotFoundError:
+            # Nothing to rewrite yet: the record simply becomes the first line of the slice.
+            with open(metric_path, "w", encoding="utf-8") as f:
+                f.write(serialized)
+            return
+
+        dir_path = os.path.dirname(metric_path)
+        tmp_path = None
+        try:
+            with f_in, tempfile.NamedTemporaryFile(mode="w", encoding="utf-8", dir=dir_path, delete=False) as tmp:
+                tmp_path = tmp.name
+                replaced = False
+
+                for line in f_in:
+                    # A final line without a newline would otherwise fuse with whatever is written next.
+                    if not line.endswith("\n"):
+                        line += "\n"
+                    try:
+                        existing = json.loads(line)
+                    except json.JSONDecodeError:
+                        swanlog.warning(f"Failed to decode JSON from line in {metric_path}: {line.strip()}")
+                        tmp.write(line)
+                        continue
+
+                    # Valid JSON that is not an object (e.g. a bare number) carries no "index";
+                    # keep it untouched instead of failing on ``.get``.
+                    if not isinstance(existing, dict) or existing.get("index") != metric_info.metric_step:
+                        tmp.write(line)
+                    elif not replaced:
+                        tmp.write(serialized)
+                        replaced = True
+
+                if not replaced:
+                    tmp.write(serialized)
+
+            os.replace(tmp_path, metric_path)
+        except BaseException:
+            # The temp file was created with delete=False, so nothing else removes it:
+            # clean it up here so a failed rewrite does not litter the log directory.
+            if tmp_path is not None:
+                try:
+                    os.unlink(tmp_path)
+                except OSError:
+                    pass
+            raise
+
     def on_stop(self, error: str = None, *args, **kwargs):
         """
         训练结束，取消系统回调
diff --git a/swanlab/data/run/exp.py b/swanlab/data/run/exp.py
@@ -124,19 +124,29 @@ def _add(
         key_obj: SwanLabKey = self._keys.get(key_index, None)
 
         # ---------------------------------- 包装器解析 ----------------------------------
-
+        explicit_step = step is not None
         if step is not None and not isinstance(step, int):
             swanlog.warning(f"Step {step} is not int, SwanLab will set it automatically.")
             step = None
+            explicit_step = False
+
         if key_obj is None:
-            step = 0 if step is None or not isinstance(step, int) else step
+            step = 0 if step is None else step
         else:
-            step = len(key_obj.steps) if step is None else step
+            if step is None:
+                # 修复隐式步数的无限覆盖和乱序陷阱：
+                # 若曾跨步长显式写入，len() 可能会落后于真实的 max step，由此引发相同 step 的持续覆盖
+                current_len = len(key_obj.steps)
+                max_step = max(key_obj.steps) if key_obj.steps else -1
+                step = max(current_len, max_step + 1)
+
             if step in key_obj.steps:
-                swanlog.debug(f"Step {step} on key {key} already exists, ignored.")
-                return MetricErrorInfo(column_info=key_obj.column_info, error=DataWrapper.create_duplicate_error())
+                # 允许 overwrite，但区分显式指定和隐式的碰撞
+                if explicit_step:
+                    swanlog.debug(f"Step {step} on key {key} already exists, overwriting.")
+                else:
+                    swanlog.warning(f"Implicit step {step} on key {key} resolved as overwrite, but expected to append.")
         data.parse(step=step, key=key)
-
         # ---------------------------------- 图表创建 ----------------------------------
 
         if key_obj is None:
diff --git a/swanlab/data/run/key.py b/swanlab/data/run/key.py
@@ -7,7 +7,7 @@
 
 import json
 import math
-from typing import Optional, Tuple
+from typing import Dict, Optional, Tuple
 
 from swanlab.data.modules import DataWrapper, Line
 from swanlab.env import create_time
@@ -42,6 +42,10 @@ def __init__(
         self.key = key
         # 当前 key 包含的 step
         self.steps = set()
+        # 当前 key 的 step 与首次写入 epoch 的映射，重复 step 覆盖时需要保持 epoch 不变
+        self._step_epochs: Dict[int, int] = {}
+        # 当前 key 的 step 与摘要值的映射，用于覆盖时重建 summary
+        self._step_summary_values: Dict[int, Optional[object]] = {}
         self.column_info: Optional[ColumnInfo] = None
         self._media_dir = media_dir
         self._log_dir = log_dir
@@ -110,22 +114,24 @@ def add(self, data: DataWrapper) -> MetricInfo:
         # 4. 更新 summary 并添加数据
         # 如果为Line且为NaN或者INF，不更新summary
         r = result.strings or result.float
-        if not data.type == Line or r not in [Line.nan, Line.inf]:
-            if self._summary.get("max") is None or r > self._summary["max"]:
-                self._summary["max"] = r
-                self._summary["max_step"] = result.step
-            if self._summary.get("min") is None or r < self._summary["min"]:
-                self._summary["min"] = r
-                self._summary["min_step"] = result.step
-        self._summary["num"] = self._summary.get("num", 0) + 1
-        self.steps.add(result.step)
-        swanlog.debug(f"Add data, key: {self.key}, step: {result.step}, data: {r}")
-        if len(self._collection["data"]) >= self.__slice_size:
-            self._collection = self.__new_metric_collection()
-
+        overwrite = result.step in self._step_epochs
+        if not overwrite:
+            self.steps.add(result.step)
+            self._step_epochs[result.step] = len(self.steps)
+        epoch = self._step_epochs[result.step]
         new_data = self.__new_metric(result.step, r, more=result.more)
-        self._collection["data"].append(new_data)
-        epoch = len(self.steps)
+
+        # 覆盖写入时，只有当前 step 持有的 extremum 被削弱/移除时才需要全量重建。
+        needs_rebuild = overwrite and self._should_rebuild_summary_on_overwrite(result.step, data.type, r)
+        self._set_summary_value(result.step, data.type, r)
+        if needs_rebuild:
+            self._rebuild_summary()
+        else:
+            self._update_summary_incremental(result.step, data.type, r)
+        self._update_collection(new_data, result.step, overwrite)
+        swanlog.debug(
+            f"{'Overwrite' if overwrite else 'Add'} data, key: {self.key}, step: {result.step}, data: {r}"
+        )
         mu = math.ceil(epoch / self.__slice_size)
         return MetricInfo(
             column_info=self.column_info,
@@ -137,8 +143,67 @@ def add(self, data: DataWrapper) -> MetricInfo:
             metric_file_name=str(mu * self.__slice_size) + ".log",
             swanlab_logdir=self._log_dir,
             swanlab_media_dir=self._media_dir if result.buffers else None,
+            metric_overwrite=overwrite,
         )
 
+    def _set_summary_value(self, step: int, data_type, value) -> None:
+        """
+        Remember the summary value of ``step`` for later summary rebuilds.
+
+        A Line value equal to ``Line.nan`` or ``Line.inf`` is stored as ``None``;
+        every other value is stored as given.
+
+        :param step: step the value belongs to
+        :param data_type: data class of the value, compared against ``Line``
+        :param value: summary value logged at ``step``
+        """
+        is_placeholder = data_type == Line and value in [Line.nan, Line.inf]
+        self._step_summary_values[step] = None if is_placeholder else value
+
+    def _should_rebuild_summary_on_overwrite(self, step: int, data_type, value) -> bool:
+        """
+        Decide whether overwriting ``step`` with ``value`` invalidates the cached summary.
+
+        A rebuild is required when ``step`` currently holds the recorded max or min and
+        the new value either is a Line NaN/INF placeholder or moves away from that
+        extremum (lower than the max, higher than the min).
+
+        :param step: step being overwritten
+        :param data_type: data class of the new value, compared against ``Line``
+        :param value: new summary value for ``step``
+        :return: True if the summary has to be recomputed from the stored per-step values
+        """
+        summary = self._summary
+        holds_max = step == summary.get("max_step")
+        holds_min = step == summary.get("min_step")
+
+        # A placeholder removes whatever extremum this step was holding.
+        if data_type == Line and value in [Line.nan, Line.inf]:
+            return holds_max or holds_min
+
+        if holds_max:
+            peak = summary.get("max")
+            if peak is not None and value < peak:
+                return True
+
+        if holds_min:
+            floor = summary.get("min")
+            if floor is not None and value > floor:
+                return True
+
+        return False
+
+    def _update_summary_incremental(self, step: int, data_type, value) -> None:
+        """
+        Fold one value into the running summary without rescanning earlier steps.
+
+        "num" is always refreshed from the number of known steps. "max"/"min" and their
+        steps are only touched when ``value`` is not a Line NaN/INF placeholder.
+
+        :param step: step the value belongs to
+        :param data_type: data class of the value, compared against ``Line``
+        :param value: summary value logged at ``step``
+        """
+        # Refresh the count before the placeholder check: NaN/INF points are still logged
+        # steps (and _rebuild_summary counts them), so returning first would leave "num" stale.
+        self._summary["num"] = len(self.steps)
+        if data_type == Line and value in [Line.nan, Line.inf]:
+            return
+        if self._summary.get("max") is None or value > self._summary["max"]:
+            self._summary["max"] = value
+            self._summary["max_step"] = step
+        if self._summary.get("min") is None or value < self._summary["min"]:
+            self._summary["min"] = value
+            self._summary["min_step"] = step
+
+    def _rebuild_summary(self) -> None:
+        """
+        Recompute the whole summary from the stored per-step values.
+
+        Steps are visited in ascending epoch order, so on equal values the step with the
+        smaller epoch keeps the extremum. Steps whose stored value is ``None`` (or
+        missing) only contribute to "num".
+        """
+        ordered_steps = sorted(self._step_epochs, key=self._step_epochs.__getitem__)
+        rebuilt = {"num": len(self.steps)}
+        for current_step in ordered_steps:
+            candidate = self._step_summary_values.get(current_step)
+            if candidate is None:
+                continue
+            if "max" not in rebuilt or candidate > rebuilt["max"]:
+                rebuilt.update(max=candidate, max_step=current_step)
+            if "min" not in rebuilt or candidate < rebuilt["min"]:
+                rebuilt.update(min=candidate, min_step=current_step)
+        self._summary = rebuilt
+
+    def _update_collection(self, new_data: dict, step: int, overwrite: bool) -> None:
+        """
+        Store ``new_data`` in the in-memory collection of the current slice.
+
+        If the current collection already has an entry whose "index" equals ``step``,
+        that entry is replaced in place. Otherwise an overwrite is a no-op here, and a
+        new step is appended, starting a fresh collection first when the current one
+        has reached the slice size.
+
+        :param new_data: metric record to store
+        :param step: step of the record
+        :param overwrite: whether ``step`` had already been logged for this key
+        """
+        records = self._collection["data"]
+        position = next((i for i, record in enumerate(records) if record["index"] == step), None)
+        if position is not None:
+            records[position] = new_data
+            return
+        if overwrite:
+            return
+        if len(records) >= self.__slice_size:
+            self._collection = self.__new_metric_collection()
+        self._collection["data"].append(new_data)
+
     def create_column(
         self,
         key: str,
@@ -301,8 +366,9 @@ def mock_from_remote(
             section_type=section_type,
         )
         key_obj.column_info = column_info
-        # 5. 设置当前步数，resume 后不允许设置历史步数，所以需要覆盖
+        # 5. 「同步云端最新 step」设置当前步数，resume 后不允许设置历史步数，所以需要覆盖
         if step is not None:
             for i in range(step + 1):
                 key_obj.steps.add(i)
+                key_obj._step_epochs[i] = i + 1
         return key_obj, column_info
diff --git a/swanlab/data/run/main.py b/swanlab/data/run/main.py
@@ -272,7 +272,7 @@ def log(self, data: dict, step: int = None):
             For nested dicts, keys will be joined with dots (e.g., {'a': {'b': 1}} becomes {'a.b': 1}).
         step : int, optional
             The step number of the current data, if not provided, it will be automatically incremented.
-            If step is duplicated, the data will be ignored.
+            If step is duplicated, the latest data will overwrite the previous data on that step.
 
         Raises
         ----------
diff --git a/swanlab/data/sdk.py b/swanlab/data/sdk.py
@@ -460,7 +460,7 @@ def log(
         The value must be a `float`, `float convertible object`, `int` or `swanlab.data.BaseType`.
     step : int, optional
         The step number of the current data, if not provided, it will be automatically incremented.
-        If step is duplicated, the data will be ignored.
+        If step is duplicated, the latest data will overwrite the previous data on that step.
     print_to_console : bool, optional
         Whether to print the data to the console, the default is False.
     """
diff --git a/swanlab/integration/accelerate.py b/swanlab/integration/accelerate.py
@@ -104,7 +104,7 @@ def log(self, values: dict, step: Optional[int] = None, **kwargs):
             The value must be a `float`, `float convertible object`, `int` or `swanlab.data.BaseType`.
         step : int, optional
             The step number of the current data, if not provided, it will be automatically incremented.
-        If step is duplicated, the data will be ignored.
+            If step is duplicated, the latest data will overwrite the previous data on that step.
             kwargs:
                 Additional key word arguments passed along to the `swanlab.log` method. Likes:
                     print_to_console : bool, optional
diff --git a/swanlab/toolkit/models/metric.py b/swanlab/toolkit/models/metric.py
@@ -171,6 +171,7 @@ def __init__(
         metric_file_name: Optional[str],
         swanlab_logdir: Optional[str],
         swanlab_media_dir: Optional[str],
+        metric_overwrite: bool = False,
         error: Optional[ParseErrorInfo] = None,
     ):
         """
@@ -184,6 +185,7 @@ def __init__(
         :param metric_file_name: 此指标的文件名
         :param swanlab_logdir: swanlab在本次实验的log文件夹路径
         :param swanlab_media_dir: swanlab在本次实验的media文件夹路径
+        :param metric_overwrite: 当前指标是否覆盖了已有 step
         :param error: 创建此指标时的错误信息
         """
         self.error = error
@@ -193,6 +195,7 @@ def __init__(
         self.metric_summary = metric_summary
         self.metric_step = metric_step
         self.metric_epoch = metric_epoch
+        self.metric_overwrite = metric_overwrite
         _id = self.column_info.kid
         self.metric_file_path = None if self.is_error else os.path.join(swanlab_logdir, _id, metric_file_name)
         self.summary_file_path = None if self.is_error else os.path.join(swanlab_logdir, _id, self.__SUMMARY_NAME)
@@ -252,5 +255,6 @@ def __init__(self, column_info: ColumnInfo, error: ParseErrorInfo):
             None,
             None,
             None,
+            False,
             error,
         )
diff --git a/test/metrics/echarts/calendar_test.py b/test/metrics/echarts/calendar_test.py
@@ -5,6 +5,7 @@
 // @description: 本文件是对于echarts的 Calendar 图表测试 , 文件名不叫calendar是因为和库文件重名
 """
 # ---------------------------------------------- Calendar - Calendar_heatmap ----------------------------------------------
+import tutils as T
 import random
 import datetime
 
diff --git a/test/unit/data/run/test_key.py b/test/unit/data/run/test_key.py
diff --git a/test/unit/data/run/test_main.py b/test/unit/data/run/test_main.py
diff --git a/tutils/check.py b/tutils/check.py