From 430f8e4e9357072fc084d61fe8523e37b1e9c318 Mon Sep 17 00:00:00 2001 From: GAURAV KARMAKAR Date: Thu, 25 Jun 2026 02:19:33 +0530 Subject: [PATCH 01/10] feat(smartdiff): semantic subtitle diff core (SRT parser + classifier) Add a Flask-decoupled smart-diff module that classifies *how* two subtitle outputs differ instead of producing a raw line diff: - srt.py: parse SubRip content into structured cues (BOM/CRLF tolerant). - compare.py: align cues and classify as identical, timing_shift (with a consistent offset_ms), text_change, missing_cues, extra_cues, or mixed, with an agent-actionable one-line summary. Includes unit tests for the parser and every classification branch. --- mod_test/smartdiff/__init__.py | 7 ++ mod_test/smartdiff/compare.py | 113 +++++++++++++++++++++++++++ mod_test/smartdiff/srt.py | 82 +++++++++++++++++++ tests/test_smartdiff/__init__.py | 1 + tests/test_smartdiff/test_compare.py | 74 ++++++++++++++++++ tests/test_smartdiff/test_srt.py | 48 ++++++++++++ 6 files changed, 325 insertions(+) create mode 100644 mod_test/smartdiff/__init__.py create mode 100644 mod_test/smartdiff/compare.py create mode 100644 mod_test/smartdiff/srt.py create mode 100644 tests/test_smartdiff/__init__.py create mode 100644 tests/test_smartdiff/test_compare.py create mode 100644 tests/test_smartdiff/test_srt.py diff --git a/mod_test/smartdiff/__init__.py b/mod_test/smartdiff/__init__.py new file mode 100644 index 00000000..7143d27c --- /dev/null +++ b/mod_test/smartdiff/__init__.py @@ -0,0 +1,7 @@ +"""Semantic ("smart") diff for subtitle regression outputs. + +Unlike a raw line diff, this package classifies *how* two outputs differ +(timing shift, text change, missing/extra cues) so a person or an agent gets an +actionable answer instead of a wall of changed lines. Pure/Flask-decoupled so it +is fully unit-testable. 
+""" diff --git a/mod_test/smartdiff/compare.py b/mod_test/smartdiff/compare.py new file mode 100644 index 00000000..90154d4f --- /dev/null +++ b/mod_test/smartdiff/compare.py @@ -0,0 +1,113 @@ +"""Semantic comparison of subtitle outputs: classify *how* two results differ.""" + +from typing import Dict, List, Optional + +from mod_test.smartdiff.srt import parse_srt + + +def _norm(text: str) -> str: + """ + Normalise cue text for comparison (collapse whitespace, case-fold). + + :param text: Raw cue text. + :type text: str + :return: Normalised text. + :rtype: str + """ + return ' '.join(text.split()).casefold() + + +def _result(kind: str, summary: str, n_exp: int, n_act: int, + offset_ms: Optional[int] = None) -> Dict[str, object]: + """ + Build a classification result dict. + + :param kind: The stable difference kind. + :type kind: str + :param summary: A human/agent-readable one-line explanation. + :type summary: str + :param n_exp: Number of expected cues. + :type n_exp: int + :param n_act: Number of actual cues. + :type n_act: int + :param offset_ms: Consistent timing offset, when ``kind`` is ``timing_shift``. + :type offset_ms: Optional[int] + :return: The classification result. + :rtype: Dict[str, object] + """ + out: Dict[str, object] = { + 'kind': kind, + 'summary': summary, + 'expected_cues': n_exp, + 'actual_cues': n_act, + } + if offset_ms is not None: + out['offset_ms'] = offset_ms + return out + + +def smart_diff(expected: str, actual: str) -> Dict[str, object]: + """ + Compare expected vs actual SubRip output and classify the difference. + + Aligns cues by position and reports the *kind* of difference rather than a + raw line diff: ``identical``, ``timing_shift`` (with a consistent offset), + ``text_change``, ``missing_cues``, ``extra_cues``, or ``mixed``. The goal is + an actionable answer ("subtitles are +120 ms late") instead of a wall of + changed lines. + + :param expected: The expected/baseline .srt content. 
+ :type expected: str + :param actual: The actual/produced .srt content. + :type actual: str + :return: A classification dict with keys ``kind``, ``summary``, + ``expected_cues``, ``actual_cues`` and (for ``timing_shift``) ``offset_ms``. + :rtype: Dict[str, object] + """ + exp = parse_srt(expected) + act = parse_srt(actual) + n_exp, n_act = len(exp), len(act) + count_mismatch = n_exp != n_act + + text_changes = 0 + timing_deltas: List[int] = [] + for e, a in zip(exp, act): + if _norm(e.text) != _norm(a.text): + text_changes += 1 + else: + timing_deltas.append(a.start_ms - e.start_ms) + + if not count_mismatch and text_changes == 0 and all(d == 0 for d in timing_deltas): + return _result('identical', 'Outputs are identical.', n_exp, n_act) + + uniform_shift = bool(timing_deltas) and len(set(timing_deltas)) == 1 + if not count_mismatch and text_changes == 0 and uniform_shift and timing_deltas[0] != 0: + offset = timing_deltas[0] + direction = 'late' if offset > 0 else 'early' + return _result( + 'timing_shift', + f'All {n_exp} cues match but are {abs(offset)} ms {direction}.', + n_exp, n_act, offset_ms=offset) + + if count_mismatch and text_changes == 0: + if n_act < n_exp: + return _result( + 'missing_cues', + f'{n_exp - n_act} of {n_exp} cues are missing from the output.', + n_exp, n_act) + return _result( + 'extra_cues', + f'Output has {n_act - n_exp} extra cues ({n_act} vs {n_exp} expected).', + n_exp, n_act) + + if not count_mismatch and text_changes > 0 and all(d == 0 for d in timing_deltas): + return _result( + 'text_change', + f'{text_changes} of {n_exp} cues differ in text only (timing matches).', + n_exp, n_act) + + return _result( + 'mixed', + f'Mixed differences: {text_changes} text change(s) across ' + f'{min(n_exp, n_act)} compared cues; expected {n_exp}, got {n_act}.', + n_exp, n_act) diff --git a/mod_test/smartdiff/srt.py b/mod_test/smartdiff/srt.py new file mode 100644 index 00000000..427b70cc --- /dev/null +++ b/mod_test/smartdiff/srt.py @@ -0,0 
+1,82 @@ +"""Parse SubRip (.srt) subtitle output into structured cues for comparison.""" + +import re +from dataclasses import dataclass +from typing import List, Optional + +_TIMING_RE = re.compile( + r'(\d{1,2}):(\d{2}):(\d{2})[,.](\d{1,3})\s*-->\s*' + r'(\d{1,2}):(\d{2}):(\d{2})[,.](\d{1,3})' +) + + +@dataclass +class Cue: + """ + A single subtitle cue: its timing window and text. + + :param index: The cue's sequence number as written in the file. + :type index: int + :param start_ms: Start time in milliseconds. + :type start_ms: int + :param end_ms: End time in milliseconds. + :type end_ms: int + :param text: The cue's text, newlines preserved and surrounding whitespace stripped. + :type text: str + """ + + index: int + start_ms: int + end_ms: int + text: str + + +def _to_ms(hours: str, minutes: str, seconds: str, millis: str) -> int: + """ + Convert the parts of an SRT timestamp into total milliseconds. + + :param hours: Hours component. + :type hours: str + :param minutes: Minutes component. + :type minutes: str + :param seconds: Seconds component. + :type seconds: str + :param millis: Milliseconds component. + :type millis: str + :return: The timestamp in milliseconds. + :rtype: int + """ + return ((int(hours) * 60 + int(minutes)) * 60 + int(seconds)) * 1000 + int(millis) + + +def parse_srt(content: str) -> List[Cue]: + """ + Parse SubRip subtitle text into a list of cues. + + Tolerant of a leading BOM, CRLF/CR line endings, and either ',' or '.' as the + millisecond separator. Blocks without a valid timing line are skipped. + + :param content: Raw .srt file content. + :type content: str + :return: The parsed cues, in file order. 
+ :rtype: List[Cue] + """ + content = content.lstrip('').replace('\r\n', '\n').replace('\r', '\n') + cues: List[Cue] = [] + for block in re.split(r'\n[ \t]*\n', content.strip()): + lines = block.split('\n') + timing_idx: Optional[int] = next( + (i for i, ln in enumerate(lines) if _TIMING_RE.search(ln)), None) + if timing_idx is None: + continue + match = _TIMING_RE.search(lines[timing_idx]) + if match is None: # pragma: no cover - guaranteed by the search above + continue + start_ms = _to_ms(match.group(1), match.group(2), match.group(3), match.group(4)) + end_ms = _to_ms(match.group(5), match.group(6), match.group(7), match.group(8)) + index = len(cues) + 1 + if timing_idx > 0 and lines[timing_idx - 1].strip().isdigit(): + index = int(lines[timing_idx - 1].strip()) + text = '\n'.join(lines[timing_idx + 1:]).strip() + cues.append(Cue(index=index, start_ms=start_ms, end_ms=end_ms, text=text)) + return cues diff --git a/tests/test_smartdiff/__init__.py b/tests/test_smartdiff/__init__.py new file mode 100644 index 00000000..68bc9996 --- /dev/null +++ b/tests/test_smartdiff/__init__.py @@ -0,0 +1 @@ +"""Tests for the smart-diff subtitle comparison.""" diff --git a/tests/test_smartdiff/test_compare.py b/tests/test_smartdiff/test_compare.py new file mode 100644 index 00000000..9d60a82b --- /dev/null +++ b/tests/test_smartdiff/test_compare.py @@ -0,0 +1,74 @@ +"""Tests for the semantic subtitle comparison / classifier.""" + +import unittest + +from mod_test.smartdiff.compare import smart_diff + + +def _srt(cues): + """ + Build SubRip text from (start_ms, end_ms, text) tuples. + + :param cues: Iterable of (start_ms, end_ms, text) tuples. + :type cues: list + :return: SubRip-formatted string. 
+ :rtype: str + """ + def stamp(ms): + h, ms = divmod(ms, 3600000) + m, ms = divmod(ms, 60000) + s, ms = divmod(ms, 1000) + return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}" + + blocks = [] + for i, (start, end, text) in enumerate(cues, start=1): + blocks.append(f"{i}\n{stamp(start)} --> {stamp(end)}\n{text}\n") + return "\n".join(blocks) + + +_BASE = [(1000, 4000, "Hello world"), (5000, 8000, "Second line")] + + +class SmartDiffTests(unittest.TestCase): + """Classifying the kind of difference between two outputs.""" + + def test_identical(self): + """Equal outputs classify as identical.""" + result = smart_diff(_srt(_BASE), _srt(_BASE)) + self.assertEqual(result["kind"], "identical") + + def test_timing_shift_reports_offset(self): + """A constant timing offset is reported as timing_shift with offset_ms.""" + shifted = [(s + 500, e + 500, t) for s, e, t in _BASE] + result = smart_diff(_srt(_BASE), _srt(shifted)) + self.assertEqual(result["kind"], "timing_shift") + self.assertEqual(result["offset_ms"], 500) + + def test_text_change_only(self): + """Same timing, different text classifies as text_change.""" + changed = [(1000, 4000, "Hello world"), (5000, 8000, "DIFFERENT")] + result = smart_diff(_srt(_BASE), _srt(changed)) + self.assertEqual(result["kind"], "text_change") + + def test_missing_cues(self): + """Fewer cues than expected classifies as missing_cues.""" + result = smart_diff(_srt(_BASE), _srt(_BASE[:1])) + self.assertEqual(result["kind"], "missing_cues") + self.assertEqual((result["expected_cues"], result["actual_cues"]), (2, 1)) + + def test_extra_cues(self): + """More cues than expected classifies as extra_cues.""" + more = _BASE + [(9000, 10000, "Third line")] + result = smart_diff(_srt(_BASE), _srt(more)) + self.assertEqual(result["kind"], "extra_cues") + + def test_mixed_when_text_and_count_differ(self): + """Both text changes and a count mismatch classify as mixed.""" + other = [(1000, 4000, "CHANGED"), (5000, 8000, "Second line"), + (9000, 10000, "Third")] 
+ result = smart_diff(_srt(_BASE), _srt(other)) + self.assertEqual(result["kind"], "mixed") + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_smartdiff/test_srt.py b/tests/test_smartdiff/test_srt.py new file mode 100644 index 00000000..55ed4c9c --- /dev/null +++ b/tests/test_smartdiff/test_srt.py @@ -0,0 +1,48 @@ +"""Tests for the SubRip (.srt) parser.""" + +import unittest + +from mod_test.smartdiff.srt import parse_srt + +_TWO_CUES = ( + "1\n" + "00:00:01,000 --> 00:00:04,000\n" + "Hello world\n" + "\n" + "2\n" + "00:00:05,500 --> 00:00:08,250\n" + "Second line\n" +) + + +class ParseSrtTests(unittest.TestCase): + """Parsing SubRip content into structured cues.""" + + def test_parses_index_timing_and_text(self): + """A two-cue file yields two cues with correct ms timing and text.""" + cues = parse_srt(_TWO_CUES) + self.assertEqual(len(cues), 2) + self.assertEqual((cues[0].index, cues[0].start_ms, cues[0].end_ms), (1, 1000, 4000)) + self.assertEqual(cues[0].text, "Hello world") + self.assertEqual((cues[1].start_ms, cues[1].end_ms), (5500, 8250)) + + def test_tolerates_crlf_and_bom(self): + """CRLF line endings and a leading BOM are handled.""" + cues = parse_srt("" + _TWO_CUES.replace("\n", "\r\n")) + self.assertEqual(len(cues), 2) + self.assertEqual(cues[1].text, "Second line") + + def test_skips_blocks_without_timing(self): + """A trailing junk block with no timing line is ignored.""" + cues = parse_srt(_TWO_CUES + "\nnot a cue\n") + self.assertEqual(len(cues), 2) + + def test_multiline_cue_text_preserved(self): + """Cue text spanning multiple lines is preserved with its newline.""" + content = "1\n00:00:01,000 --> 00:00:02,000\nline one\nline two\n" + cues = parse_srt(content) + self.assertEqual(cues[0].text, "line one\nline two") + + +if __name__ == "__main__": + unittest.main() From 4e8144b0477f36ef88fad882cc6294bb95288b1d Mon Sep 17 00:00:00 2001 From: GAURAV KARMAKAR Date: Thu, 25 Jun 2026 02:29:54 +0530 Subject: [PATCH 02/10] 
feat(smartdiff): add WebVTT support and a format dispatcher - vtt.py: parse WebVTT into cues (skips WEBVTT/NOTE/STYLE/REGION blocks, handles optional hours and trailing cue settings). - parsing.py: parse_subtitles() picks the parser by explicit hint or by auto-detecting the format from content. - compare.smart_diff() now takes an optional fmt and works across SRT/VTT. Adds parser tests for WebVTT and a cross-format auto-detect compare test. --- mod_test/smartdiff/compare.py | 28 +++++++----- mod_test/smartdiff/parsing.py | 26 +++++++++++ mod_test/smartdiff/vtt.py | 65 ++++++++++++++++++++++++++++ tests/test_smartdiff/test_compare.py | 8 ++++ tests/test_smartdiff/test_vtt.py | 44 +++++++++++++++++++ 5 files changed, 159 insertions(+), 12 deletions(-) create mode 100644 mod_test/smartdiff/parsing.py create mode 100644 mod_test/smartdiff/vtt.py create mode 100644 tests/test_smartdiff/test_vtt.py diff --git a/mod_test/smartdiff/compare.py b/mod_test/smartdiff/compare.py index 90154d4f..a3f9aafb 100644 --- a/mod_test/smartdiff/compare.py +++ b/mod_test/smartdiff/compare.py @@ -2,7 +2,7 @@ from typing import Dict, List, Optional -from mod_test.smartdiff.srt import parse_srt +from mod_test.smartdiff.parsing import parse_subtitles def _norm(text: str) -> str: @@ -46,26 +46,30 @@ def _result(kind: str, summary: str, n_exp: int, n_act: int, return out -def smart_diff(expected: str, actual: str) -> Dict[str, object]: +def smart_diff(expected: str, actual: str, + fmt: Optional[str] = None) -> Dict[str, object]: """ - Compare expected vs actual SubRip output and classify the difference. + Compare expected vs actual subtitle output and classify the difference. - Aligns cues by position and reports the *kind* of difference rather than a - raw line diff: ``identical``, ``timing_shift`` (with a consistent offset), - ``text_change``, ``missing_cues``, ``extra_cues``, or ``mixed``. The goal is - an actionable answer ("subtitles are +120 ms late") instead of a wall of - changed lines. 
+ Supports SubRip (.srt) and WebVTT (.vtt); the format is auto-detected from + content unless ``fmt`` is given. Aligns cues by position and reports the + *kind* of difference rather than a raw line diff: ``identical``, + ``timing_shift`` (with a consistent offset), ``text_change``, + ``missing_cues``, ``extra_cues``, or ``mixed``. The goal is an actionable + answer ("subtitles are +120 ms late") instead of a wall of changed lines. - :param expected: The expected/baseline .srt content. + :param expected: The expected/baseline subtitle content. :type expected: str - :param actual: The actual/produced .srt content. + :param actual: The actual/produced subtitle content. :type actual: str + :param fmt: Explicit format ('srt' or 'vtt'); auto-detected when None. + :type fmt: Optional[str] :return: A classification dict with keys ``kind``, ``summary``, ``expected_cues``, ``actual_cues`` and (for ``timing_shift``) ``offset_ms``. :rtype: Dict[str, object] """ - exp = parse_srt(expected) - act = parse_srt(actual) + exp = parse_subtitles(expected, fmt) + act = parse_subtitles(actual, fmt) n_exp, n_act = len(exp), len(act) count_mismatch = n_exp != n_act diff --git a/mod_test/smartdiff/parsing.py b/mod_test/smartdiff/parsing.py new file mode 100644 index 00000000..35bfe3d6 --- /dev/null +++ b/mod_test/smartdiff/parsing.py @@ -0,0 +1,26 @@ +"""Detect the subtitle format and dispatch to the right parser.""" + +from typing import List, Optional + +from mod_test.smartdiff.srt import Cue, parse_srt +from mod_test.smartdiff.vtt import parse_vtt + + +def parse_subtitles(content: str, fmt: Optional[str] = None) -> List[Cue]: + """ + Parse subtitle content into cues, choosing a parser by hint or by content. + + :param content: Raw subtitle file content. + :type content: str + :param fmt: Explicit format ('srt' or 'vtt'); auto-detected from content when None. + :type fmt: Optional[str] + :return: The parsed cues. 
+ :rtype: List[Cue] + """ + chosen = (fmt or '').lower() + if not chosen: + head = content.lstrip('').lstrip().upper() + chosen = 'vtt' if head.startswith('WEBVTT') else 'srt' + if chosen == 'vtt': + return parse_vtt(content) + return parse_srt(content) diff --git a/mod_test/smartdiff/vtt.py b/mod_test/smartdiff/vtt.py new file mode 100644 index 00000000..cd13af0f --- /dev/null +++ b/mod_test/smartdiff/vtt.py @@ -0,0 +1,65 @@ +"""Parse WebVTT (.vtt) subtitle output into structured cues.""" + +import re +from typing import List, Optional + +from mod_test.smartdiff.srt import Cue + +_TIMING_RE = re.compile( + r'(?:(\d{1,2}):)?(\d{2}):(\d{2})[.,](\d{3})\s*-->\s*' + r'(?:(\d{1,2}):)?(\d{2}):(\d{2})[.,](\d{3})' +) + +_METADATA_PREFIXES = ('WEBVTT', 'NOTE', 'STYLE', 'REGION') + + +def _to_ms(hours: Optional[str], minutes: str, seconds: str, millis: str) -> int: + """ + Convert WebVTT timestamp parts into total milliseconds. + + :param hours: Hours component, or None when absent (MM:SS.mmm form). + :type hours: Optional[str] + :param minutes: Minutes component. + :type minutes: str + :param seconds: Seconds component. + :type seconds: str + :param millis: Milliseconds component. + :type millis: str + :return: The timestamp in milliseconds. + :rtype: int + """ + hrs = int(hours) if hours else 0 + return ((hrs * 60 + int(minutes)) * 60 + int(seconds)) * 1000 + int(millis) + + +def parse_vtt(content: str) -> List[Cue]: + """ + Parse WebVTT subtitle text into a list of cues. + + Skips the ``WEBVTT`` header and ``NOTE``/``STYLE``/``REGION`` blocks, tolerates + an optional cue-identifier line, optional hours in timestamps, and trailing cue + settings after the end timestamp. + + :param content: Raw .vtt file content. + :type content: str + :return: The parsed cues, in file order. 
+ :rtype: List[Cue] + """ + content = content.lstrip('').replace('\r\n', '\n').replace('\r', '\n') + cues: List[Cue] = [] + for block in re.split(r'\n[ \t]*\n', content.strip()): + lines = block.split('\n') + if lines[0].split(' ', 1)[0] in _METADATA_PREFIXES: + continue + timing_idx: Optional[int] = next( + (i for i, ln in enumerate(lines) if _TIMING_RE.search(ln)), None) + if timing_idx is None: + continue + match = _TIMING_RE.search(lines[timing_idx]) + if match is None: # pragma: no cover - guaranteed by the search above + continue + start_ms = _to_ms(match.group(1), match.group(2), match.group(3), match.group(4)) + end_ms = _to_ms(match.group(5), match.group(6), match.group(7), match.group(8)) + text = '\n'.join(lines[timing_idx + 1:]).strip() + cues.append(Cue(index=len(cues) + 1, start_ms=start_ms, end_ms=end_ms, text=text)) + return cues diff --git a/tests/test_smartdiff/test_compare.py b/tests/test_smartdiff/test_compare.py index 9d60a82b..2e241d29 100644 --- a/tests/test_smartdiff/test_compare.py +++ b/tests/test_smartdiff/test_compare.py @@ -69,6 +69,14 @@ def test_mixed_when_text_and_count_differ(self): result = smart_diff(_srt(_BASE), _srt(other)) self.assertEqual(result["kind"], "mixed") + def test_works_on_webvtt_via_autodetect(self): + """smart_diff auto-detects WebVTT and still classifies a timing shift.""" + base = "WEBVTT\n\n00:00:01.000 --> 00:00:04.000\nHello\n" + shifted = "WEBVTT\n\n00:00:01.250 --> 00:00:04.250\nHello\n" + result = smart_diff(base, shifted) + self.assertEqual(result["kind"], "timing_shift") + self.assertEqual(result["offset_ms"], 250) + if __name__ == "__main__": unittest.main() diff --git a/tests/test_smartdiff/test_vtt.py b/tests/test_smartdiff/test_vtt.py new file mode 100644 index 00000000..695aeb11 --- /dev/null +++ b/tests/test_smartdiff/test_vtt.py @@ -0,0 +1,44 @@ +"""Tests for the WebVTT (.vtt) parser.""" + +import unittest + +from mod_test.smartdiff.vtt import parse_vtt + +_VTT = ( + "WEBVTT\n" + "\n" + "NOTE this 
is a comment\n" + "\n" + "1\n" + "00:00:01.000 --> 00:00:04.000 align:start position:50%\n" + "Hello world\n" + "\n" + "00:05.500 --> 00:08.250\n" + "Second line\n" +) + + +class ParseVttTests(unittest.TestCase): + """Parsing WebVTT content into structured cues.""" + + def test_parses_cues_and_skips_metadata(self): + """The WEBVTT header and NOTE block are skipped; cues are parsed.""" + cues = parse_vtt(_VTT) + self.assertEqual(len(cues), 2) + self.assertEqual((cues[0].start_ms, cues[0].end_ms), (1000, 4000)) + self.assertEqual(cues[0].text, "Hello world") + + def test_ignores_trailing_cue_settings(self): + """Cue settings after the end timestamp do not leak into timing/text.""" + cues = parse_vtt(_VTT) + self.assertEqual(cues[0].end_ms, 4000) + self.assertEqual(cues[0].text, "Hello world") + + def test_handles_optional_hours(self): + """A MM:SS.mmm timestamp without an hours component is parsed correctly.""" + cues = parse_vtt(_VTT) + self.assertEqual((cues[1].start_ms, cues[1].end_ms), (5500, 8250)) + + +if __name__ == "__main__": + unittest.main() From cda6de8881826c3faf3d26f162934658e613f64f Mon Sep 17 00:00:00 2001 From: GAURAV KARMAKAR Date: Sat, 27 Jun 2026 01:07:49 +0530 Subject: [PATCH 03/10] feat(smartdiff): CCExtractor-grounded normalization + cosmetic-diff kinds Mirror CCExtractor's own expected-output handling (tests/extract_expected.py): strip HTML/styling tags, unescape entities, and trim per-line trailing whitespace. This lets the comparator separate cosmetic differences from real text changes, adding two classifications: - formatting_change: cues differ only in tags/entities, not text. - whitespace_change: cues differ only in CEA-608 trailing padding. Parsers now preserve raw cue text (only surrounding blank lines are dropped) so the comparator, not the parser, decides what is cosmetic. Verified against a real CCExtractor CEA-608 sample. 
--- mod_test/smartdiff/compare.py | 73 +++++++++++++-------- mod_test/smartdiff/normalize.py | 90 ++++++++++++++++++++++++++ mod_test/smartdiff/srt.py | 22 ++++++- mod_test/smartdiff/vtt.py | 4 +- tests/test_smartdiff/test_compare.py | 13 ++++ tests/test_smartdiff/test_normalize.py | 42 ++++++++++++ 6 files changed, 213 insertions(+), 31 deletions(-) create mode 100644 mod_test/smartdiff/normalize.py create mode 100644 tests/test_smartdiff/test_normalize.py diff --git a/mod_test/smartdiff/compare.py b/mod_test/smartdiff/compare.py index a3f9aafb..2eea1d03 100644 --- a/mod_test/smartdiff/compare.py +++ b/mod_test/smartdiff/compare.py @@ -2,21 +2,10 @@ from typing import Dict, List, Optional +from mod_test.smartdiff.normalize import classify_text_pair from mod_test.smartdiff.parsing import parse_subtitles -def _norm(text: str) -> str: - """ - Normalise cue text for comparison (collapse whitespace, case-fold). - - :param text: Raw cue text. - :type text: str - :return: Normalised text. - :rtype: str - """ - return ' '.join(text.split()).casefold() - - def _result(kind: str, summary: str, n_exp: int, n_act: int, offset_ms: Optional[int] = None) -> Dict[str, object]: """ @@ -55,8 +44,9 @@ def smart_diff(expected: str, actual: str, content unless ``fmt`` is given. Aligns cues by position and reports the *kind* of difference rather than a raw line diff: ``identical``, ``timing_shift`` (with a consistent offset), ``text_change``, - ``missing_cues``, ``extra_cues``, or ``mixed``. The goal is an actionable - answer ("subtitles are +120 ms late") instead of a wall of changed lines. + ``formatting_change`` (tags/entities only), ``whitespace_change`` (CEA-608 + padding only), ``missing_cues``, ``extra_cues``, or ``mixed``. The goal is an + actionable answer ("subtitles are +120 ms late") instead of a wall of lines. :param expected: The expected/baseline subtitle content. 
:type expected: str @@ -74,18 +64,32 @@ def smart_diff(expected: str, actual: str, count_mismatch = n_exp != n_act text_changes = 0 + formatting_changes = 0 + whitespace_changes = 0 + raw_matches = True timing_deltas: List[int] = [] - for e, a in zip(exp, act): - if _norm(e.text) != _norm(a.text): + for expected_cue, actual_cue in zip(exp, act): + category = classify_text_pair(expected_cue.text, actual_cue.text) + if category != 'match': + raw_matches = False + if category == 'text': text_changes += 1 - else: - timing_deltas.append(a.start_ms - e.start_ms) + continue + if category == 'formatting': + formatting_changes += 1 + elif category == 'whitespace': + whitespace_changes += 1 + timing_deltas.append(actual_cue.start_ms - expected_cue.start_ms) + + no_timing_move = all(delta == 0 for delta in timing_deltas) + uniform_shift = bool(timing_deltas) and len(set(timing_deltas)) == 1 + cosmetic_changes = formatting_changes + whitespace_changes + fully_aligned = text_changes == 0 and cosmetic_changes == 0 - if not count_mismatch and text_changes == 0 and all(d == 0 for d in timing_deltas): + if not count_mismatch and raw_matches and no_timing_move: return _result('identical', 'Outputs are identical.', n_exp, n_act) - uniform_shift = bool(timing_deltas) and len(set(timing_deltas)) == 1 - if not count_mismatch and text_changes == 0 and uniform_shift and timing_deltas[0] != 0: + if not count_mismatch and fully_aligned and uniform_shift and timing_deltas[0] != 0: offset = timing_deltas[0] direction = 'late' if offset > 0 else 'early' return _result( @@ -104,14 +108,27 @@ def smart_diff(expected: str, actual: str, f'Output has {n_act - n_exp} extra cues ({n_act} vs {n_exp} expected).', n_exp, n_act) - if not count_mismatch and text_changes > 0 and all(d == 0 for d in timing_deltas): - return _result( - 'text_change', - f'{text_changes} of {n_exp} cues differ in text only (timing matches).', - n_exp, n_act) + if not count_mismatch and no_timing_move: + if text_changes > 0: + 
+ return _result( + 'text_change', + f'{text_changes} of {n_exp} cues differ in text (timing aligned).', + n_exp, n_act) + if formatting_changes > 0 and whitespace_changes == 0: + return _result( + 'formatting_change', + f'{formatting_changes} of {n_exp} cues differ only in formatting ' + f'(tags/entities), not text.', + n_exp, n_act) + if whitespace_changes > 0 and formatting_changes == 0: + return _result( + 'whitespace_change', + f'{whitespace_changes} of {n_exp} cues differ only in trailing ' + f'whitespace/padding.', + n_exp, n_act) return _result( 'mixed', - f'Mixed differences: {text_changes} text change(s) across ' - f'{min(n_exp, n_act)} compared cues; expected {n_exp}, got {n_act}.', + f'Mixed differences across {min(n_exp, n_act)} compared cues; ' + f'expected {n_exp}, got {n_act}.', n_exp, n_act) diff --git a/mod_test/smartdiff/normalize.py b/mod_test/smartdiff/normalize.py new file mode 100644 index 00000000..3ecfb2e8 --- /dev/null +++ b/mod_test/smartdiff/normalize.py @@ -0,0 +1,90 @@ +"""Normalisation that mirrors CCExtractor's own expected-output handling. + +CCExtractor's test harness (``tests/extract_expected.py``) compares outputs +after stripping HTML/styling tags, unescaping entities, and trimming trailing +whitespace from each line (CEA-608 captions are space-padded to a fixed grid). +Reusing the same rules lets the smart diff separate a *cosmetic* difference +(padding or styling only) from a real text change. +""" + +import re + +_TAG_RE = re.compile(r'<[^>]+>') + +# Same entities CCExtractor's extract_expected.py unescapes; '&amp;' is applied +# last so an escaped entity like '&amp;lt;' is not double-decoded. +_ENTITIES = ( + ('&lt;', '<'), ('&gt;', '>'), ('&quot;', '"'), ('&apos;', "'"), + ('&deg;', '°'), ('&nbsp;', ' '), ('&amp;', '&'), +) + + +def strip_tags(text: str) -> str: + """ + Remove HTML/styling tags such as ``<i>`` or ``<font>``. + + :param text: Raw cue text. + :type text: str + :return: Text with tags removed. 
+ :rtype: str + """ + return _TAG_RE.sub('', text) + + +def unescape(text: str) -> str: + """ + Unescape the HTML entities CCExtractor emits. + + :param text: Raw cue text. + :type text: str + :return: Text with entities decoded. + :rtype: str + """ + for entity, char in _ENTITIES: + text = text.replace(entity, char) + return text + + +def rstrip_lines(text: str) -> str: + """ + Trim trailing whitespace from each line (CEA-608 padding is cosmetic). + + :param text: Raw cue text. + :type text: str + :return: Text with per-line trailing whitespace removed. + :rtype: str + """ + return '\n'.join(line.rstrip() for line in text.split('\n')) + + +def plain(text: str) -> str: + """ + Fully normalise: unescape entities, strip tags, trim trailing whitespace. + + :param text: Raw cue text. + :type text: str + :return: The fully normalised text. + :rtype: str + """ + return rstrip_lines(strip_tags(unescape(text))) + + +def classify_text_pair(expected: str, actual: str) -> str: + """ + Classify how two cue texts differ, ignoring progressively more cosmetics. + + :param expected: Expected cue text. + :type expected: str + :param actual: Actual cue text. + :type actual: str + :return: ``match`` (identical), ``whitespace`` (only trailing padding differs), + ``formatting`` (only tags/entities differ), or ``text`` (a real change). + :rtype: str + """ + if expected == actual: + return 'match' + if rstrip_lines(expected) == rstrip_lines(actual): + return 'whitespace' + if plain(expected) == plain(actual): + return 'formatting' + return 'text' diff --git a/mod_test/smartdiff/srt.py b/mod_test/smartdiff/srt.py index 427b70cc..3f7388ed 100644 --- a/mod_test/smartdiff/srt.py +++ b/mod_test/smartdiff/srt.py @@ -31,6 +31,26 @@ class Cue: text: str +def join_cue_text(lines: List[str]) -> str: + """ + Join cue text lines, dropping surrounding blank lines but keeping trailing spaces. 
+ + Trailing whitespace is preserved on purpose: CCExtractor pads CEA-608 captions, + and the comparator (not the parser) decides whether that padding is cosmetic. + + :param lines: The text lines following a cue's timing line. + :type lines: List[str] + :return: The joined cue text. + :rtype: str + """ + start, end = 0, len(lines) + while start < end and lines[start].strip() == '': + start += 1 + while end > start and lines[end - 1].strip() == '': + end -= 1 + return '\n'.join(lines[start:end]) + + def _to_ms(hours: str, minutes: str, seconds: str, millis: str) -> int: """ Convert the parts of an SRT timestamp into total milliseconds. @@ -77,6 +97,6 @@ def parse_srt(content: str) -> List[Cue]: index = len(cues) + 1 if timing_idx > 0 and lines[timing_idx - 1].strip().isdigit(): index = int(lines[timing_idx - 1].strip()) - text = '\n'.join(lines[timing_idx + 1:]).strip() + text = join_cue_text(lines[timing_idx + 1:]) cues.append(Cue(index=index, start_ms=start_ms, end_ms=end_ms, text=text)) return cues diff --git a/mod_test/smartdiff/vtt.py b/mod_test/smartdiff/vtt.py index cd13af0f..dbb12e84 100644 --- a/mod_test/smartdiff/vtt.py +++ b/mod_test/smartdiff/vtt.py @@ -3,7 +3,7 @@ import re from typing import List, Optional -from mod_test.smartdiff.srt import Cue +from mod_test.smartdiff.srt import Cue, join_cue_text _TIMING_RE = re.compile( r'(?:(\d{1,2}):)?(\d{2}):(\d{2})[.,](\d{3})\s*-->\s*' @@ -60,6 +60,6 @@ def parse_vtt(content: str) -> List[Cue]: continue start_ms = _to_ms(match.group(1), match.group(2), match.group(3), match.group(4)) end_ms = _to_ms(match.group(5), match.group(6), match.group(7), match.group(8)) - text = '\n'.join(lines[timing_idx + 1:]).strip() + text = join_cue_text(lines[timing_idx + 1:]) cues.append(Cue(index=len(cues) + 1, start_ms=start_ms, end_ms=end_ms, text=text)) return cues diff --git a/tests/test_smartdiff/test_compare.py b/tests/test_smartdiff/test_compare.py index 2e241d29..dbbc034d 100644 --- 
a/tests/test_smartdiff/test_compare.py +++ b/tests/test_smartdiff/test_compare.py @@ -27,6 +27,7 @@ def stamp(ms): _BASE = [(1000, 4000, "Hello world"), (5000, 8000, "Second line")] +_BASE_CAPS = [(1000, 4000, "HELLO WORLD"), (5000, 8000, "SECOND LINE")] class SmartDiffTests(unittest.TestCase): @@ -77,6 +78,18 @@ def test_works_on_webvtt_via_autodetect(self): self.assertEqual(result["kind"], "timing_shift") self.assertEqual(result["offset_ms"], 250) + def test_whitespace_padding_only(self): + """Trailing CEA-608 padding differences are flagged as cosmetic, not text.""" + padded = [(1000, 4000, "HELLO WORLD "), (5000, 8000, "SECOND LINE ")] + result = smart_diff(_srt(_BASE_CAPS), _srt(padded)) + self.assertEqual(result["kind"], "whitespace_change") + + def test_formatting_tags_only(self): + """A styling-tags-only difference is flagged as formatting, not text.""" + styled = [(1000, 4000, "Hello world"), (5000, 8000, "Second line")] + result = smart_diff(_srt(_BASE), _srt(styled)) + self.assertEqual(result["kind"], "formatting_change") + if __name__ == "__main__": unittest.main() diff --git a/tests/test_smartdiff/test_normalize.py b/tests/test_smartdiff/test_normalize.py new file mode 100644 index 00000000..6d7183d2 --- /dev/null +++ b/tests/test_smartdiff/test_normalize.py @@ -0,0 +1,42 @@ +"""Tests for CCExtractor-style normalisation of cue text.""" + +import unittest + +from mod_test.smartdiff.normalize import (classify_text_pair, plain, + strip_tags, unescape) + + +class NormalizeTests(unittest.TestCase): + """Tag stripping, entity unescaping, and cue-text classification.""" + + def test_strip_tags(self): + """HTML/styling tags are removed.""" + self.assertEqual(strip_tags('hi'), 'hi') + + def test_unescape_entities(self): + """Known HTML entities are decoded, including a nested &.""" + self.assertEqual(unescape('a <b> & 30°'), 'a & 30°') + + def test_plain_combines_rules(self): + """plain() strips tags, unescapes, and rstrips padding together.""" + 
self.assertEqual(plain('hi & bye '), 'hi & bye') + + def test_classify_match(self): + """Identical text classifies as match.""" + self.assertEqual(classify_text_pair('hello', 'hello'), 'match') + + def test_classify_whitespace_only(self): + """Trailing CEA-608 padding differences classify as whitespace.""" + self.assertEqual(classify_text_pair('HELLO WORLD', 'HELLO WORLD '), 'whitespace') + + def test_classify_formatting_only(self): + """A tags-only difference classifies as formatting.""" + self.assertEqual(classify_text_pair('hello', 'hello'), 'formatting') + + def test_classify_real_text_change(self): + """A genuine text change classifies as text.""" + self.assertEqual(classify_text_pair('hello', 'goodbye'), 'text') + + +if __name__ == "__main__": + unittest.main() From 8f59b35d22aeb50e085952466d0561bf161ff2d5 Mon Sep 17 00:00:00 2001 From: GAURAV KARMAKAR Date: Sat, 27 Jun 2026 01:21:02 +0530 Subject: [PATCH 04/10] feat(smartdiff): timing drift, split/merged cues, real golden fixtures - timing_drift: detect a growing (non-constant) offset across cues, the signature of a progressive sync bug, distinct from a constant timing_shift. - split_cues / merged_cues: when cue count changes but the text content is unchanged, report re-segmentation instead of missing/extra cues. - Vendor a real CCExtractor CEA-608 sample (tests/.../fixtures/cea608_real.srt) and add golden-fixture tests so the diff is exercised on true output: identical, constant shift, and cosmetic de-padding. 
--- mod_test/smartdiff/compare.py | 74 +++++++++++++++---- tests/test_smartdiff/fixtures/cea608_real.srt | 9 +++ tests/test_smartdiff/test_compare.py | 21 ++++++ tests/test_smartdiff/test_fixtures.py | 58 +++++++++++++++ 4 files changed, 149 insertions(+), 13 deletions(-) create mode 100644 tests/test_smartdiff/fixtures/cea608_real.srt create mode 100644 tests/test_smartdiff/test_fixtures.py diff --git a/mod_test/smartdiff/compare.py b/mod_test/smartdiff/compare.py index 2eea1d03..801c108f 100644 --- a/mod_test/smartdiff/compare.py +++ b/mod_test/smartdiff/compare.py @@ -2,8 +2,9 @@ from typing import Dict, List, Optional -from mod_test.smartdiff.normalize import classify_text_pair +from mod_test.smartdiff.normalize import classify_text_pair, plain from mod_test.smartdiff.parsing import parse_subtitles +from mod_test.smartdiff.srt import Cue def _result(kind: str, summary: str, n_exp: int, n_act: int, @@ -35,6 +36,33 @@ def _result(kind: str, summary: str, n_exp: int, n_act: int, return out +def _content(cues: List[Cue]) -> str: + """ + Join all cues' normalised, whitespace-collapsed text — for split/merge detection. + + :param cues: The parsed cues. + :type cues: List[Cue] + :return: A single normalised token string spanning every cue. + :rtype: str + """ + return ' '.join(' '.join(plain(cue.text).split()) for cue in cues) + + +def _monotonic(values: List[int]) -> bool: + """ + Report whether a sequence is non-decreasing or non-increasing. + + :param values: The sequence to test. + :type values: List[int] + :return: True if monotonic in either direction. 
+ :rtype: bool + """ + pairs = list(zip(values, values[1:])) + non_decreasing = all(a <= b for a, b in pairs) + non_increasing = all(a >= b for a, b in pairs) + return non_decreasing or non_increasing + + def smart_diff(expected: str, actual: str, fmt: Optional[str] = None) -> Dict[str, object]: """ @@ -43,10 +71,10 @@ def smart_diff(expected: str, actual: str, Supports SubRip (.srt) and WebVTT (.vtt); the format is auto-detected from content unless ``fmt`` is given. Aligns cues by position and reports the *kind* of difference rather than a raw line diff: ``identical``, - ``timing_shift`` (with a consistent offset), ``text_change``, - ``formatting_change`` (tags/entities only), ``whitespace_change`` (CEA-608 - padding only), ``missing_cues``, ``extra_cues``, or ``mixed``. The goal is an - actionable answer ("subtitles are +120 ms late") instead of a wall of lines. + ``timing_shift`` (constant offset), ``timing_drift`` (growing offset), + ``text_change``, ``formatting_change`` (tags/entities only), + ``whitespace_change`` (CEA-608 padding only), ``split_cues``, + ``merged_cues``, ``missing_cues``, ``extra_cues``, or ``mixed``. :param expected: The expected/baseline subtitle content. 
:type expected: str @@ -83,6 +111,8 @@ def smart_diff(expected: str, actual: str, no_timing_move = all(delta == 0 for delta in timing_deltas) uniform_shift = bool(timing_deltas) and len(set(timing_deltas)) == 1 + varying_timing = len(set(timing_deltas)) > 1 + drifting = varying_timing and _monotonic(timing_deltas) cosmetic_changes = formatting_changes + whitespace_changes fully_aligned = text_changes == 0 and cosmetic_changes == 0 @@ -97,17 +127,35 @@ def smart_diff(expected: str, actual: str, f'All {n_exp} cues match but are {abs(offset)} ms {direction}.', n_exp, n_act, offset_ms=offset) - if count_mismatch and text_changes == 0: - if n_act < n_exp: - return _result( - 'missing_cues', - f'{n_exp - n_act} of {n_exp} cues are missing from the output.', - n_exp, n_act) + if not count_mismatch and fully_aligned and drifting: + first, last = timing_deltas[0], timing_deltas[-1] return _result( - 'extra_cues', - f'Output has {n_act - n_exp} extra cues ({n_act} vs {n_exp} expected).', + 'timing_drift', + f'Timing drifts from {first:+d} ms to {last:+d} ms across {n_exp} cues.', n_exp, n_act) + if count_mismatch: + if _content(exp) and _content(exp) == _content(act): + if n_act > n_exp: + return _result( + 'split_cues', + f'Same text, but cues were split: expected {n_exp}, got {n_act}.', + n_exp, n_act) + return _result( + 'merged_cues', + f'Same text, but cues were merged: expected {n_exp}, got {n_act}.', + n_exp, n_act) + if text_changes == 0: + if n_act < n_exp: + return _result( + 'missing_cues', + f'{n_exp - n_act} of {n_exp} cues are missing from the output.', + n_exp, n_act) + return _result( + 'extra_cues', + f'Output has {n_act - n_exp} extra cues ({n_act} vs {n_exp} expected).', + n_exp, n_act) + if not count_mismatch and no_timing_move: if text_changes > 0: return _result( diff --git a/tests/test_smartdiff/fixtures/cea608_real.srt b/tests/test_smartdiff/fixtures/cea608_real.srt new file mode 100644 index 00000000..d0bf07ab --- /dev/null +++ 
b/tests/test_smartdiff/fixtures/cea608_real.srt @@ -0,0 +1,9 @@ +1 +00:00:05,956 --> 00:00:07,955 +CCextractor Start crdit Testing + +2 +00:00:13,913 --> 00:00:15,080 +>> WHICH OF THESE STORIES WILL +YOU BE TALKING ABOUT TRO + diff --git a/tests/test_smartdiff/test_compare.py b/tests/test_smartdiff/test_compare.py index dbbc034d..00de1746 100644 --- a/tests/test_smartdiff/test_compare.py +++ b/tests/test_smartdiff/test_compare.py @@ -90,6 +90,27 @@ def test_formatting_tags_only(self): result = smart_diff(_srt(_BASE), _srt(styled)) self.assertEqual(result["kind"], "formatting_change") + def test_timing_drift_growing_offset(self): + """A growing (not constant) timing offset classifies as timing_drift.""" + base = [(1000, 2000, "A"), (5000, 6000, "B"), (9000, 10000, "C")] + drifted = [(1000, 2000, "A"), (5040, 6040, "B"), (9080, 10080, "C")] + result = smart_diff(_srt(base), _srt(drifted)) + self.assertEqual(result["kind"], "timing_drift") + + def test_split_cues_same_text_more_cues(self): + """One cue rendered as two (same words) classifies as split_cues.""" + one = [(1000, 4000, "hello world")] + two = [(1000, 2000, "hello"), (2000, 4000, "world")] + result = smart_diff(_srt(one), _srt(two)) + self.assertEqual(result["kind"], "split_cues") + + def test_merged_cues_same_text_fewer_cues(self): + """Two cues collapsed into one (same words) classifies as merged_cues.""" + two = [(1000, 2000, "hello"), (2000, 4000, "world")] + one = [(1000, 4000, "hello world")] + result = smart_diff(_srt(two), _srt(one)) + self.assertEqual(result["kind"], "merged_cues") + if __name__ == "__main__": unittest.main() diff --git a/tests/test_smartdiff/test_fixtures.py b/tests/test_smartdiff/test_fixtures.py new file mode 100644 index 00000000..cac9c479 --- /dev/null +++ b/tests/test_smartdiff/test_fixtures.py @@ -0,0 +1,58 @@ +"""Golden-fixture tests against a real CCExtractor CEA-608 sample output.""" + +import os +import unittest + +from mod_test.smartdiff.compare import smart_diff +from 
mod_test.smartdiff.srt import parse_srt + +_FIXTURE = os.path.join(os.path.dirname(__file__), 'fixtures', 'cea608_real.srt') + + +def _load(): + """ + Read the vendored real CCExtractor sample. + + :return: The raw .srt content. + :rtype: str + """ + with open(_FIXTURE, encoding='utf-8') as handle: + return handle.read() + + +class RealSampleTests(unittest.TestCase): + """Exercise the smart diff on genuine CCExtractor output, not synthetic strings.""" + + def test_parses_real_sample(self): + """The real sample parses into its two CEA-608 cues.""" + cues = parse_srt(_load()) + self.assertEqual(len(cues), 2) + self.assertEqual(cues[0].start_ms, 5956) + + def test_identical_against_itself(self): + """The real sample compared with itself is identical.""" + raw = _load() + self.assertEqual(smart_diff(raw, raw)["kind"], "identical") + + def test_constant_shift_on_real_sample(self): + """Shifting every timestamp by a constant is detected as timing_shift.""" + raw = _load() + shifted = (raw + .replace('00:00:05,956', '00:00:06,206') + .replace('00:00:07,955', '00:00:08,205') + .replace('00:00:13,913', '00:00:14,163') + .replace('00:00:15,080', '00:00:15,330')) + result = smart_diff(raw, shifted) + self.assertEqual(result["kind"], "timing_shift") + self.assertEqual(result["offset_ms"], 250) + + def test_depadding_is_cosmetic_on_real_sample(self): + """Stripping the CEA-608 trailing padding is flagged as cosmetic only.""" + raw = _load() + depadded = '\n'.join(line.rstrip() for line in raw.split('\n')) + result = smart_diff(raw, depadded) + self.assertIn(result["kind"], ("identical", "whitespace_change")) + + +if __name__ == "__main__": + unittest.main() From 9c96da9738a177d079649b19d0d7e5da8ad5f05b Mon Sep 17 00:00:00 2001 From: GAURAV KARMAKAR Date: Mon, 29 Jun 2026 23:06:00 +0530 Subject: [PATCH 05/10] feat(smartdiff): detect encoding-only differences (non-ASCII/accents) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add 
ascii_fold() and an 'encoding' text category so the comparator can tell a charset difference (e.g. CCExtractor's -latin1 output: 'Voilà' vs 'Voila') from a real word change. Surfaced as a new 'encoding_change' classification. --- mod_test/smartdiff/compare.py | 20 +++++++++++++----- mod_test/smartdiff/normalize.py | 28 ++++++++++++++++++++++++-- tests/test_smartdiff/test_compare.py | 7 +++++++ tests/test_smartdiff/test_normalize.py | 12 +++++++++-- 4 files changed, 58 insertions(+), 9 deletions(-) diff --git a/mod_test/smartdiff/compare.py b/mod_test/smartdiff/compare.py index 801c108f..db0b1366 100644 --- a/mod_test/smartdiff/compare.py +++ b/mod_test/smartdiff/compare.py @@ -73,8 +73,9 @@ def smart_diff(expected: str, actual: str, *kind* of difference rather than a raw line diff: ``identical``, ``timing_shift`` (constant offset), ``timing_drift`` (growing offset), ``text_change``, ``formatting_change`` (tags/entities only), - ``whitespace_change`` (CEA-608 padding only), ``split_cues``, - ``merged_cues``, ``missing_cues``, ``extra_cues``, or ``mixed``. + ``whitespace_change`` (CEA-608 padding only), ``encoding_change`` + (non-ASCII/accented characters only), ``split_cues``, ``merged_cues``, + ``missing_cues``, ``extra_cues``, or ``mixed``. :param expected: The expected/baseline subtitle content. 
:type expected: str @@ -94,6 +95,7 @@ def smart_diff(expected: str, actual: str, text_changes = 0 formatting_changes = 0 whitespace_changes = 0 + encoding_changes = 0 raw_matches = True timing_deltas: List[int] = [] for expected_cue, actual_cue in zip(exp, act): @@ -107,13 +109,15 @@ def smart_diff(expected: str, actual: str, formatting_changes += 1 elif category == 'whitespace': whitespace_changes += 1 + elif category == 'encoding': + encoding_changes += 1 timing_deltas.append(actual_cue.start_ms - expected_cue.start_ms) no_timing_move = all(delta == 0 for delta in timing_deltas) uniform_shift = bool(timing_deltas) and len(set(timing_deltas)) == 1 varying_timing = len(set(timing_deltas)) > 1 drifting = varying_timing and _monotonic(timing_deltas) - cosmetic_changes = formatting_changes + whitespace_changes + cosmetic_changes = formatting_changes + whitespace_changes + encoding_changes fully_aligned = text_changes == 0 and cosmetic_changes == 0 if not count_mismatch and raw_matches and no_timing_move: @@ -162,13 +166,19 @@ def smart_diff(expected: str, actual: str, 'text_change', f'{text_changes} of {n_exp} cues differ in text (timing aligned).', n_exp, n_act) - if formatting_changes > 0 and whitespace_changes == 0: + if encoding_changes > 0 and formatting_changes == 0 and whitespace_changes == 0: + return _result( + 'encoding_change', + f'{encoding_changes} of {n_exp} cues differ only in character ' + f'encoding (non-ASCII/accented characters).', + n_exp, n_act) + if formatting_changes > 0 and whitespace_changes == 0 and encoding_changes == 0: return _result( 'formatting_change', f'{formatting_changes} of {n_exp} cues differ only in formatting ' f'(tags/entities), not text.', n_exp, n_act) - if whitespace_changes > 0 and formatting_changes == 0: + if whitespace_changes > 0 and formatting_changes == 0 and encoding_changes == 0: return _result( 'whitespace_change', f'{whitespace_changes} of {n_exp} cues differ only in trailing ' diff --git 
a/mod_test/smartdiff/normalize.py b/mod_test/smartdiff/normalize.py index 3ecfb2e8..6a7d1cdd 100644 --- a/mod_test/smartdiff/normalize.py +++ b/mod_test/smartdiff/normalize.py @@ -8,6 +8,7 @@ """ import re +import unicodedata _TAG_RE = re.compile(r'<[^>]+>') @@ -69,6 +70,23 @@ def plain(text: str) -> str: return rstrip_lines(strip_tags(unescape(text))) +def ascii_fold(text: str) -> str: + """ + Fold text to ASCII by decomposing accents and dropping non-ASCII characters. + + Lets the comparator tell a charset/encoding difference (e.g. CCExtractor's + ``-latin1`` output) from a real word change: 'Voilà' and 'Voila' share an + ASCII skeleton, so only their non-ASCII characters differ. + + :param text: Raw cue text. + :type text: str + :return: The ASCII skeleton of the text. + :rtype: str + """ + decomposed = unicodedata.normalize('NFKD', text) + return ''.join(ch for ch in decomposed if ord(ch) < 128) + + def classify_text_pair(expected: str, actual: str) -> str: """ Classify how two cue texts differ, ignoring progressively more cosmetics. @@ -78,13 +96,19 @@ def classify_text_pair(expected: str, actual: str) -> str: :param actual: Actual cue text. :type actual: str :return: ``match`` (identical), ``whitespace`` (only trailing padding differs), - ``formatting`` (only tags/entities differ), or ``text`` (a real change). + ``formatting`` (only tags/entities differ), ``encoding`` (only non-ASCII + characters differ), or ``text`` (a real change). 
:rtype: str """ if expected == actual: return 'match' if rstrip_lines(expected) == rstrip_lines(actual): return 'whitespace' - if plain(expected) == plain(actual): + expected_plain = plain(expected) + actual_plain = plain(actual) + if expected_plain == actual_plain: return 'formatting' + has_non_ascii = any(ord(ch) > 127 for ch in expected_plain + actual_plain) + if has_non_ascii and ascii_fold(expected_plain) == ascii_fold(actual_plain): + return 'encoding' return 'text' diff --git a/tests/test_smartdiff/test_compare.py b/tests/test_smartdiff/test_compare.py index 00de1746..ef3e4e27 100644 --- a/tests/test_smartdiff/test_compare.py +++ b/tests/test_smartdiff/test_compare.py @@ -111,6 +111,13 @@ def test_merged_cues_same_text_fewer_cues(self): result = smart_diff(_srt(two), _srt(one)) self.assertEqual(result["kind"], "merged_cues") + def test_encoding_change_non_ascii_only(self): + """A charset difference (accents only, e.g. -latin1) is flagged as encoding.""" + accented = [(1000, 4000, "Voilà"), (5000, 8000, "naïve café")] + folded = [(1000, 4000, "Voila"), (5000, 8000, "naive cafe")] + result = smart_diff(_srt(accented), _srt(folded)) + self.assertEqual(result["kind"], "encoding_change") + if __name__ == "__main__": unittest.main() diff --git a/tests/test_smartdiff/test_normalize.py b/tests/test_smartdiff/test_normalize.py index 6d7183d2..9c107ccc 100644 --- a/tests/test_smartdiff/test_normalize.py +++ b/tests/test_smartdiff/test_normalize.py @@ -2,8 +2,8 @@ import unittest -from mod_test.smartdiff.normalize import (classify_text_pair, plain, - strip_tags, unescape) +from mod_test.smartdiff.normalize import (ascii_fold, classify_text_pair, + plain, strip_tags, unescape) class NormalizeTests(unittest.TestCase): @@ -33,6 +33,14 @@ def test_classify_formatting_only(self): """A tags-only difference classifies as formatting.""" self.assertEqual(classify_text_pair('hello', 'hello'), 'formatting') + def test_ascii_fold_decomposes_accents(self): + """ascii_fold strips 
accents and drops non-ASCII characters.""" + self.assertEqual(ascii_fold('Voilà café ♪'), 'Voila cafe ') + + def test_classify_encoding_only(self): + """A non-ASCII/accent-only difference classifies as encoding.""" + self.assertEqual(classify_text_pair('PRÉCIS', 'PRECIS'), 'encoding') + def test_classify_real_text_change(self): """A genuine text change classifies as text.""" self.assertEqual(classify_text_pair('hello', 'goodbye'), 'text') From 27ab78bf54d027c02f6b282b494ec2bbe7bdf67e Mon Sep 17 00:00:00 2001 From: GAURAV KARMAKAR Date: Tue, 30 Jun 2026 00:05:10 +0530 Subject: [PATCH 06/10] test(smartdiff): real DVB Spanish golden fixture + strict & robustness tests - Vendor dvb_spanish_real.srt: a genuine CCExtractor DVB Spanish output with colour tags and accented text. Security-scanned before vendoring (no paths/IPs/emails/URLs/secrets) and verified valid UTF-8. - Strict fixture tests assert exact kinds and values on real output: identical, timing_shift (offset 500), formatting_change (font tags), encoding_change (accent folding), missing_cues. - Robustness tests: malformed/empty/control-byte/garbage input must classify cleanly and never raise. Note: the available "Chinese" DVB samples were failed OCR (no real CJK, and invalid UTF-8), so they were deliberately not vendored. 
--- .../fixtures/dvb_spanish_real.srt | 59 ++++++++ tests/test_smartdiff/test_fixtures.py | 133 ++++++++++++++---- 2 files changed, 163 insertions(+), 29 deletions(-) create mode 100644 tests/test_smartdiff/fixtures/dvb_spanish_real.srt diff --git a/tests/test_smartdiff/fixtures/dvb_spanish_real.srt b/tests/test_smartdiff/fixtures/dvb_spanish_real.srt new file mode 100644 index 00000000..76a633c4 --- /dev/null +++ b/tests/test_smartdiff/fixtures/dvb_spanish_real.srt @@ -0,0 +1,59 @@ +1 +00:00:00,480 --> 00:01:05,479 +Para continuar con este debate, + +2 +00:00:06,080 --> 00:01:11,079 +gusted cree que si los partidarios +de Errejon fuesen derrotados + +3 +00:00:09,880 --> 00:01:14,879 +su propuesta en) Vistalegre, + +4 +00:00:12,920 --> 00:01:17,919 +Podemos deberia cambiar de portavoz +parlamentario? + +5 +00:00:19,080 --> 00:01:24,079 +éPuede representar al partido + +6 +00:00:21,400 --> 00:01:26,399 +en el Congreso alguienque'se ha +quedado en minoria + +7 +00:00:24,200 --> 00:01:29,199 +dentro del partido? + +8 +00:00:30,640 --> 00:01:35,639 +-Deciden los organos del partido la +linea de accion politica + +9 +00:00:34,200 --> 00:01:39,199 +dentro del partido. + +10 +00:00:43,120 --> 00:01:48,119 +Debemos acatar las decisiones +colectivas. + +11 +00:00:48,600 --> 00:01:53,599 +Si inicio Errejon reconoce que'se +ven esas lineas, + +12 +00:00:51,760 --> 00:01:56,759 +debe seguir adelante. + +13 +00:00:53,240 --> 00:01:58,239 +Solo.es canalizarla voz della +decision politica del partido. + diff --git a/tests/test_smartdiff/test_fixtures.py b/tests/test_smartdiff/test_fixtures.py index cac9c479..997f08dc 100644 --- a/tests/test_smartdiff/test_fixtures.py +++ b/tests/test_smartdiff/test_fixtures.py @@ -1,57 +1,132 @@ -"""Golden-fixture tests against a real CCExtractor CEA-608 sample output.""" +"""Golden-fixture tests against real CCExtractor output, plus input robustness. 
+ +The fixtures are genuine CCExtractor outputs (not synthetic strings): +- ``cea608_real.srt``: a CEA-608 broadcast caption sample (trailing padding). +- ``dvb_spanish_real.srt``: a DVB Spanish sample with ```` colour tags and + accented characters. Both were security-scanned before vendoring (no paths, + IPs, emails, URLs, or secrets) and are valid UTF-8. +""" import os import unittest from mod_test.smartdiff.compare import smart_diff -from mod_test.smartdiff.srt import parse_srt +from mod_test.smartdiff.normalize import ascii_fold, strip_tags +from mod_test.smartdiff.srt import Cue, parse_srt -_FIXTURE = os.path.join(os.path.dirname(__file__), 'fixtures', 'cea608_real.srt') +_FIXTURES = os.path.join(os.path.dirname(__file__), 'fixtures') -def _load(): +def _load(name): """ - Read the vendored real CCExtractor sample. + Read a vendored fixture as UTF-8. - :return: The raw .srt content. + :param name: Fixture file name. + :type name: str + :return: The file content. :rtype: str """ - with open(_FIXTURE, encoding='utf-8') as handle: + with open(os.path.join(_FIXTURES, name), encoding='utf-8') as handle: return handle.read() -class RealSampleTests(unittest.TestCase): - """Exercise the smart diff on genuine CCExtractor output, not synthetic strings.""" +def _emit(cues): + """ + Serialise cues back to SubRip text (for building timing-shifted variants). + + :param cues: The cues to serialise. + :type cues: list + :return: SubRip-formatted text. 
+ :rtype: str + """ + def stamp(ms): + hours, ms = divmod(ms, 3600000) + minutes, ms = divmod(ms, 60000) + seconds, ms = divmod(ms, 1000) + return f"{hours:02d}:{minutes:02d}:{seconds:02d},{ms:03d}" + + return "\n".join(f"{i}\n{stamp(c.start_ms)} --> {stamp(c.end_ms)}\n{c.text}\n" + for i, c in enumerate(cues, 1)) + + +class Cea608RealTests(unittest.TestCase): + """Smart diff on a genuine CEA-608 broadcast caption sample.""" def test_parses_real_sample(self): """The real sample parses into its two CEA-608 cues.""" - cues = parse_srt(_load()) + cues = parse_srt(_load('cea608_real.srt')) self.assertEqual(len(cues), 2) self.assertEqual(cues[0].start_ms, 5956) def test_identical_against_itself(self): """The real sample compared with itself is identical.""" - raw = _load() - self.assertEqual(smart_diff(raw, raw)["kind"], "identical") - - def test_constant_shift_on_real_sample(self): - """Shifting every timestamp by a constant is detected as timing_shift.""" - raw = _load() - shifted = (raw - .replace('00:00:05,956', '00:00:06,206') - .replace('00:00:07,955', '00:00:08,205') - .replace('00:00:13,913', '00:00:14,163') - .replace('00:00:15,080', '00:00:15,330')) - result = smart_diff(raw, shifted) - self.assertEqual(result["kind"], "timing_shift") - self.assertEqual(result["offset_ms"], 250) - - def test_depadding_is_cosmetic_on_real_sample(self): + raw = _load('cea608_real.srt') + self.assertEqual(smart_diff(raw, raw)['kind'], 'identical') + + def test_depadding_is_cosmetic(self): """Stripping the CEA-608 trailing padding is flagged as cosmetic only.""" - raw = _load() + raw = _load('cea608_real.srt') depadded = '\n'.join(line.rstrip() for line in raw.split('\n')) - result = smart_diff(raw, depadded) - self.assertIn(result["kind"], ("identical", "whitespace_change")) + self.assertIn(smart_diff(raw, depadded)['kind'], + ('identical', 'whitespace_change')) + + +class DvbSpanishRealTests(unittest.TestCase): + """Smart diff on a real DVB Spanish output (font colour tags + 
accents).""" + + def test_parses_with_tags_and_accents(self): + """The fixture has 13 cues carrying both font tags and non-ASCII text.""" + cues = parse_srt(_load('dvb_spanish_real.srt')) + self.assertEqual(len(cues), 13) + self.assertTrue(any(' 127 for c in cues for ch in c.text)) + + def test_identical(self): + """The fixture compared with itself is identical.""" + raw = _load('dvb_spanish_real.srt') + self.assertEqual(smart_diff(raw, raw)['kind'], 'identical') + + def test_constant_timing_shift(self): + """Shifting every cue by +500 ms is detected with the exact offset.""" + cues = parse_srt(_load('dvb_spanish_real.srt')) + shifted = [Cue(c.index, c.start_ms + 500, c.end_ms + 500, c.text) for c in cues] + result = smart_diff(_emit(cues), _emit(shifted)) + self.assertEqual(result['kind'], 'timing_shift') + self.assertEqual(result['offset_ms'], 500) + + def test_font_tags_are_formatting_only(self): + """Removing the colour tags is classified as formatting, not text.""" + raw = _load('dvb_spanish_real.srt') + self.assertEqual(smart_diff(raw, strip_tags(raw))['kind'], 'formatting_change') + + def test_accent_folding_is_encoding(self): + """Folding the accented characters is classified as an encoding difference.""" + raw = _load('dvb_spanish_real.srt') + self.assertEqual(smart_diff(raw, ascii_fold(raw))['kind'], 'encoding_change') + + def test_dropped_cues_are_missing(self): + """Dropping the last three cues is reported as missing_cues.""" + cues = parse_srt(_load('dvb_spanish_real.srt')) + result = smart_diff(_emit(cues), _emit(cues[:-3])) + self.assertEqual(result['kind'], 'missing_cues') + + +class RobustnessTests(unittest.TestCase): + """Malformed or hostile input must classify cleanly, never crash.""" + + def test_parser_survives_garbage(self): + """The parser returns a list for empty, junk, and control-byte input.""" + for junk in ['', 'not a subtitle', '\x00\x01\x02', '1\nno timing line\n']: + self.assertIsInstance(parse_srt(junk), list) + + def 
test_smart_diff_on_empty_inputs(self): + """Two empty inputs are identical, not an error.""" + self.assertEqual(smart_diff('', '')['kind'], 'identical') + + def test_smart_diff_garbage_vs_real(self): + """Garbage against a real sample classifies without raising.""" + result = smart_diff('garbage with no cues', _load('dvb_spanish_real.srt')) + self.assertIn('kind', result) if __name__ == "__main__": From 57b0a253d6c8f889092a8b940511e99937e7caf1 Mon Sep 17 00:00:00 2001 From: GAURAV KARMAKAR Date: Tue, 30 Jun 2026 00:35:48 +0530 Subject: [PATCH 07/10] feat(smartdiff): expose smart diff via endpoint + a "Smart" option in the UI - TestResultFile.generate_smart_diff(): reads the expected/actual output files (reusing the encoding-tolerant read_lines) and returns a semantic classification via smart_diff. - New JSON endpoint GET /diff////smart, reusable by the web UI, the CLI, and agents. Returns 'unavailable' gracefully if the output files are not on disk. - Result page: a "Smart" link next to each "Fail" diff link opens a small popup with the difference kind + summary (additive, opt-in). Includes a unit test of the model glue against real on-disk files. --- mod_test/controllers.py | 38 +++++++++++ mod_test/models.py | 27 ++++++++ templates/test/by_id.html | 30 ++++++++- .../test_smartdiff/test_model_integration.py | 63 +++++++++++++++++++ 4 files changed, 156 insertions(+), 2 deletions(-) create mode 100644 tests/test_smartdiff/test_model_integration.py diff --git a/mod_test/controllers.py b/mod_test/controllers.py index 4c2477b8..19bff476 100644 --- a/mod_test/controllers.py +++ b/mod_test/controllers.py @@ -375,6 +375,44 @@ def generate_diff(test_id: int, regression_test_id: int, output_id: int, to_view abort(404) +@mod_test.route('/diff////smart') +def smart_diff_view(test_id: int, regression_test_id: int, output_id: int): + """ + Return a semantic (smart) diff classification for an output as JSON. 
+ + Unlike the line diff, this reports *how* the output differs (timing shift, + cosmetic padding/formatting/encoding, text change, missing/extra cues), so a + person or an agent gets an actionable answer instead of a wall of lines. + + :param test_id: id of the test + :type test_id: int + :param regression_test_id: id of the regression test + :type regression_test_id: int + :param output_id: id of the generated output + :type output_id: int + :return: JSON classification of the difference. + :rtype: flask.Response + """ + from run import config + + result = TestResultFile.query.filter(and_( + TestResultFile.test_id == test_id, + TestResultFile.regression_test_id == regression_test_id, + TestResultFile.regression_test_output_id == output_id + )).first() + + if result is None: + abort(404) + + path = os.path.join(config.get('SAMPLE_REPOSITORY', ''), 'TestResults') + try: + classification = result.generate_smart_diff(path) + except OSError: + classification = {'kind': 'unavailable', + 'summary': 'Output files are not available locally.'} + return jsonify(classification) + + @mod_test.route('/log-files/') @login_required def download_build_log_file(test_id): diff --git a/mod_test/models.py b/mod_test/models.py index 1463a0f3..b6ad6385 100644 --- a/mod_test/models.py +++ b/mod_test/models.py @@ -455,3 +455,30 @@ def read_lines(file_name: str) -> List[str]: return open(file_name, encoding='utf8').readlines() except UnicodeDecodeError: return open(file_name, encoding='cp1252').readlines() + + def generate_smart_diff(self, base_path: str) -> dict: + """ + Classify *how* the actual output differs from the expected baseline. + + Unlike the line diff, this returns a semantic classification (timing + shift, cosmetic padding/formatting/encoding, text change, missing/extra + cues) that a person or an agent can act on directly. + + :param base_path: The base path for the files location. 
+ :type base_path: str + :return: A smart-diff classification with ``kind`` and ``summary`` keys. + :rtype: dict + """ + from mod_test.smartdiff.compare import smart_diff + + if not self.got: + return {'kind': 'identical', + 'summary': 'Output matches the expected baseline.'} + + extension = self.regression_test_output.correct_extension + file_ok = os.path.join(base_path, self.expected + extension) + file_fail = os.path.join(base_path, self.got + extension) + expected_text = ''.join(self.read_lines(file_ok)) + actual_text = ''.join(self.read_lines(file_fail)) + return smart_diff(expected_text, actual_text, + fmt=extension.lstrip('.').lower() or None) diff --git a/templates/test/by_id.html b/templates/test/by_id.html index a54df860..1228d4ce 100644 --- a/templates/test/by_id.html +++ b/templates/test/by_id.html @@ -149,14 +149,14 @@

Fail + Fail · Smart {%- endif %} {% elif file.got is none or no_error.found or test.result.exit_code != 0 -%} Pass {% elif file.got == "error" %} No output generated but there should be {% else %} - Fail + Fail · Smart {%- endif %} {% if not loop.last %}
{% endif %} {% else %} @@ -298,6 +298,32 @@

There are no tests executed in this category.
popup.open(); }); }); + $('.smart_diff_link').on('click', function(){ + // Fetch the semantic (smart) diff classification and show a summary. + var url = '{{ url_for('test.smart_diff_view', test_id='_0_', regression_test_id='_1_', output_id='_2_') }}'; + url = url.replace('_0_', $(this).data('test')).replace('_1_', $(this).data('regression')).replace('_2_', $(this).data('output')); + + $.getJSON(url).done(function(resp){ + var id, reveal, popup; + + reveal = document.createElement('div'); + id = 'smart-diff-popup-'+(new Date()).getTime(); + reveal.setAttribute('id', id); + reveal.setAttribute('class', 'reveal'); + reveal.setAttribute('data-reveal', ''); + reveal.innerHTML = + '

Smart diff

' + + '

' + (resp.kind || 'unknown') + '

' + + '

' + (resp.summary || '') + '

'; + reveal.innerHTML += + ''; + document.body.appendChild(reveal); + popup = new Foundation.Reveal($('#'+id)); + popup.open(); + }); + }); }); {% endblock %} diff --git a/tests/test_smartdiff/test_model_integration.py b/tests/test_smartdiff/test_model_integration.py new file mode 100644 index 00000000..0b8c715e --- /dev/null +++ b/tests/test_smartdiff/test_model_integration.py @@ -0,0 +1,63 @@ +"""Tests for TestResultFile.generate_smart_diff (the model glue) against real files. + +The method is exercised with a lightweight stand-in ``self`` so the test stays a +fast unit test (no database/ORM mapper configuration required). +""" + +import os +import tempfile +import unittest +from unittest import mock + +from mod_test.models import TestResultFile + +_CUE = "1\n00:00:01,000 --> 00:00:04,000\nHello world\n" + + +def _run(expected_text, got_text, ext='.srt', got='GOT'): + """ + Write two outputs to a temp dir and run generate_smart_diff over them. + + :param expected_text: Expected output content. + :type expected_text: str + :param got_text: Actual output content. + :type got_text: str + :param ext: Output file extension. + :type ext: str + :param got: The 'got' hash (set to None to simulate no produced output). + :type got: str + :return: The smart-diff classification. 
+ :rtype: dict + """ + base = tempfile.mkdtemp() + with open(os.path.join(base, 'EXP' + ext), 'w', encoding='utf-8') as handle: + handle.write(expected_text) + with open(os.path.join(base, 'GOT' + ext), 'w', encoding='utf-8') as handle: + handle.write(got_text) + stub = mock.Mock() + stub.expected = 'EXP' + stub.got = got + stub.regression_test_output.correct_extension = ext + stub.read_lines = TestResultFile.read_lines + return TestResultFile.generate_smart_diff(stub, base) + + +class GenerateSmartDiffTests(unittest.TestCase): + """The model method reads the on-disk outputs and classifies the difference.""" + + def test_identical(self): + """Equal on-disk outputs classify as identical.""" + self.assertEqual(_run(_CUE, _CUE)['kind'], 'identical') + + def test_timing_shift(self): + """A shifted output is classified as a timing shift.""" + shifted = "1\n00:00:01,500 --> 00:00:04,500\nHello world\n" + self.assertEqual(_run(_CUE, shifted)['kind'], 'timing_shift') + + def test_missing_got_is_identical(self): + """A null 'got' (no produced output) short-circuits to identical.""" + self.assertEqual(_run(_CUE, _CUE, got=None)['kind'], 'identical') + + +if __name__ == "__main__": + unittest.main() From 1b1e75e231f57121cb0ec559137a8439a36fd859 Mon Sep 17 00:00:00 2001 From: GAURAV KARMAKAR Date: Tue, 30 Jun 2026 01:22:41 +0530 Subject: [PATCH 08/10] feat(smartdiff): per-cue change detail (which cues, expected/actual, offset) smart_diff now returns a capped 'changes' list alongside the verdict: each changed cue with its kind, a per-cue timing offset, and (for text changes) expected/actual snippets. This gives an agent the structured detail to act on without scraping the raw HTML diff. The web "Smart" popup lists these changes (HTML-escaped). Result shape stays backward compatible (additive). 
--- mod_test/smartdiff/compare.py | 83 +++++++++++++++++++++------- templates/test/by_id.html | 10 ++++ tests/test_smartdiff/test_compare.py | 20 +++++++ 3 files changed, 92 insertions(+), 21 deletions(-) diff --git a/mod_test/smartdiff/compare.py b/mod_test/smartdiff/compare.py index db0b1366..4fc79f9f 100644 --- a/mod_test/smartdiff/compare.py +++ b/mod_test/smartdiff/compare.py @@ -6,6 +6,9 @@ from mod_test.smartdiff.parsing import parse_subtitles from mod_test.smartdiff.srt import Cue +#: Cap on the number of per-cue change entries returned in a result. +_MAX_CHANGES = 25 + def _result(kind: str, summary: str, n_exp: int, n_act: int, offset_ms: Optional[int] = None) -> Dict[str, object]: @@ -63,6 +66,21 @@ def _monotonic(values: List[int]) -> bool: return non_decreasing or non_increasing +def _snippet(text: str, limit: int = 80) -> str: + """ + Collapse whitespace and truncate cue text for compact change details. + + :param text: Raw cue text. + :type text: str + :param limit: Maximum characters to keep. + :type limit: int + :return: A single-line, length-capped snippet. 
+ :rtype: str + """ + flat = ' '.join(text.split()) + return flat if len(flat) <= limit else flat[:limit] + '…' + + def smart_diff(expected: str, actual: str, fmt: Optional[str] = None) -> Dict[str, object]: """ @@ -98,20 +116,33 @@ def smart_diff(expected: str, actual: str, encoding_changes = 0 raw_matches = True timing_deltas: List[int] = [] - for expected_cue, actual_cue in zip(exp, act): + changes: List[Dict[str, object]] = [] + for position, (expected_cue, actual_cue) in enumerate(zip(exp, act), start=1): category = classify_text_pair(expected_cue.text, actual_cue.text) + delta = actual_cue.start_ms - expected_cue.start_ms if category != 'match': raw_matches = False if category == 'text': text_changes += 1 - continue - if category == 'formatting': - formatting_changes += 1 - elif category == 'whitespace': - whitespace_changes += 1 - elif category == 'encoding': - encoding_changes += 1 - timing_deltas.append(actual_cue.start_ms - expected_cue.start_ms) + else: + if category == 'formatting': + formatting_changes += 1 + elif category == 'whitespace': + whitespace_changes += 1 + elif category == 'encoding': + encoding_changes += 1 + timing_deltas.append(delta) + if category != 'match' or delta != 0: + entry: Dict[str, object] = { + 'cue': position, + 'kind': category if category != 'match' else 'timing', + } + if category == 'text': + entry['expected'] = _snippet(expected_cue.text) + entry['actual'] = _snippet(actual_cue.text) + if delta != 0: + entry['offset_ms'] = delta + changes.append(entry) no_timing_move = all(delta == 0 for delta in timing_deltas) uniform_shift = bool(timing_deltas) and len(set(timing_deltas)) == 1 @@ -120,20 +151,30 @@ def smart_diff(expected: str, actual: str, cosmetic_changes = formatting_changes + whitespace_changes + encoding_changes fully_aligned = text_changes == 0 and cosmetic_changes == 0 + def _finish(kind: str, summary: str, exp_count: int, act_count: int, + offset_ms: Optional[int] = None) -> Dict[str, object]: + """Attach the 
(capped) per-cue change list to a classification result.""" + out = _result(kind, summary, exp_count, act_count, offset_ms) + if changes: + out['changes'] = changes[:_MAX_CHANGES] + if len(changes) > _MAX_CHANGES: + out['changes_truncated'] = True + return out + if not count_mismatch and raw_matches and no_timing_move: - return _result('identical', 'Outputs are identical.', n_exp, n_act) + return _finish('identical', 'Outputs are identical.', n_exp, n_act) if not count_mismatch and fully_aligned and uniform_shift and timing_deltas[0] != 0: offset = timing_deltas[0] direction = 'late' if offset > 0 else 'early' - return _result( + return _finish( 'timing_shift', f'All {n_exp} cues match but are {abs(offset)} ms {direction}.', n_exp, n_act, offset_ms=offset) if not count_mismatch and fully_aligned and drifting: first, last = timing_deltas[0], timing_deltas[-1] - return _result( + return _finish( 'timing_drift', f'Timing drifts from {first:+d} ms to {last:+d} ms across {n_exp} cues.', n_exp, n_act) @@ -141,51 +182,51 @@ def smart_diff(expected: str, actual: str, if count_mismatch: if _content(exp) and _content(exp) == _content(act): if n_act > n_exp: - return _result( + return _finish( 'split_cues', f'Same text, but cues were split: expected {n_exp}, got {n_act}.', n_exp, n_act) - return _result( + return _finish( 'merged_cues', f'Same text, but cues were merged: expected {n_exp}, got {n_act}.', n_exp, n_act) if text_changes == 0: if n_act < n_exp: - return _result( + return _finish( 'missing_cues', f'{n_exp - n_act} of {n_exp} cues are missing from the output.', n_exp, n_act) - return _result( + return _finish( 'extra_cues', f'Output has {n_act - n_exp} extra cues ({n_act} vs {n_exp} expected).', n_exp, n_act) if not count_mismatch and no_timing_move: if text_changes > 0: - return _result( + return _finish( 'text_change', f'{text_changes} of {n_exp} cues differ in text (timing aligned).', n_exp, n_act) if encoding_changes > 0 and formatting_changes == 0 and 
whitespace_changes == 0: - return _result( + return _finish( 'encoding_change', f'{encoding_changes} of {n_exp} cues differ only in character ' f'encoding (non-ASCII/accented characters).', n_exp, n_act) if formatting_changes > 0 and whitespace_changes == 0 and encoding_changes == 0: - return _result( + return _finish( 'formatting_change', f'{formatting_changes} of {n_exp} cues differ only in formatting ' f'(tags/entities), not text.', n_exp, n_act) if whitespace_changes > 0 and formatting_changes == 0 and encoding_changes == 0: - return _result( + return _finish( 'whitespace_change', f'{whitespace_changes} of {n_exp} cues differ only in trailing ' f'whitespace/padding.', n_exp, n_act) - return _result( + return _finish( 'mixed', f'Mixed differences across {min(n_exp, n_act)} compared cues; ' f'expected {n_exp}, got {n_act}.', diff --git a/templates/test/by_id.html b/templates/test/by_id.html index 1228d4ce..c9376231 100644 --- a/templates/test/by_id.html +++ b/templates/test/by_id.html @@ -315,6 +315,16 @@
There are no tests executed in this category.
'

Smart diff

' + '

' + (resp.kind || 'unknown') + '

' + '

' + (resp.summary || '') + '

'; + if (resp.changes && resp.changes.length) { + var items = resp.changes.map(function(c){ + var d = 'Cue ' + c.cue + ': ' + c.kind; + if (c.offset_ms !== undefined) { d += ' (' + (c.offset_ms > 0 ? '+' : '') + c.offset_ms + ' ms)'; } + if (c.expected !== undefined) { d += ' — expected “' + c.expected + '”, got “' + c.actual + '”'; } + return '
  • ' + $('
    ').text(d).html() + '
  • '; + }).join(''); + reveal.innerHTML += '
      ' + items + '
    '; + if (resp.changes_truncated) { reveal.innerHTML += '

    … more changes not shown.

    '; } + } reveal.innerHTML += '