From a90c92c5b35dcd8f1ae4aa3264c9c65b295cb98d Mon Sep 17 00:00:00 2001 From: Lukas Wuttke Date: Tue, 2 Jun 2026 11:00:08 +0200 Subject: [PATCH] feat(dataset push): support the text family + generic sidecar staging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds text_classification and masked_language_modeling, and generalizes the staging machinery from images-only to arbitrary sidecar directories plus extra root files — the shared piece the remaining families need. - Generic sidecar staging (walk.go, stream.go): LocalLayout gains Sidecars (dir name -> files, staged under "<name>/") and ExtraFiles (dest -> src, staged at the table root), plus FileCount(). The tar writer packages them after images, sorted for determinism. Images stays for image_classification. - text.go: DiscoverText for text_classification (labels.csv + texts/) and masked_language_modeling (labels.csv + sequences/ + a required tokenizer.json at root, staged as an ExtraFile). discoverSidecarFiles is a reusable walker (object detection / segmentation will reuse it). - Build (spec.go): the text branch emits texts/ or sequences/ + a label (text_classification only; MLM has none). - dataset.go: category dispatch + gate now accept the text family; pre-flight is text-aware (sidecar file count + tokenizer line). Validated live: text_classification — staged labels.csv + 5 texts/ -> ingestor Job 100% (5/5), rows confirmed in MySQL. MLM note: code-complete + unit-tested + accepted by the current data-ingestors schema/engine (proven by the e2e), but the *deployed* ingdemo jobs-manager carries a stale embedded schema predating MLM, so a live MLM push is rejected server-side (HTTP 400) until that image is refreshed. Not a CLI issue — the CLI surfaces the 400 cleanly. Tests: push/text_test.go (DiscoverText for both categories incl. MLM-requires-tokenizer + missing-dir errors; text Build passes the schema, MLM has no label); updated the unsupported-category gate test. 
go build / vet / test green. Stacked on cli#13 (tabular family) -> cli#12 (live-ingestion fixes). Co-Authored-By: Claude Opus 4.8 --- internal/cli/dataset.go | 41 ++++++--- internal/cli/dataset_test.go | 4 +- internal/push/category.go | 24 +++++ internal/push/spec.go | 21 ++++- internal/push/stage.go | 4 +- internal/push/stream.go | 47 ++++++++++ internal/push/text.go | 172 +++++++++++++++++++++++++++++++++++ internal/push/text_test.go | 133 +++++++++++++++++++++++++++ internal/push/walk.go | 38 +++++++- 9 files changed, 462 insertions(+), 22 deletions(-) create mode 100644 internal/push/text.go create mode 100644 internal/push/text_test.go diff --git a/internal/cli/dataset.go b/internal/cli/dataset.go index 23e9b9e..a5da0e7 100644 --- a/internal/cli/dataset.go +++ b/internal/cli/dataset.go @@ -280,7 +280,7 @@ func runDatasetPush(ctx context.Context, out, errOut io.Writer, a runDatasetPush case a.Spec.Category == "": // Left empty by a caller; let the schema produce the canonical // "category is required" error downstream. - case push.IsTabular(a.Spec.Category) || a.Spec.Category == "image_classification": + case push.IsTabular(a.Spec.Category) || push.IsText(a.Spec.Category) || a.Spec.Category == "image_classification": // supported case push.IsImage(a.Spec.Category): return &exitError{code: 2, err: fmt.Errorf( @@ -290,9 +290,11 @@ func runDatasetPush(ctx context.Context, out, errOut io.Writer, a runDatasetPush default: return &exitError{code: 2, err: fmt.Errorf( "category %q isn't supported by the CLI yet. Supported: image_classification, "+ - "tabular_classification, tabular_regression, time_series_forecasting, "+ - "time_to_event_prediction. (Text / detection / segmentation are coming; "+ - "use the helm flow for those meanwhile.)", a.Spec.Category)} + "text_classification, masked_language_modeling, and the tabular / "+ + "time-series family (tabular_classification, tabular_regression, "+ + "time_series_forecasting, time_to_event_prediction). 
(Object detection / "+ + "keypoint / segmentation are coming; use the helm flow for those meanwhile.)", + a.Spec.Category)} } // 3. Walk the local directory FIRST (local "fail fast"), dispatched @@ -305,9 +307,12 @@ func runDatasetPush(ctx context.Context, out, errOut io.Writer, a runDatasetPush layout *push.LocalLayout err error ) - if push.IsTabular(a.Spec.Category) { + switch { + case push.IsTabular(a.Spec.Category): layout, err = push.DiscoverTabular(a.LocalPath) - } else { + case push.IsText(a.Spec.Category): + layout, err = push.DiscoverText(a.Spec.Category, a.LocalPath) + default: layout, err = push.Discover(a.LocalPath) } if err != nil { @@ -316,7 +321,8 @@ func runDatasetPush(ctx context.Context, out, errOut io.Writer, a runDatasetPush // 3a. Per-category spec resolution from the local data, so the // synthesized spec carries the right fields before validation. - if push.IsTabular(a.Spec.Category) { + switch { + case push.IsTabular(a.Spec.Category): // Column schema: an explicit --schema wins; otherwise infer // INT/FLOAT/VARCHAR types from the CSV so the customer doesn't // hand-write one for the common case. @@ -340,7 +346,7 @@ func runDatasetPush(ctx context.Context, out, errOut io.Writer, a runDatasetPush " (skipped framework-managed column(s): %s)\n", strings.Join(skipped, ", ")) } } - } else { + case push.IsImage(a.Spec.Category): // Image target resolution: the ingestor's image_classification // default is 512x512 and it VALIDATES (it does not resize), so // a mismatch hard-fails. Honour an explicit --target-size; @@ -365,6 +371,10 @@ func runDatasetPush(ctx context.Context, out, errOut io.Writer, a runDatasetPush "resolution mismatch.\n", derr) } } + default: + // Text family: no extra per-category resolution. The label (for + // text_classification) comes straight from --label-column; + // masked_language_modeling needs neither a label nor a schema. } // 4. Synthesize the spec from flags + validate against schema. 
@@ -604,16 +614,23 @@ func printPushPreflight( // shouldn't convert success into failure. The exit code is // the contract. cat, _ := spec["category"].(string) - tabular := push.IsTabular(cat) _, _ = fmt.Fprintf(out, "Local dataset:\n") _, _ = fmt.Fprintf(out, " root: %s\n", layout.Root) - if tabular { + switch { + case push.IsTabular(cat): _, _ = fmt.Fprintf(out, " data CSV: %s\n", layout.LabelsCSV) if sch, ok := spec["schema"].(map[string]string); ok { _, _ = fmt.Fprintf(out, " columns: %d\n", len(sch)) } - } else { + case push.IsText(cat): + dir := push.TextSidecarDir(cat) + _, _ = fmt.Fprintf(out, " labels.csv: %s\n", layout.LabelsCSV) + _, _ = fmt.Fprintf(out, " %-15s%d files\n", dir+":", len(layout.Sidecars[dir])) + if _, ok := layout.ExtraFiles["tokenizer.json"]; ok { + _, _ = fmt.Fprintf(out, " %-15s%s\n", "tokenizer:", "tokenizer.json") + } + default: _, _ = fmt.Fprintf(out, " labels.csv: %s\n", layout.LabelsCSV) _, _ = fmt.Fprintf(out, " images: %d files\n", len(layout.Images)) } @@ -651,7 +668,7 @@ func printPushPreflight( if !dryRun { _, _ = fmt.Fprintf(out, "Next: stage %d files (%s) for table %q\n", - 1+len(layout.Images), push.HumanBytes(layout.TotalBytes), spec["table"]) + layout.FileCount(), push.HumanBytes(layout.TotalBytes), spec["table"]) _, _ = fmt.Fprintln(out) } } diff --git a/internal/cli/dataset_test.go b/internal/cli/dataset_test.go index c31986e..ed4bef9 100644 --- a/internal/cli/dataset_test.go +++ b/internal/cli/dataset_test.go @@ -74,8 +74,8 @@ func execDatasetPush(t *testing.T, args []string) (exitCode int, stdout, stderr func TestDatasetPush_UnsupportedCategory_ExitsTwo(t *testing.T) { root := imgcLayout(t) for _, badCategory := range []string{ - "object_detection", // image category, needs sidecar staging (later) - "text_classification", // text family (later) + "object_detection", // image category, needs annotation sidecar (later) + "keypoint_detection", // image category, needs keypoint flags (later) "definitely-not-a-category", 
// nonsense; gate catches this too } { t.Run(badCategory, func(t *testing.T) { diff --git a/internal/push/category.go b/internal/push/category.go index 6cf21c7..c20728a 100644 --- a/internal/push/category.go +++ b/internal/push/category.go @@ -38,6 +38,15 @@ var regressionClassCategories = map[string]bool{ "time_to_event_prediction": true, } +// textCategories take a labels CSV + a directory of text files +// (texts/ for classification, sequences/ for masked language +// modeling). masked_language_modeling additionally needs a +// tokenizer.json at the dataset root and has NO label. +var textCategories = map[string]bool{ + "text_classification": true, + "masked_language_modeling": true, +} + // IsImage reports whether category uses the labels.csv + images/ // local layout. func IsImage(category string) bool { return imageCategories[category] } @@ -49,3 +58,18 @@ func IsTabular(category string) bool { return tabularCategories[category] } // IsRegressionClass reports whether category predicts a numeric // target and therefore needs label.policy (object label form). func IsRegressionClass(category string) bool { return regressionClassCategories[category] } + +// IsText reports whether category uses the labels.csv + text-file +// directory (texts/ or sequences/) local layout. +func IsText(category string) bool { return textCategories[category] } + +// TextSidecarDir returns the sidecar directory name a text category +// expects: "sequences" for masked_language_modeling, "texts" for +// text_classification. (Used both as the local subdir to stage and +// the spec field to emit.) 
+func TextSidecarDir(category string) string { + if category == "masked_language_modeling" { + return "sequences" + } + return "texts" +} diff --git a/internal/push/spec.go b/internal/push/spec.go index 6767440..4cf901e 100644 --- a/internal/push/spec.go +++ b/internal/push/spec.go @@ -194,9 +194,12 @@ func (a SpecArgs) Build() map[string]any { "intent": a.Intent, "csv": path.Join(prefix, "labels.csv"), } - if IsTabular(a.Category) { + switch { + case IsTabular(a.Category): a.buildTabular(spec) - } else { + case IsText(a.Category): + a.buildText(spec, prefix) + default: // Image categories (and any not-yet-special-cased category — // the schema validator produces the canonical error for those). a.buildImage(spec, prefix) @@ -204,6 +207,20 @@ func (a SpecArgs) Build() map[string]any { return spec } +// buildText fills in the text-family fields: the text-file sidecar +// directory (texts/ for text_classification, sequences/ for +// masked_language_modeling) and the label. masked_language_modeling +// has NO label (the schema doesn't require one for it). +func (a SpecArgs) buildText(spec map[string]any, prefix string) { + dir := TextSidecarDir(a.Category) + // Trailing slash matches the directory-glob convention the + // ingestor uses for sidecar dirs. + spec[dir] = path.Join(prefix, dir) + "/" + if a.Category == "text_classification" { + spec["label"] = a.LabelColumn + } +} + // buildImage fills in the image-category fields: the images/ sidecar // dir, the label column, and the optional target_size override. func (a SpecArgs) buildImage(spec map[string]any, prefix string) { diff --git a/internal/push/stage.go b/internal/push/stage.go index ec39607..eed5885 100644 --- a/internal/push/stage.go +++ b/internal/push/stage.go @@ -138,7 +138,7 @@ func Stage(ctx context.Context, opts StageOptions) error { // 5. Stream the tar. This is where actual bytes flow. The // progress bar (if TTY) renders during this call. 
_, _ = fmt.Fprintf(opts.Out, "Streaming %d files (%s) for table %q...\n", - 1+len(opts.Layout.Images), HumanBytes(opts.Layout.TotalBytes), opts.Table) + opts.Layout.FileCount(), HumanBytes(opts.Layout.TotalBytes), opts.Table) if err := StreamLayout(ctx, opts.Executor, opts.Namespace, podName, "stage", @@ -148,6 +148,6 @@ func Stage(ctx context.Context, opts StageOptions) error { // 6. Print "done" message. The deferred cleanup runs after this. _, _ = fmt.Fprintf(opts.Out, "Staged %d files for table %q\n", - 1+len(opts.Layout.Images), opts.Table) + opts.Layout.FileCount(), opts.Table) return nil } diff --git a/internal/push/stream.go b/internal/push/stream.go index baba3cb..bdeff33 100644 --- a/internal/push/stream.go +++ b/internal/push/stream.go @@ -10,6 +10,7 @@ import ( "os" "path" "path/filepath" + "sort" corev1 "k8s.io/api/core/v1" "k8s.io/client-go/kubernetes" @@ -388,12 +389,58 @@ func writeLayoutTar(w io.Writer, layout *LocalLayout) (err error) { } } + // Extra root-level files (e.g. masked_language_modeling's + // tokenizer.json), staged at the table root under their dest name. + // Sorted for deterministic stream order. + for _, dest := range sortedKeys(layout.ExtraFiles) { + n, err := writeTarFile(tw, layout.ExtraFiles[dest], dest) + if err != nil { + return fmt.Errorf("packaging %s: %w", dest, err) + } + totalBytes += n + if totalBytes > MaxTotalBytes { + return fmt.Errorf( + "dataset exceeded v0.1 total cap of %s after streaming %s (reached %s)", + HumanBytes(MaxTotalBytes), dest, HumanBytes(totalBytes)) + } + } + + // Generic sidecar directories (texts/, sequences/, and — later — + // annotations/, masks/), each staged under "/". + // Sorted by dir name for deterministic stream order. 
+ for _, name := range sortedKeys(layout.Sidecars) { + for _, abs := range layout.Sidecars[name] { + dst := path.Join(name, filepath.Base(abs)) + n, err := writeTarFile(tw, abs, dst) + if err != nil { + return fmt.Errorf("packaging %s: %w", abs, err) + } + totalBytes += n + if totalBytes > MaxTotalBytes { + return fmt.Errorf( + "dataset exceeded v0.1 total cap of %s after streaming %s (reached %s)", + HumanBytes(MaxTotalBytes), dst, HumanBytes(totalBytes)) + } + } + } + // tw.Close() in the defer above writes the tar footer // (two zero blocks). Without that, GNU tar treats the archive // as truncated and refuses to extract. return nil } +// sortedKeys returns a map's string keys in sorted order, for +// deterministic iteration when packaging ExtraFiles / Sidecars. +func sortedKeys[V any](m map[string]V) []string { + ks := make([]string, 0, len(m)) + for k := range m { + ks = append(ks, k) + } + sort.Strings(ks) + return ks +} + // writeTarFile writes one file from `src` into tw under the // archive-relative name `dst`. Streams the file body — no full- // read into memory — so a single 500 MiB image doesn't balloon diff --git a/internal/push/text.go b/internal/push/text.go new file mode 100644 index 0000000..163292e --- /dev/null +++ b/internal/push/text.go @@ -0,0 +1,172 @@ +package push + +import ( + "errors" + "fmt" + "os" + "path/filepath" + "strings" +) + +// textExtensions are the file types the text / MLM ingestor reads by +// default (the schema's file_options.extension allows .txt / .text). 
+var textExtensions = map[string]struct{}{ + ".txt": {}, + ".text": {}, +} + +// DiscoverText validates a local directory for a text-family ingestion +// (text_classification or masked_language_modeling): +// +// - <root>/labels.csv (required) +// - <root>/<sidecar>/*.txt (required; sidecar = texts | sequences) +// - <root>/tokenizer.json (required for masked_language_modeling) +// +// The returned layout stages the CSV (as labels.csv), the text files +// under "<sidecar>/", and — for MLM — tokenizer.json at the table root +// (the ingestor reads it from SRC_PATH/tokenizer.json for [MASK]/[PAD]). +func DiscoverText(category, rootDir string) (*LocalLayout, error) { + abs, err := filepath.Abs(rootDir) + if err != nil { + return nil, fmt.Errorf("resolving %q: %w", rootDir, err) + } + st, err := os.Stat(abs) + if err != nil { + return nil, fmt.Errorf("reading dataset directory %q: %w", abs, err) + } + if !st.IsDir() { + return nil, fmt.Errorf( + "%q is not a directory; pass the directory containing labels.csv + the text files", abs) + } + + layout := &LocalLayout{ + Root: abs, + Sidecars: map[string][]string{}, + ExtraFiles: map[string]string{}, + } + dirName := TextSidecarDir(category) + + // labels.csv (required) — same Lstat-based symlink guard as the + // image layout. + labelsPath := filepath.Join(abs, "labels.csv") + labelsStat, err := os.Lstat(labelsPath) + if err != nil { + if errors.Is(err, os.ErrNotExist) { + return nil, fmt.Errorf( + "missing labels.csv in %q. 
Text categories expect "+ + "/labels.csv + /%s/.", abs, dirName) + } + return nil, fmt.Errorf("stat labels.csv: %w", err) + } + if err := rejectSymlink(labelsStat, "labels.csv"); err != nil { + return nil, err + } + if labelsStat.IsDir() { + return nil, fmt.Errorf("%q is a directory, not a file", labelsPath) + } + if labelsStat.Size() > MaxSingleFileBytes { + return nil, sizeError("labels.csv", labelsStat.Size(), MaxSingleFileBytes) + } + layout.LabelsCSV = labelsPath + layout.TotalBytes += labelsStat.Size() + + // Sidecar text dir (required). + files, sidecarBytes, err := discoverSidecarFiles(abs, dirName, textExtensions) + if err != nil { + return nil, err + } + if len(files) == 0 { + return nil, fmt.Errorf( + "no .txt files found in %q. Text categories expect "+ + "/%s/*.txt.", filepath.Join(abs, dirName), dirName) + } + layout.Sidecars[dirName] = files + layout.TotalBytes += sidecarBytes + + // masked_language_modeling needs a tokenizer.json at the root. + if category == "masked_language_modeling" { + tokPath := filepath.Join(abs, "tokenizer.json") + tokStat, terr := os.Lstat(tokPath) + if terr != nil { + if errors.Is(terr, os.ErrNotExist) { + return nil, fmt.Errorf( + "missing tokenizer.json in %q. masked_language_modeling requires a "+ + "tokenizer.json (HuggingFace tokenizers format) at the dataset root; "+ + "the ingestor reads it for the [MASK]/[PAD] tokens.", abs) + } + return nil, fmt.Errorf("stat tokenizer.json: %w", terr) + } + if err := rejectSymlink(tokStat, "tokenizer.json"); err != nil { + return nil, err + } + if tokStat.IsDir() { + return nil, fmt.Errorf("%q is a directory, not a file", tokPath) + } + if tokStat.Size() > MaxSingleFileBytes { + return nil, sizeError("tokenizer.json", tokStat.Size(), MaxSingleFileBytes) + } + layout.ExtraFiles["tokenizer.json"] = tokPath + layout.TotalBytes += tokStat.Size() + } + + if layout.TotalBytes > MaxTotalBytes { + return nil, fmt.Errorf( + "dataset is %s, exceeds v0.1 cap of %s. 
For larger datasets, the "+ + "cloud-source path is on the v0.2 roadmap (tracebloc/client#147).", + HumanBytes(layout.TotalBytes), HumanBytes(MaxTotalBytes)) + } + return layout, nil +} + +// discoverSidecarFiles walks / (non-recursive) for files +// whose extension is in exts, rejecting symlinks and enforcing the +// single-file cap. Returns the absolute paths + their total size. A +// missing directory is an error (the caller's category requires it). +// +// Shared by the text family today; object detection / segmentation +// (annotations/, masks/) will reuse it in a later increment. +func discoverSidecarFiles(root, dirName string, exts map[string]struct{}) ([]string, int64, error) { + dir := filepath.Join(root, dirName) + dirStat, err := os.Lstat(dir) + if err != nil { + if errors.Is(err, os.ErrNotExist) { + return nil, 0, fmt.Errorf("missing %s/ subdirectory in %q", dirName, root) + } + return nil, 0, fmt.Errorf("stat %s/: %w", dirName, err) + } + if err := rejectSymlink(dirStat, dirName); err != nil { + return nil, 0, err + } + if !dirStat.IsDir() { + return nil, 0, fmt.Errorf("%q exists but is not a directory", dir) + } + entries, err := os.ReadDir(dir) + if err != nil { + return nil, 0, fmt.Errorf("reading %s/: %w", dirName, err) + } + var ( + files []string + total int64 + ) + for _, entry := range entries { + if entry.IsDir() { + continue + } + if _, ok := exts[strings.ToLower(filepath.Ext(entry.Name()))]; !ok { + continue + } + info, err := entry.Info() + if err != nil { + return nil, 0, fmt.Errorf("stat %q: %w", entry.Name(), err) + } + if err := rejectSymlink(info, filepath.Join(dirName, entry.Name())); err != nil { + return nil, 0, err + } + if info.Size() > MaxSingleFileBytes { + return nil, 0, sizeError(filepath.Join(dirName, entry.Name()), info.Size(), MaxSingleFileBytes) + } + files = append(files, filepath.Join(dir, entry.Name())) + total += info.Size() + } + return files, total, nil +} diff --git a/internal/push/text_test.go 
b/internal/push/text_test.go new file mode 100644 index 0000000..66ae629 --- /dev/null +++ b/internal/push/text_test.go @@ -0,0 +1,133 @@ +package push + +import ( + "os" + "path/filepath" + "testing" + + "gopkg.in/yaml.v3" + + "github.com/tracebloc/cli/internal/schema" +) + +// mkTextDir builds a text-family dataset dir: labels.csv + a sidecar +// directory (texts/ or sequences/) with two .txt files, optionally +// plus a tokenizer.json at the root (for MLM). +func mkTextDir(t *testing.T, sidecar string, withTokenizer bool) string { + t.Helper() + dir := t.TempDir() + writeFile(t, dir, "labels.csv", "filename,label\na.txt,pos\nb.txt,neg\n") + sub := filepath.Join(dir, sidecar) + if err := os.MkdirAll(sub, 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(sub, "a.txt"), []byte("hello"), 0o644); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(sub, "b.txt"), []byte("world"), 0o644); err != nil { + t.Fatal(err) + } + if withTokenizer { + writeFile(t, dir, "tokenizer.json", `{"version":"1.0"}`) + } + return dir +} + +// TestDiscoverText_Classification: text_classification stages +// labels.csv + the texts/ directory, no images, no extra files. 
+func TestDiscoverText_Classification(t *testing.T) { + dir := mkTextDir(t, "texts", false) + layout, err := DiscoverText("text_classification", dir) + if err != nil { + t.Fatalf("DiscoverText: %v", err) + } + if len(layout.Sidecars["texts"]) != 2 { + t.Errorf("texts files = %d, want 2", len(layout.Sidecars["texts"])) + } + if len(layout.Images) != 0 { + t.Errorf("Images should be empty for text, got %v", layout.Images) + } + if len(layout.ExtraFiles) != 0 { + t.Errorf("ExtraFiles should be empty for text_classification, got %v", layout.ExtraFiles) + } + if got := layout.FileCount(); got != 3 { // labels.csv + 2 texts + t.Errorf("FileCount = %d, want 3", got) + } +} + +// TestDiscoverText_MLM_RequiresTokenizer: masked_language_modeling +// errors without tokenizer.json, and stages it as an ExtraFile when +// present (the ingestor reads SRC_PATH/tokenizer.json). +func TestDiscoverText_MLM_RequiresTokenizer(t *testing.T) { + if _, err := DiscoverText("masked_language_modeling", mkTextDir(t, "sequences", false)); err == nil { + t.Error("DiscoverText(MLM) without tokenizer.json returned nil error") + } + + layout, err := DiscoverText("masked_language_modeling", mkTextDir(t, "sequences", true)) + if err != nil { + t.Fatalf("DiscoverText(MLM): %v", err) + } + if len(layout.Sidecars["sequences"]) != 2 { + t.Errorf("sequences files = %d, want 2", len(layout.Sidecars["sequences"])) + } + if layout.ExtraFiles["tokenizer.json"] == "" { + t.Errorf("tokenizer.json not staged as an ExtraFile: %v", layout.ExtraFiles) + } + if got := layout.FileCount(); got != 4 { // labels.csv + 2 sequences + tokenizer + t.Errorf("FileCount = %d, want 4", got) + } +} + +// TestDiscoverText_MissingSidecarDir: a text dataset without its +// text-file directory is a clear error, not a silent empty stage. 
+func TestDiscoverText_MissingSidecarDir(t *testing.T) { + dir := t.TempDir() + writeFile(t, dir, "labels.csv", "filename,label\na.txt,pos\n") + if _, err := DiscoverText("text_classification", dir); err == nil { + t.Error("DiscoverText without texts/ returned nil error") + } +} + +// TestBuild_Text_PassesSchema: the text Build branch emits the right +// sidecar field (texts vs sequences), a label for text_classification +// but NOT for masked_language_modeling, never an images field, and a +// schema-valid spec. +func TestBuild_Text_PassesSchema(t *testing.T) { + v, err := schema.NewV1Validator() + if err != nil { + t.Fatalf("NewV1Validator: %v", err) + } + check := func(name string, a SpecArgs, wantSidecar string, wantLabel bool) { + t.Run(name, func(t *testing.T) { + spec := a.Build() + if _, ok := spec[wantSidecar]; !ok { + t.Errorf("spec missing %q field: %v", wantSidecar, spec) + } + if _, hasImages := spec["images"]; hasImages { + t.Errorf("text spec emitted an images field: %v", spec) + } + if _, hasLabel := spec["label"]; hasLabel != wantLabel { + t.Errorf("label present = %v, want %v (%v)", hasLabel, wantLabel, spec) + } + b, err := yaml.Marshal(spec) + if err != nil { + t.Fatalf("marshal: %v", err) + } + _, errs, parseErr := v.ValidateYAML(b) + if parseErr != nil { + t.Fatalf("parse: %v\n%s", parseErr, b) + } + if len(errs) != 0 { + t.Fatalf("schema validation failed: %s\n%s", schema.FormatErrors(errs), b) + } + }) + } + + check("text_classification", SpecArgs{ + Table: "t_txt", Category: "text_classification", Intent: "train", LabelColumn: "label", + }, "texts", true) + + check("masked_language_modeling", SpecArgs{ + Table: "t_mlm", Category: "masked_language_modeling", Intent: "train", + }, "sequences", false) +} diff --git a/internal/push/walk.go b/internal/push/walk.go index bb7b15f..a6bb163 100644 --- a/internal/push/walk.go +++ b/internal/push/walk.go @@ -46,16 +46,46 @@ type LocalLayout struct { // Images is the list of absolute paths to image 
files under // Root/images/. Order is filesystem-walk order — Discover // doesn't sort, so callers that need determinism (e.g. - // reproducible-build tests) sort before use. + // reproducible-build tests) sort before use. Empty for non-image + // categories (which use Sidecars instead). Images []string + // Sidecars maps a sidecar directory name (e.g. "texts", + // "sequences", "annotations", "masks") to the absolute paths of + // the files in it. Each is staged under "<name>/" — the + // generic counterpart to Images, used by the text family (and, + // later, object detection / segmentation). nil for image_ + // classification (which uses Images) and tabular (no sidecars). + Sidecars map[string][]string + + // ExtraFiles maps a staged destination filename to its absolute + // source path, for single root-level files beyond labels.csv — + // e.g. masked_language_modeling's tokenizer.json, which the + // ingestor reads from SRC_PATH/tokenizer.json. Staged verbatim at + // the table root. + ExtraFiles map[string]string + // TotalBytes is the sum of all files Discover will stage — - // labels.csv plus every entry in Images. Pre-computed during - // the walk so the size-cap check + the progress bar (PR-b) - // can read it without re-stat'ing. + // labels.csv plus every entry in Images / Sidecars / ExtraFiles. + // Pre-computed during the walk so the size-cap check + the + // progress bar can read it without re-stat'ing. TotalBytes int64 } +// FileCount returns the total number of files this layout stages: +// labels.csv, every ExtraFile, and every Images / Sidecars entry. Used +// for the "staging N files" messaging so it's accurate across all +// category families. +func (l *LocalLayout) FileCount() int { + n := 1 // labels.csv + n += len(l.Images) + n += len(l.ExtraFiles) + for _, files := range l.Sidecars { + n += len(files) + } + return n +} + // imageExtensions accepts the file types the chart's // image_classification ingestor processes by default. 
From // data-ingestors' FileTypeValidator(images) defaults: .jpg, .jpeg,