Skip to content

Commit 2116e5e

Browse files
Removing dependency from chunking example
1 parent 6df64a5 commit 2116e5e

4 files changed

Lines changed: 196 additions & 101 deletions

File tree

.github/workflows/sf_cli_integration.yml

Lines changed: 4 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -259,11 +259,10 @@ jobs:
259259
run: |
260260
sf data-code-extension function run \
261261
--entrypoint testFunction/payload/entrypoint.py \
262-
--test-with testFunction/payload/tests/test.json \
263-
-o dev1 || {
264-
echo "::error::sf data-code-extension function run FAILED. Check mock server output above; the --entrypoint flag or SF CLI org auth contract may have changed."
265-
exit 1
266-
}
262+
--test-with testFunction/payload/tests/test.json || {
263+
echo "::error::sf data-code-extension function run FAILED. Check mock server output above; the --entrypoint flag or SF CLI org auth contract may have changed."
264+
exit 1
265+
}
267266
268267
# ── Function: deploy ─────────────────────────────────────────────────────
269268

@@ -275,7 +274,6 @@ jobs:
275274
--description "Test function deploy" \
276275
--package-dir testFunction/payload \
277276
--cpu-size CPU_2XL \
278-
--function-invoke-opt UnstructuredChunking \
279277
-o dev1 || {
280278
echo "::error::sf data-code-extension function deploy FAILED. Check mock server output above for which endpoint failed. The deploy command flags or API contract may have changed."
281279
exit 1

src/datacustomcode/function/feature_types/chunking.py

Lines changed: 81 additions & 54 deletions
Original file line number | Diff line number | Diff line change
@@ -16,154 +16,181 @@
1616
"""
1717
Pydantic models for Search Index Chunking V1
1818
"""
19+
from enum import Enum
1920
from typing import (
2021
Dict,
2122
List,
22-
Union
23+
Union,
2324
)
2425

25-
from enum import Enum
26-
from pydantic import BaseModel, Field, ConfigDict
26+
from pydantic import (
27+
BaseModel,
28+
ConfigDict,
29+
Field,
30+
)
2731

2832

2933
class DocumentType(str, Enum):
3034
"""Document type enumeration"""
35+
3136
TEXT = "Text"
37+
TITLE = "Title"
38+
TABLE = "Table"
39+
IMAGE = "Image"
40+
LIST_ITEM = "ListItem"
41+
CODE_SNIPPET = "CodeSnippet"
42+
PAGE_METADATA = "PageMetadata"
43+
44+
45+
class ChunkType(str, Enum):
46+
TEXT = "text"
3247

3348

3449
class SearchIndexChunkingV1PrependField(BaseModel):
3550
"""Field to prepend to chunk content"""
51+
3652
dmo_name: str = Field(
37-
default="",
38-
description="Data Model Object name",
39-
examples=["udmo_1__dlm"]
53+
default="", description="Data Model Object name", examples=["udmo_1__dlm"]
4054
)
4155
field_name: str = Field(
4256
default="",
4357
description="Field name to prepend",
44-
examples=["ResolvedFilePath__c"]
58+
examples=["ResolvedFilePath__c"],
4559
)
4660
value: str = Field(
4761
default="",
4862
description="Field value to prepend",
49-
examples=["udlo_1__dll:quarterly_report.pdf"]
63+
examples=["udlo_1__dll:quarterly_report.pdf"],
5064
)
51-
model_config = ConfigDict(extra='ignore')
65+
model_config = ConfigDict(extra="ignore")
5266

5367

54-
class SearchIndexChunkingV1Metadata(BaseModel):
55-
"""Metadata for input documents"""
56-
type: DocumentType = Field(
57-
default=DocumentType.TEXT,
58-
description="Document type (Text)",
59-
examples=["Text"]
60-
)
61-
page_number: int = Field(
62-
default=0,
63-
description="Page number in the source document (0-based)",
64-
examples=[1]
65-
)
68+
class SearchIndexChunkingV1TranscriptField(BaseModel):
69+
"""Field to prepend to chunk content"""
70+
6671
speaker: str = Field(
6772
default="",
6873
description="Speaker name for audio/video transcripts",
69-
examples=["Narrator"]
74+
examples=["Agent"],
7075
)
7176
start_timestamp: str = Field(
7277
default="",
7378
description="Start timestamp in ISO8601 format: YYYY-MM-DDTHH:MM:SS.ffffff",
74-
examples=["2026-03-25T02:01:24.918000"]
79+
examples=["2026-03-25T02:01:24.918000"],
7580
)
7681
end_timestamp: str = Field(
7782
default="",
7883
description="End timestamp in ISO8601 format: YYYY-MM-DDTHH:MM:SS.ffffff",
79-
examples=["2026-03-25T02:01:30.500000"]
84+
examples=["2026-03-25T02:01:30.500000"],
85+
)
86+
model_config = ConfigDict(extra="ignore")
87+
88+
89+
class SearchIndexChunkingV1Metadata(BaseModel):
90+
"""Metadata for input documents"""
91+
92+
type: DocumentType = Field(
93+
default=DocumentType.TEXT, description="Document type (Text)", examples=["Text"]
94+
)
95+
transcript_fields: SearchIndexChunkingV1TranscriptField = Field(
96+
default_factory=SearchIndexChunkingV1TranscriptField,
97+
description=(
98+
"Transcript information. Will only be there in case of audio-video files"
99+
),
100+
)
101+
page_number: int = Field(
102+
default=0,
103+
description="Page number in the source document (0-based)",
104+
examples=[1],
80105
)
81106
text_as_html: str = Field(
82107
default="",
83108
description="HTML representation of the document text",
84-
examples=["<p>Online Remittance Instructions</p>"]
109+
examples=["<p>Online Remittance Instructions</p>"],
85110
)
86111
source_dmo_fields: Dict[str, Union[str, int]] = Field(
87112
default_factory=dict,
88-
description="Source Data Model Object fields as key-value pairs (values can be string or int)",
113+
description=(
114+
"Source Data Model Object fields as key-value pairs "
115+
"(values can be string or int)"
116+
),
89117
examples=[
90118
{
91119
"FilePath__c": "quarterly_report.pdf",
92120
"Size__c": 1377454,
93121
"ContentType__c": "pdf",
94-
"LastModified__c": "2026-03-25T02:01:24.918000"
122+
"LastModified__c": "2026-03-25T02:01:24.918000",
95123
}
96-
]
124+
],
97125
)
98126
prepend: List[SearchIndexChunkingV1PrependField] = Field(
99-
default_factory=list,
100-
description="List of fields to prepend to each chunk"
127+
default_factory=list, description="List of fields to prepend to each chunk"
101128
)
102-
model_config = ConfigDict(extra='ignore')
129+
model_config = ConfigDict(extra="ignore")
103130

104131

105132
class SearchIndexChunkingV1DocElement(BaseModel):
106133
"""Document element to be chunked"""
134+
107135
text: str = Field(
108136
default="",
109137
description="Text content to be chunked",
110-
examples=["Online Remittance Instructions\n\nTransfer proceeds from the sale of your ESOP/RSUs easily."]
138+
examples=[
139+
(
140+
"Online Remittance Instructions\n\n"
141+
"Transfer proceeds from the sale of your ESOP/RSUs easily."
142+
)
143+
],
111144
)
112145
metadata: SearchIndexChunkingV1Metadata = Field(
113146
default_factory=SearchIndexChunkingV1Metadata,
114-
description="Source document metadata"
147+
description="Source document metadata",
115148
)
116-
model_config = ConfigDict(extra='ignore')
149+
model_config = ConfigDict(extra="ignore")
117150

118151

119152
class SearchIndexChunkingV1Output(BaseModel):
120153
"""Output chunk from the chunking process"""
154+
121155
text: str = Field(
122156
default="",
123157
description="Chunk text content",
124-
examples=["Online Remittance Instructions"]
158+
examples=["Online Remittance Instructions"],
125159
)
126160
seq_no: int = Field(
127-
default=0,
128-
description="Sequential chunk number (1-based)",
129-
ge=1,
130-
examples=[1]
161+
default=0, description="Sequential chunk number (1-based)", ge=1, examples=[1]
131162
)
132163
chunk_id: str = Field(
133164
default="",
134165
description="Unique identifier for this chunk (UUID format)",
135-
examples=["550e8400-e29b-41d4-a716-446655440000"]
166+
examples=["550e8400-e29b-41d4-a716-446655440000"],
136167
)
137-
chunk_type: str = Field(
138-
default="",
168+
chunk_type: ChunkType = Field(
169+
default=ChunkType.TEXT,
139170
description="Type of chunk (e.g., 'text')",
140-
examples=["text"]
171+
examples=["text"],
141172
)
142173
citations: Dict[str, str] = Field(
143174
default_factory=dict,
144175
description="Citation information as key-value pairs",
145-
examples=[{"source": "quarterly_report.pdf"}]
176+
examples=[{"source": "quarterly_report.pdf"}],
146177
)
147-
metadata: str = Field(
148-
default="",
149-
description="JSON string containing metadata about the chunking output",
150-
examples=['{"page": 1}']
151-
)
152-
model_config = ConfigDict(extra='ignore')
178+
model_config = ConfigDict(extra="ignore")
153179

154180

155181
class SearchIndexChunkingV1Request(BaseModel):
156182
"""Request for Search Index Chunking"""
183+
157184
input: List[SearchIndexChunkingV1DocElement] = Field(
158-
default_factory=list,
159-
description="List of documents to be chunked"
185+
default_factory=list, description="List of documents to be chunked"
160186
)
161-
model_config = ConfigDict(extra='ignore')
187+
model_config = ConfigDict(extra="ignore")
162188

163189

164190
class SearchIndexChunkingV1Response(BaseModel):
165191
"""Batch response for UDS chunking"""
192+
166193
output: List[SearchIndexChunkingV1Output] = Field(
167194
default_factory=list, description="Flat list of chunks from all docs"
168195
)
169-
model_config = ConfigDict(extra='ignore')
196+
model_config = ConfigDict(extra="ignore")

0 commit comments

Comments
 (0)