Skip to content

Commit 6df64a5

Browse files
SearchIndexChunking contract
1 parent e000a03 commit 6df64a5

2 files changed

Lines changed: 129 additions & 41 deletions

File tree

src/datacustomcode/function/feature_types/chunking.py

Lines changed: 125 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -14,71 +14,156 @@
1414
# limitations under the License.
1515

1616
"""
17-
Pydantic models for byoc-function-proto (uds_chunking.proto)
18-
Auto-generated - validation rules from buf.validate
17+
Pydantic models for Search Index Chunking V1
1918
"""
20-
2119
from typing import (
22-
Any,
2320
Dict,
2421
List,
22+
Union
2523
)
2624

27-
from pydantic import BaseModel, Field
25+
from enum import Enum
26+
from pydantic import BaseModel, Field, ConfigDict
2827

2928

30-
class SearchIndexDocElement(BaseModel):
31-
"""Document element to be chunked"""
29+
class DocumentType(str, Enum):
30+
"""Document type enumeration"""
31+
TEXT = "Text"
3232

33-
text: str = Field(..., description="Text content to be chunked")
34-
metadata: Dict[str, Any] = Field(
35-
default_factory=dict, description="Source document metadata"
36-
)
3733

34+
class SearchIndexChunkingV1PrependField(BaseModel):
35+
"""Field to prepend to chunk content"""
36+
dmo_name: str = Field(
37+
default="",
38+
description="Data Model Object name",
39+
examples=["udmo_1__dlm"]
40+
)
41+
field_name: str = Field(
42+
default="",
43+
description="Field name to prepend",
44+
examples=["ResolvedFilePath__c"]
45+
)
46+
value: str = Field(
47+
default="",
48+
description="Field value to prepend",
49+
examples=["udlo_1__dll:quarterly_report.pdf"]
50+
)
51+
model_config = ConfigDict(extra='ignore')
3852

39-
class SearchIndexChunkOutput(BaseModel):
40-
"""Output chunk from the chunking process"""
4153

42-
chunk_id: str = Field(..., description="UUID for this chunk")
43-
chunk_type: str = Field(..., description="Type: 'text'")
44-
text: str = Field(..., description="Chunk text content")
45-
seq_no: int = Field(..., description="Sequential chunk number (1-based)")
46-
metadata: Dict[str, str] = Field(
47-
default_factory=dict, description="Metadata from source (DMO fields)"
54+
class SearchIndexChunkingV1Metadata(BaseModel):
55+
"""Metadata for input documents"""
56+
type: DocumentType = Field(
57+
default=DocumentType.TEXT,
58+
description="Document type (Text)",
59+
examples=["Text"]
4860
)
49-
tag_metadata: Dict[str, Any] = Field(
50-
default_factory=dict, description="Additional tags"
61+
page_number: int = Field(
62+
default=0,
63+
description="Page number in the source document (0-based)",
64+
examples=[1]
5165
)
52-
citations: Dict[str, Any] = Field(
53-
default_factory=dict, description="Citation information"
66+
speaker: str = Field(
67+
default="",
68+
description="Speaker name for audio/video transcripts",
69+
examples=["Narrator"]
5470
)
71+
start_timestamp: str = Field(
72+
default="",
73+
description="Start timestamp in ISO8601 format: YYYY-MM-DDTHH:MM:SS.ffffff",
74+
examples=["2026-03-25T02:01:24.918000"]
75+
)
76+
end_timestamp: str = Field(
77+
default="",
78+
description="End timestamp in ISO8601 format: YYYY-MM-DDTHH:MM:SS.ffffff",
79+
examples=["2026-03-25T02:01:30.500000"]
80+
)
81+
text_as_html: str = Field(
82+
default="",
83+
description="HTML representation of the document text",
84+
examples=["<p>Online Remittance Instructions</p>"]
85+
)
86+
source_dmo_fields: Dict[str, Union[str, int]] = Field(
87+
default_factory=dict,
88+
description="Source Data Model Object fields as key-value pairs (values can be string or int)",
89+
examples=[
90+
{
91+
"FilePath__c": "quarterly_report.pdf",
92+
"Size__c": 1377454,
93+
"ContentType__c": "pdf",
94+
"LastModified__c": "2026-03-25T02:01:24.918000"
95+
}
96+
]
97+
)
98+
prepend: List[SearchIndexChunkingV1PrependField] = Field(
99+
default_factory=list,
100+
description="List of fields to prepend to each chunk"
101+
)
102+
model_config = ConfigDict(extra='ignore')
55103

56104

57-
class SearchIndexStatusResponse(BaseModel):
58-
"""Status response for operation"""
105+
class SearchIndexChunkingV1DocElement(BaseModel):
106+
"""Document element to be chunked"""
107+
text: str = Field(
108+
default="",
109+
description="Text content to be chunked",
110+
examples=["Online Remittance Instructions\n\nTransfer proceeds from the sale of your ESOP/RSUs easily."]
111+
)
112+
metadata: SearchIndexChunkingV1Metadata = Field(
113+
default_factory=SearchIndexChunkingV1Metadata,
114+
description="Source document metadata"
115+
)
116+
model_config = ConfigDict(extra='ignore')
59117

60-
status_type: str = Field(..., description="'success' or 'error'")
61-
status_message: str = Field(..., description="Human-readable status")
62118

119+
class SearchIndexChunkingV1Output(BaseModel):
120+
"""Output chunk from the chunking process"""
121+
text: str = Field(
122+
default="",
123+
description="Chunk text content",
124+
examples=["Online Remittance Instructions"]
125+
)
126+
seq_no: int = Field(
127+
default=0,
128+
description="Sequential chunk number (1-based)",
129+
ge=1,
130+
examples=[1]
131+
)
132+
chunk_id: str = Field(
133+
default="",
134+
description="Unique identifier for this chunk (UUID format)",
135+
examples=["550e8400-e29b-41d4-a716-446655440000"]
136+
)
137+
chunk_type: str = Field(
138+
default="",
139+
description="Type of chunk (e.g., 'text')",
140+
examples=["text"]
141+
)
142+
citations: Dict[str, str] = Field(
143+
default_factory=dict,
144+
description="Citation information as key-value pairs",
145+
examples=[{"source": "quarterly_report.pdf"}]
146+
)
147+
metadata: str = Field(
148+
default="",
149+
description="JSON string containing metadata about the chunking output",
150+
examples=['{"page": 1}']
151+
)
152+
model_config = ConfigDict(extra='ignore')
63153

64-
class SearchIndexChunkingV1Request(BaseModel):
65-
"""Batch request for UDS chunking"""
66154

67-
input: List[SearchIndexDocElement] = Field(
68-
..., min_length=1, description="List of documents (min 1)"
69-
)
70-
max_characters: int = Field(..., description="Max chars per chunk (default: 100)")
71-
additional_params: Dict[str, Any] = Field(
72-
default_factory=dict, description="Future extension point"
155+
class SearchIndexChunkingV1Request(BaseModel):
156+
"""Request for Search Index Chunking"""
157+
input: List[SearchIndexChunkingV1DocElement] = Field(
158+
default_factory=list,
159+
description="List of documents to be chunked"
73160
)
161+
model_config = ConfigDict(extra='ignore')
74162

75163

76164
class SearchIndexChunkingV1Response(BaseModel):
77165
"""Batch response for UDS chunking"""
78-
79-
output: List[SearchIndexChunkOutput] = Field(
166+
output: List[SearchIndexChunkingV1Output] = Field(
80167
default_factory=list, description="Flat list of chunks from all docs"
81168
)
82-
status: SearchIndexStatusResponse = Field(
83-
..., description="Overall operation status"
84-
)
169+
model_config = ConfigDict(extra='ignore')

src/datacustomcode/function_utils.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -275,8 +275,11 @@ def _generate_model_sample_data(model_type):
275275

276276
sample_data = {}
277277
for field_name, field_info in model_type.model_fields.items():
278+
# Use examples if available
279+
if field_info.examples and len(field_info.examples) > 0:
280+
sample_data[field_name] = field_info.examples[0]
278281
# Check if field has a real default value
279-
if field_info.default is not PydanticUndefined:
282+
elif field_info.default is not PydanticUndefined:
280283
sample_data[field_name] = field_info.default
281284
else:
282285
# Required field or field without default - generate sample

0 commit comments

Comments
 (0)