SearchIndexChunking contract

ritaagarwala-sf · ritaagarwala-sf · commit 6df64a5b2413 · 2026-05-01T18:49:23.000+05:30
diff --git a/src/datacustomcode/function/feature_types/chunking.py b/src/datacustomcode/function/feature_types/chunking.py
@@ -14,71 +14,156 @@
 # limitations under the License.
 
 """
-Pydantic models for byoc-function-proto (uds_chunking.proto)
-Auto-generated - validation rules from buf.validate
+Pydantic models for Search Index Chunking V1
 """
-
 from typing import (
-    Any,
     Dict,
     List,
+    Union
 )
 
-from pydantic import BaseModel, Field
+from enum import Enum
+from pydantic import BaseModel, Field, ConfigDict
 
 
-class SearchIndexDocElement(BaseModel):
-    """Document element to be chunked"""
+class DocumentType(str, Enum):  # str mixin: each member is also a str equal to its value
+    """Document type enumeration"""
+    TEXT = "Text"  # the only member; used as the default of SearchIndexChunkingV1Metadata.type
 
-    text: str = Field(..., description="Text content to be chunked")
-    metadata: Dict[str, Any] = Field(
-        default_factory=dict, description="Source document metadata"
-    )
 
+class SearchIndexChunkingV1PrependField(BaseModel):
+    """Field to prepend to chunk content"""
+    dmo_name: str = Field(  # all three fields default to "", so an empty object validates
+        default="",
+        description="Data Model Object name",
+        examples=["udmo_1__dlm"]
+    )
+    field_name: str = Field(
+        default="",
+        description="Field name to prepend",
+        examples=["ResolvedFilePath__c"]
+    )
+    value: str = Field(
+        default="",
+        description="Field value to prepend",
+        examples=["udlo_1__dll:quarterly_report.pdf"]
+    )
+    model_config = ConfigDict(extra='ignore')  # unknown input keys are ignored rather than rejected
 
-class SearchIndexChunkOutput(BaseModel):
-    """Output chunk from the chunking process"""
 
-    chunk_id: str = Field(..., description="UUID for this chunk")
-    chunk_type: str = Field(..., description="Type: 'text'")
-    text: str = Field(..., description="Chunk text content")
-    seq_no: int = Field(..., description="Sequential chunk number (1-based)")
-    metadata: Dict[str, str] = Field(
-        default_factory=dict, description="Metadata from source (DMO fields)"
+class SearchIndexChunkingV1Metadata(BaseModel):
+    """Metadata for input documents"""
+    type: DocumentType = Field(
+        default=DocumentType.TEXT,  # omitted type falls back to the single defined member
+        description="Document type (Text)",
+        examples=["Text"]
     )
-    tag_metadata: Dict[str, Any] = Field(
-        default_factory=dict, description="Additional tags"
+    page_number: int = Field(
+        default=0,  # NOTE(review): 0 is also a valid 0-based page, so "unset" and "first page" look the same
+        description="Page number in the source document (0-based)",
+        examples=[1]
     )
-    citations: Dict[str, Any] = Field(
-        default_factory=dict, description="Citation information"
+    speaker: str = Field(
+        default="",
+        description="Speaker name for audio/video transcripts",
+        examples=["Narrator"]
     )
+    start_timestamp: str = Field(  # plain str: the ISO8601 shape in the description is not validated here
+        default="",
+        description="Start timestamp in ISO8601 format: YYYY-MM-DDTHH:MM:SS.ffffff",
+        examples=["2026-03-25T02:01:24.918000"]
+    )
+    end_timestamp: str = Field(  # plain str, same as start_timestamp; no ordering check against it
+        default="",
+        description="End timestamp in ISO8601 format: YYYY-MM-DDTHH:MM:SS.ffffff",
+        examples=["2026-03-25T02:01:30.500000"]
+    )
+    text_as_html: str = Field(
+        default="",
+        description="HTML representation of the document text",
+        examples=["<p>Online Remittance Instructions</p>"]
+    )
+    source_dmo_fields: Dict[str, Union[str, int]] = Field(  # values restricted to str or int
+        default_factory=dict,  # factory gives every instance its own dict
+        description="Source Data Model Object fields as key-value pairs (values can be string or int)",
+        examples=[
+            {
+                "FilePath__c": "quarterly_report.pdf",
+                "Size__c": 1377454,
+                "ContentType__c": "pdf",
+                "LastModified__c": "2026-03-25T02:01:24.918000"
+            }
+        ]
+    )
+    prepend: List[SearchIndexChunkingV1PrependField] = Field(
+        default_factory=list,  # empty list when nothing is to be prepended
+        description="List of fields to prepend to each chunk"
+    )
+    model_config = ConfigDict(extra='ignore')  # unknown input keys are ignored rather than rejected
 
 
-class SearchIndexStatusResponse(BaseModel):
-    """Status response for operation"""
+class SearchIndexChunkingV1DocElement(BaseModel):
+    """Document element to be chunked"""
+    text: str = Field(
+        default="",  # text is optional; an element without it carries an empty string
+        description="Text content to be chunked",
+        examples=["Online Remittance Instructions\n\nTransfer proceeds from the sale of your ESOP/RSUs easily."]
+    )
+    metadata: SearchIndexChunkingV1Metadata = Field(
+        default_factory=SearchIndexChunkingV1Metadata,  # omitted metadata becomes an all-defaults instance
+        description="Source document metadata"
+    )
+    model_config = ConfigDict(extra='ignore')  # unknown input keys are ignored rather than rejected
 
-    status_type: str = Field(..., description="'success' or 'error'")
-    status_message: str = Field(..., description="Human-readable status")
 
+class SearchIndexChunkingV1Output(BaseModel):
+    """Output chunk from the chunking process"""
+    text: str = Field(
+        default="",
+        description="Chunk text content",
+        examples=["Online Remittance Instructions"]
+    )
+    seq_no: int = Field(
+        default=1,  # was 0, which broke ge=1: defaults are not validated, so a default chunk failed re-validation
+        description="Sequential chunk number (1-based)",
+        ge=1,  # enforces the 1-based numbering on every explicitly supplied value
+        examples=[1]
+    )
+    chunk_id: str = Field(  # plain str: the UUID format named in the description is not validated here
+        default="",
+        description="Unique identifier for this chunk (UUID format)",
+        examples=["550e8400-e29b-41d4-a716-446655440000"]
+    )
+    chunk_type: str = Field(
+        default="",
+        description="Type of chunk (e.g., 'text')",
+        examples=["text"]
+    )
+    citations: Dict[str, str] = Field(  # string-to-string mapping only
+        default_factory=dict,  # factory gives every instance its own dict
+        description="Citation information as key-value pairs",
+        examples=[{"source": "quarterly_report.pdf"}]
+    )
+    metadata: str = Field(  # JSON carried as text, not as a dict; the model does not parse or check it
+        default="",
+        description="JSON string containing metadata about the chunking output",
+        examples=['{"page": 1}']
+    )
+    model_config = ConfigDict(extra='ignore')  # unknown input keys are ignored rather than rejected
 
-class SearchIndexChunkingV1Request(BaseModel):
-    """Batch request for UDS chunking"""
 
-    input: List[SearchIndexDocElement] = Field(
-        ..., min_length=1, description="List of documents (min 1)"
-    )
-    max_characters: int = Field(..., description="Max chars per chunk (default: 100)")
-    additional_params: Dict[str, Any] = Field(
-        default_factory=dict, description="Future extension point"
+class SearchIndexChunkingV1Request(BaseModel):
+    """Request for Search Index Chunking"""
+    input: List[SearchIndexChunkingV1DocElement] = Field(
+        default_factory=list,  # no min_length here, so a request with zero documents validates
+        description="List of documents to be chunked"
     )
+    model_config = ConfigDict(extra='ignore')  # unknown input keys are ignored rather than rejected
 
 
 class SearchIndexChunkingV1Response(BaseModel):
     """Batch response for UDS chunking"""
-
-    output: List[SearchIndexChunkOutput] = Field(
+    output: List[SearchIndexChunkingV1Output] = Field(  # sole field after this change: the status field is removed
         default_factory=list, description="Flat list of chunks from all docs"
     )
-    status: SearchIndexStatusResponse = Field(
-        ..., description="Overall operation status"
-    )
+    model_config = ConfigDict(extra='ignore')  # unknown input keys are ignored rather than rejected
diff --git a/src/datacustomcode/function_utils.py b/src/datacustomcode/function_utils.py
@@ -275,8 +275,11 @@ def _generate_model_sample_data(model_type):
 
     sample_data = {}
     for field_name, field_info in model_type.model_fields.items():
+        # Use examples if available
+        if field_info.examples and len(field_info.examples) > 0:
+            sample_data[field_name] = field_info.examples[0]
         # Check if field has a real default value
-        if field_info.default is not PydanticUndefined:
+        elif field_info.default is not PydanticUndefined:
             sample_data[field_name] = field_info.default
         else:
             # Required field or field without default - generate sample