"""
Pydantic models for Search Index Chunking V1
"""

from enum import Enum
from typing import (
    Dict,
    List,
    Union,
)

from pydantic import (
    BaseModel,
    ConfigDict,
    Field,
)


class DocumentType(str, Enum):
    """Document type enumeration.

    Members also subclass ``str``, so each one compares equal to its raw
    string value (for example ``DocumentType.TEXT == "Text"``).
    """

    TEXT = "Text"
    TITLE = "Title"
    TABLE = "Table"
    IMAGE = "Image"
    LIST_ITEM = "ListItem"
    CODE_SNIPPET = "CodeSnippet"
    PAGE_METADATA = "PageMetadata"


class ChunkType(str, Enum):
    """Chunk type enumeration.

    Members also subclass ``str``, so each one compares equal to its raw
    string value (for example ``ChunkType.TEXT == "text"``).
    """

    TEXT = "text"


class SearchIndexChunkingV1PrependField(BaseModel):
    """Field to prepend to chunk content.

    Every attribute defaults to an empty string, and keys that are not
    declared on the model are ignored (``extra="ignore"``).
    """

    dmo_name: str = Field(
        default="", description="Data Model Object name", examples=["udmo_1__dlm"]
    )
    field_name: str = Field(
        default="",
        description="Field name to prepend",
        examples=["ResolvedFilePath__c"],
    )
    value: str = Field(
        default="",
        description="Field value to prepend",
        examples=["udlo_1__dll:quarterly_report.pdf"],
    )
    model_config = ConfigDict(extra="ignore")


class SearchIndexChunkingV1TranscriptField(BaseModel):
    """Speaker and timestamp details for audio/video transcript content.

    Every attribute defaults to an empty string, and keys that are not
    declared on the model are ignored (``extra="ignore"``).
    """

    speaker: str = Field(
        default="",
        description="Speaker name for audio/video transcripts",
        examples=["Agent"],
    )
    # Timestamps are carried as plain strings; the model itself does not
    # parse or validate the ISO8601 format named in the descriptions.
    start_timestamp: str = Field(
        default="",
        description="Start timestamp in ISO8601 format: YYYY-MM-DDTHH:MM:SS.ffffff",
        examples=["2026-03-25T02:01:24.918000"],
    )
    end_timestamp: str = Field(
        default="",
        description="End timestamp in ISO8601 format: YYYY-MM-DDTHH:MM:SS.ffffff",
        examples=["2026-03-25T02:01:30.500000"],
    )
    model_config = ConfigDict(extra="ignore")


class SearchIndexChunkingV1Metadata(BaseModel):
    """Metadata for input documents.

    All attributes are optional: each has a default, and keys that are not
    declared on the model are ignored (``extra="ignore"``).
    """

    type: DocumentType = Field(
        default=DocumentType.TEXT, description="Document type (Text)", examples=["Text"]
    )
    # default_factory gives each instance its own transcript object, which is
    # empty when the input carries no transcript information.
    transcript_fields: SearchIndexChunkingV1TranscriptField = Field(
        default_factory=SearchIndexChunkingV1TranscriptField,
        description=(
            "Transcript information. Will only be there in case of audio-video files"
        ),
    )
    page_number: int = Field(
        default=0,
        description="Page number in the source document (0-based)",
        examples=[1],
    )
    text_as_html: str = Field(
        default="",
        description="HTML representation of the document text",
        examples=["<p>Online Remittance Instructions</p>"],
    )
    source_dmo_fields: Dict[str, Union[str, int]] = Field(
        default_factory=dict,
        description=(
            "Source Data Model Object fields as key-value pairs "
            "(values can be string or int)"
        ),
        examples=[
            {
                "FilePath__c": "quarterly_report.pdf",
                "Size__c": 1377454,
                "ContentType__c": "pdf",
                "LastModified__c": "2026-03-25T02:01:24.918000",
            }
        ],
    )
    prepend: List[SearchIndexChunkingV1PrependField] = Field(
        default_factory=list, description="List of fields to prepend to each chunk"
    )
    model_config = ConfigDict(extra="ignore")


class SearchIndexChunkingV1DocElement(BaseModel):
    """Document element to be chunked.

    Pairs the raw text with its source metadata. Keys that are not declared
    on the model are ignored (``extra="ignore"``).
    """

    text: str = Field(
        default="",
        description="Text content to be chunked",
        examples=[
            (
                "Online Remittance Instructions\n\n"
                "Transfer proceeds from the sale of your ESOP/RSUs easily."
            )
        ],
    )
    # default_factory builds a fresh, all-defaults metadata object when the
    # input omits ``metadata``.
    metadata: SearchIndexChunkingV1Metadata = Field(
        default_factory=SearchIndexChunkingV1Metadata,
        description="Source document metadata",
    )
    model_config = ConfigDict(extra="ignore")


class SearchIndexChunkingV1Output(BaseModel):
    """Output chunk from the chunking process.

    Keys that are not declared on the model are ignored (``extra="ignore"``).
    """

    text: str = Field(
        default="",
        description="Chunk text content",
        examples=["Online Remittance Instructions"],
    )
    # The default has to satisfy ``ge=1``. Pydantic does not validate default
    # values unless ``validate_default=True``, so the earlier default of 0
    # yielded instances that failed validation once dumped and re-parsed.
    seq_no: int = Field(
        default=1, description="Sequential chunk number (1-based)", ge=1, examples=[1]
    )
    chunk_id: str = Field(
        default="",
        description="Unique identifier for this chunk (UUID format)",
        examples=["550e8400-e29b-41d4-a716-446655440000"],
    )
    chunk_type: ChunkType = Field(
        default=ChunkType.TEXT,
        description="Type of chunk (e.g., 'text')",
        examples=["text"],
    )
    citations: Dict[str, str] = Field(
        default_factory=dict,
        description="Citation information as key-value pairs",
        examples=[{"source": "quarterly_report.pdf"}],
    )
    model_config = ConfigDict(extra="ignore")


class SearchIndexChunkingV1Request(BaseModel):
    """Request for Search Index Chunking.

    ``input`` defaults to an empty list, and keys that are not declared on
    the model are ignored (``extra="ignore"``).
    """

    input: List[SearchIndexChunkingV1DocElement] = Field(
        default_factory=list, description="List of documents to be chunked"
    )
    model_config = ConfigDict(extra="ignore")


class SearchIndexChunkingV1Response(BaseModel):
    """Batch response for UDS chunking.

    ``output`` defaults to an empty list, and keys that are not declared on
    the model are ignored (``extra="ignore"``).
    """

    output: List[SearchIndexChunkingV1Output] = Field(
        default_factory=list, description="Flat list of chunks from all docs"
    )
    model_config = ConfigDict(extra="ignore")