Skip to content

Commit 116d2e1

Browse files
committed
refactor(documents): refactor import method for improved efficiency
- Split import_ into smaller, focused methods - Optimize batch import logic - Update test cases to reflect new implementation
1 parent 9284f1e commit 116d2e1

2 files changed

Lines changed: 89 additions & 74 deletions

File tree

src/typesense/documents.py

Lines changed: 74 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -301,83 +301,23 @@ def import_(
301301
It can handle both individual documents and batches of documents.
302302
303303
Args:
304-
documents (Union[bytes, str, List[TDoc]]): The documents to import.
305-
306-
import_parameters (Union[DocumentImportParameters, None], optional):
307-
Parameters for the import operation.
308-
309-
batch_size (Union[int, None], optional): The size of each batch for batch imports.
304+
documents: The documents to import.
305+
import_parameters: Parameters for the import operation.
306+
batch_size: The size of each batch for batch imports.
310307
311308
Returns:
312-
(ImportResponse[TDoc] | str):
313-
The import response, which can be a list of responses or a string.
309+
The import response, which can be a list of responses or a string.
314310
315311
Raises:
316312
TypesenseClientError: If an empty list of documents is provided.
317313
"""
318-
if not isinstance(documents, (str, bytes)):
319-
if batch_size:
320-
response_objs: ImportResponse[TDoc] = []
321-
batch: typing.List[TDoc] = []
322-
for document in documents:
323-
batch.append(document)
324-
if len(batch) == batch_size:
325-
api_response = self.import_(
326-
documents=batch,
327-
import_parameters=import_parameters,
328-
)
329-
response_objs.extend(api_response)
330-
batch = []
331-
if batch:
332-
api_response = self.import_(batch, import_parameters)
333-
response_objs.extend(api_response)
334-
335-
else:
336-
document_strs: typing.List[str] = []
337-
for document in documents:
338-
document_strs.append(json.dumps(document))
339-
340-
if len(document_strs) == 0:
341-
raise TypesenseClientError(
342-
"Cannot import an empty list of documents.",
343-
)
344-
345-
docs_import = "\n".join(document_strs)
346-
res = self.api_call.post(
347-
self._endpoint_path("import"),
348-
body=docs_import,
349-
params=import_parameters,
350-
entity_type=str,
351-
as_json=False,
352-
)
353-
res_obj_strs = res.split("\n")
354-
355-
response_objs = []
356-
for res_obj_str in res_obj_strs:
357-
try:
358-
res_obj_json: typing.Union[
359-
ImportResponseWithDocAndId[TDoc],
360-
ImportResponseWithDoc[TDoc],
361-
ImportResponseWithId,
362-
ImportResponseSuccess,
363-
ImportResponseFail[TDoc],
364-
] = json.loads(res_obj_str)
365-
except json.JSONDecodeError as decode_error:
366-
raise TypesenseClientError(
367-
f"Invalid response - {res_obj_str}",
368-
) from decode_error
369-
response_objs.append(res_obj_json)
370-
371-
return response_objs
372-
else:
373-
api_response = self.api_call.post(
374-
self._endpoint_path("import"),
375-
body=documents,
376-
params=import_parameters,
377-
as_json=False,
378-
entity_type=str,
379-
)
380-
return api_response
314+
if isinstance(documents, (str, bytes)):
315+
return self._import_raw(documents, import_parameters)
316+
317+
if batch_size:
318+
return self._batch_import(documents, import_parameters, batch_size)
319+
320+
return self._bulk_import(documents, import_parameters)
381321

382322
def export(
383323
self,
@@ -462,3 +402,66 @@ def _endpoint_path(self, action: typing.Union[str, None] = None) -> str:
462402
action,
463403
],
464404
)
405+
406+
def _import_raw(
407+
self,
408+
documents: typing.Union[bytes, str],
409+
import_parameters: _ImportParameters,
410+
) -> str:
411+
"""Import raw document data."""
412+
response: str = self.api_call.post(
413+
self._endpoint_path("import"),
414+
body=documents,
415+
params=import_parameters,
416+
as_json=False,
417+
entity_type=str,
418+
)
419+
420+
return response
421+
422+
def _batch_import(
423+
self,
424+
documents: typing.List[TDoc],
425+
import_parameters: _ImportParameters,
426+
batch_size: int,
427+
) -> ImportResponse[TDoc]:
428+
"""Import documents in batches."""
429+
response_objs: ImportResponse[TDoc] = []
430+
for batch_index in range(0, len(documents), batch_size):
431+
batch = documents[batch_index : batch_index + batch_size]
432+
api_response = self._bulk_import(batch, import_parameters)
433+
response_objs.extend(api_response)
434+
return response_objs
435+
436+
def _bulk_import(
437+
self,
438+
documents: typing.List[TDoc],
439+
import_parameters: _ImportParameters,
440+
) -> ImportResponse[TDoc]:
441+
"""Import a list of documents in bulk."""
442+
document_strs = [json.dumps(doc) for doc in documents]
443+
if not document_strs:
444+
raise TypesenseClientError("Cannot import an empty list of documents.")
445+
446+
docs_import = "\n".join(document_strs)
447+
res = self.api_call.post(
448+
self._endpoint_path("import"),
449+
body=docs_import,
450+
params=import_parameters,
451+
entity_type=str,
452+
as_json=False,
453+
)
454+
return self._parse_import_response(res)
455+
456+
def _parse_import_response(self, response: str) -> ImportResponse[TDoc]:
457+
"""Parse the import response string into a list of response objects."""
458+
response_objs: typing.List[ImportResponse] = []
459+
for res_obj_str in response.split("\n"):
460+
try:
461+
res_obj_json = json.loads(res_obj_str)
462+
except json.JSONDecodeError as decode_error:
463+
raise TypesenseClientError(
464+
f"Invalid response - {res_obj_str}",
465+
) from decode_error
466+
response_objs.append(res_obj_json)
467+
return response_objs

tests/documents_test.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,7 @@ def test_import_fail(
223223
"""Test that the Documents object doesn't throw an error when importing documents."""
224224
wrong_company: Companies = {"company_name": "Wrong", "id": "0", "num_employees": 0}
225225
companies = generate_companies + [wrong_company]
226-
request_spy = mocker.spy(actual_documents, "import_")
226+
request_spy = mocker.spy(actual_documents, "_bulk_import")
227227
response = actual_documents.import_(companies)
228228

229229
expected: typing.List[typing.Dict[str, typing.Union[str, bool, int]]] = [
@@ -280,51 +280,62 @@ def test_import_batch_size(
280280
) -> None:
281281
"""Test that the Documents object can import documents in batches."""
282282
batch_size = 5
283-
document_spy = mocker.spy(actual_documents, "import_")
283+
import_spy = mocker.spy(actual_documents, "import_")
284+
batch_import_spy = mocker.spy(actual_documents, "_bulk_import")
284285
request_spy = mocker.spy(actual_api_call, "post")
285286
response = actual_documents.import_(generate_companies, batch_size=batch_size)
286287

287288
expected = [{"success": True} for _ in generate_companies]
288-
assert document_spy.call_count == len(generate_companies) // batch_size + 1
289+
assert import_spy.call_count == 1
290+
assert batch_import_spy.call_count == len(generate_companies) // batch_size
289291
assert request_spy.call_count == len(generate_companies) // batch_size
290292
assert response == expected
291293

292294

293295
def test_import_return_docs(
294296
generate_companies: typing.List[Companies],
295297
actual_documents: Documents[Companies],
298+
mocker: MockFixture,
296299
delete_all: None,
297300
create_collection: None,
298301
) -> None:
299302
"""Test that the Documents object can return documents when importing."""
303+
request_spy = mocker.spy(actual_documents, "_bulk_import")
300304
response = actual_documents.import_(generate_companies, {"return_doc": True})
301305
expected = [
302306
{"success": True, "document": company} for company in generate_companies
303307
]
308+
309+
assert request_spy.call_count == 1
304310
assert response == expected
305311

306312

307313
def test_import_return_ids(
308314
generate_companies: typing.List[Companies],
309315
actual_documents: Documents[Companies],
316+
mocker: MockFixture,
310317
delete_all: None,
311318
create_collection: None,
312319
) -> None:
313320
"""Test that the Documents object can return document IDs when importing."""
321+
request_spy = mocker.spy(actual_documents, "_bulk_import")
314322
response = actual_documents.import_(generate_companies, {"return_id": True})
315323
expected = [
316324
{"success": True, "id": company.get("id")} for company in generate_companies
317325
]
326+
assert request_spy.call_count == 1
318327
assert response == expected
319328

320329

321330
def test_import_return_ids_and_docs(
322331
generate_companies: typing.List[Companies],
323332
actual_documents: Documents[Companies],
333+
mocker: MockFixture,
324334
delete_all: None,
325335
create_collection: None,
326336
) -> None:
327337
"""Test that the Documents object can return document IDs and documents when importing."""
338+
request_spy = mocker.spy(actual_documents, "_bulk_import")
328339
response = actual_documents.import_(
329340
generate_companies,
330341
{"return_id": True, "return_doc": True},
@@ -333,6 +344,7 @@ def test_import_return_ids_and_docs(
333344
{"success": True, "document": company, "id": company.get("id")}
334345
for company in generate_companies
335346
]
347+
assert request_spy.call_count == 1
336348
assert response == expected
337349

338350

0 commit comments

Comments
 (0)