Commit d481ccc

default to no row limit inside data cloud
1 parent 884a9e0 commit d481ccc

8 files changed

Lines changed: 221 additions & 31 deletions

src/datacustomcode/client.py

Lines changed: 12 additions & 4 deletions
@@ -185,25 +185,33 @@ def _new_function_client(cls) -> Client:
         )
         return cls._instance

-    def read_dlo(self, name: str, row_limit: int = 1000) -> PySparkDataFrame:
+    def read_dlo(
+        self, name: str, row_limit: Optional[int] = None
+    ) -> PySparkDataFrame:
         """Read a DLO from Data Cloud.

         Args:
             name: The name of the DLO to read.
-            row_limit: Maximum number of rows to fetch (default: 1000).
+            row_limit: Maximum number of rows to fetch. When ``None``, the
+                reader's configured ``default_row_limit`` is used (1000 for
+                local development, no limit when deployed).

         Returns:
             A PySpark DataFrame containing the DLO data.
         """
         self._record_dlo_access(name)
         return self._reader.read_dlo(name, row_limit=row_limit)

-    def read_dmo(self, name: str, row_limit: int = 1000) -> PySparkDataFrame:
+    def read_dmo(
+        self, name: str, row_limit: Optional[int] = None
+    ) -> PySparkDataFrame:
         """Read a DMO from Data Cloud.

         Args:
             name: The name of the DMO to read.
-            row_limit: Maximum number of rows to fetch (default: 1000).
+            row_limit: Maximum number of rows to fetch. When ``None``, the
+                reader's configured ``default_row_limit`` is used (1000 for
+                local development, no limit when deployed).

         Returns:
             A PySpark DataFrame containing the DMO data.

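The net effect for callers: both read methods can now be called without a limit and do the right thing per environment. A minimal usage sketch follows; how a ``Client`` is obtained is outside this diff, so ``Client()`` below stands in for whatever factory the package provides, and the object names are placeholders.

from datacustomcode.client import Client

client = Client()  # assumption: a Client wired up per config.yaml

# No row_limit: the reader's configured default_row_limit applies
# (1000 in local development, unlimited when deployed).
df_full = client.read_dlo("MyDlo")

# An explicit row_limit always takes precedence over the default.
df_sample = client.read_dmo("MyDmo", row_limit=500)
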
src/datacustomcode/config.yaml

Lines changed: 1 addition & 0 deletions
@@ -2,6 +2,7 @@ reader_config:
   type_config_name: QueryAPIDataCloudReader
   options:
     credentials_profile: default
+    default_row_limit: 1000

 writer_config:
   type_config_name: PrintDataCloudWriter

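A sketch of how the new option plausibly reaches the reader, assuming the config loader forwards ``reader_config.options`` as keyword arguments to the reader named by ``type_config_name`` (the loader itself is not part of this diff):

import yaml

from datacustomcode.io.reader.query_api import QueryAPIDataCloudReader

with open("src/datacustomcode/config.yaml") as f:
    config = yaml.safe_load(f)

options = config["reader_config"]["options"]
# options == {"credentials_profile": "default", "default_row_limit": 1000}
# reader = QueryAPIDataCloudReader(spark, **options)  # needs a live SparkSession
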
src/datacustomcode/io/reader/base.py

Lines changed: 3 additions & 3 deletions
@@ -15,7 +15,7 @@
 from __future__ import annotations

 from abc import abstractmethod
-from typing import TYPE_CHECKING, Union
+from typing import TYPE_CHECKING, Optional, Union

 from datacustomcode.io.base import BaseDataAccessLayer

@@ -33,13 +33,13 @@ def read_dlo(
         self,
         name: str,
         schema: Union[AtomicType, StructType, str, None] = None,
-        row_limit: int = 1000,
+        row_limit: Optional[int] = None,
     ) -> PySparkDataFrame: ...

     @abstractmethod
     def read_dmo(
         self,
         name: str,
         schema: Union[AtomicType, StructType, str, None] = None,
-        row_limit: int = 1000,
+        row_limit: Optional[int] = None,
     ) -> PySparkDataFrame: ...

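Concrete readers now have to accept ``Optional[int]`` and treat ``None`` as "defer to my configured default". A hypothetical subclass sketch of the contract; the ABC's actual class name sits outside these hunks, so ``BaseDataCloudReader`` is an assumption, as is the ``self.spark`` attribute:

from typing import Optional, Union

from pyspark.sql import DataFrame as PySparkDataFrame


class InMemoryReader(BaseDataCloudReader):  # hypothetical ABC name
    def read_dlo(
        self,
        name: str,
        schema: Union["AtomicType", "StructType", str, None] = None,
        row_limit: Optional[int] = None,
    ) -> PySparkDataFrame:
        df = self.spark.table(name)
        # This reader has no default of its own, so None simply means "no limit".
        return df.limit(row_limit) if row_limit is not None else df

    # read_dmo would follow the same pattern.
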
src/datacustomcode/io/reader/query_api.py

Lines changed: 34 additions & 8 deletions
@@ -37,6 +37,7 @@


 SQL_QUERY_TEMPLATE: Final = "SELECT * FROM {} LIMIT {}"
+SQL_QUERY_TEMPLATE_NO_LIMIT: Final = "SELECT * FROM {}"


 def create_cdp_connection(
@@ -122,6 +123,7 @@ def __init__(
         credentials_profile: str = "default",
         dataspace: Optional[str] = None,
         sf_cli_org: Optional[str] = None,
+        default_row_limit: Optional[int] = None,
     ) -> None:
         """Initialize QueryAPIDataCloudReader.

@@ -137,8 +139,12 @@ def __init__(
                 reader delegates to :class:`SFCLIDataCloudReader` which calls
                 the Data Cloud REST API directly using the token obtained from
                 ``sf org display``, bypassing the CDP token-exchange flow.
+            default_row_limit: Default maximum number of rows to fetch when
+                ``row_limit`` is not explicitly passed to read methods. When
+                ``None``, no limit is applied (all rows are returned).
         """
         self.spark = spark
+        self._default_row_limit = default_row_limit
         if sf_cli_org:
             logger.debug(
                 f"Initializing QueryAPIDataCloudReader with SF CLI org '{sf_cli_org}'"
@@ -147,6 +153,7 @@ def __init__(
                 spark=spark,
                 sf_cli_org=sf_cli_org,
                 dataspace=dataspace,
+                default_row_limit=default_row_limit,
             )
             self._conn = None
         else:
@@ -158,19 +165,37 @@ def __init__(
             )
             self._conn = create_cdp_connection(credentials, dataspace)

+    def _build_query(self, name: str, row_limit: Optional[int]) -> str:
+        """Build a SQL query, applying the default row limit when needed.
+
+        Args:
+            name: Object name to query.
+            row_limit: Explicit row limit, or ``None`` to use the configured default.
+
+        Returns:
+            SQL query string.
+        """
+        effective_limit = (
+            row_limit if row_limit is not None else self._default_row_limit
+        )
+        if effective_limit is not None:
+            return SQL_QUERY_TEMPLATE.format(name, effective_limit)
+        return SQL_QUERY_TEMPLATE_NO_LIMIT.format(name)
+
     def read_dlo(
         self,
         name: str,
         schema: Union[AtomicType, StructType, str, None] = None,
-        row_limit: int = 1000,
+        row_limit: Optional[int] = None,
     ) -> PySparkDataFrame:
         """
-        Read a Data Lake Object (DLO) from the Data Cloud, limited to a number of rows.
+        Read a Data Lake Object (DLO) from the Data Cloud.

         Args:
             name (str): The name of the DLO.
             schema (Optional[Union[AtomicType, StructType, str]]): Schema of the DLO.
-            row_limit (int): Maximum number of rows to fetch.
+            row_limit (Optional[int]): Maximum number of rows to fetch.
+                When ``None``, the configured ``default_row_limit`` is used.

         Returns:
             PySparkDataFrame: The PySpark DataFrame.
@@ -181,7 +206,7 @@ def read_dlo(
         if sf_cli_reader is not None:
             return sf_cli_reader.read_dlo(name, schema, row_limit)

-        query = SQL_QUERY_TEMPLATE.format(name, row_limit)
+        query = self._build_query(name, row_limit)

         assert self._conn is not None
         pandas_df = self._conn.get_pandas_dataframe(query)
@@ -197,15 +222,16 @@ def read_dmo(
         self,
         name: str,
         schema: Union[AtomicType, StructType, str, None] = None,
-        row_limit: int = 1000,
+        row_limit: Optional[int] = None,
     ) -> PySparkDataFrame:
         """
-        Read a Data Model Object (DMO) from the Data Cloud, limited to a number of rows.
+        Read a Data Model Object (DMO) from the Data Cloud.

         Args:
             name (str): The name of the DMO.
             schema (Optional[Union[AtomicType, StructType, str]]): Schema of the DMO.
-            row_limit (int): Maximum number of rows to fetch.
+            row_limit (Optional[int]): Maximum number of rows to fetch.
+                When ``None``, the configured ``default_row_limit`` is used.

         Returns:
             PySparkDataFrame: The PySpark DataFrame.
@@ -216,7 +242,7 @@ def read_dmo(
         if sf_cli_reader is not None:
             return sf_cli_reader.read_dmo(name, schema, row_limit)

-        query = SQL_QUERY_TEMPLATE.format(name, row_limit)
+        query = self._build_query(name, row_limit)

         assert self._conn is not None
         pandas_df = self._conn.get_pandas_dataframe(query)

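The limit-resolution rule is easiest to see standalone. The sketch below re-states the body of ``_build_query`` as a free function for illustration (the real method reads ``self._default_row_limit``):

from typing import Optional

from datacustomcode.io.reader.query_api import (
    SQL_QUERY_TEMPLATE,
    SQL_QUERY_TEMPLATE_NO_LIMIT,
)


def build_query(name: str, row_limit: Optional[int], default_row_limit: Optional[int]) -> str:
    # An explicit limit wins; otherwise fall back to the configured default.
    effective = row_limit if row_limit is not None else default_row_limit
    if effective is not None:
        return SQL_QUERY_TEMPLATE.format(name, effective)
    return SQL_QUERY_TEMPLATE_NO_LIMIT.format(name)


assert build_query("MyDlo", None, 1000) == "SELECT * FROM MyDlo LIMIT 1000"  # local default
assert build_query("MyDlo", 50, 1000) == "SELECT * FROM MyDlo LIMIT 50"      # explicit wins
assert build_query("MyDlo", None, None) == "SELECT * FROM MyDlo"             # deployed: no LIMIT
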
src/datacustomcode/io/reader/sf_cli.py

Lines changed: 18 additions & 7 deletions
@@ -55,6 +55,7 @@
         spark: SparkSession,
         sf_cli_org: str,
         dataspace: Optional[str] = None,
+        default_row_limit: Optional[int] = None,
     ) -> None:
         """Initialize SFCLIDataCloudReader.

@@ -64,9 +65,13 @@
                 (e.g. the alias given to ``sf org login web --alias dev1``).
             dataspace: Optional dataspace identifier. If ``None`` or
                 ``"default"`` the query runs against the default dataspace.
+            default_row_limit: Default maximum number of rows to fetch when
+                ``row_limit`` is not explicitly passed to read methods. When
+                ``None``, no limit is applied (all rows are returned).
         """
         self.spark = spark
         self.sf_cli_org = sf_cli_org
+        self._default_row_limit = default_row_limit
         self.dataspace = (
             dataspace if dataspace and dataspace != "default" else "default"
         )
@@ -132,12 +137,12 @@ def _get_token(self) -> tuple[str, str]:
         logger.debug(f"Fetched token from SF CLI for org '{self.sf_cli_org}'")
         return access_token, instance_url

-    def _execute_query(self, sql: str, row_limit: int) -> pd.DataFrame:
+    def _execute_query(self, sql: str, row_limit: Optional[int]) -> pd.DataFrame:
         """Execute *sql* against the Data Cloud REST endpoint.

         Args:
             sql: Base SQL query (no ``LIMIT`` clause).
-            row_limit: Maximum rows to return.
+            row_limit: Maximum rows to return, or ``None`` for no limit.

         Returns:
             Pandas DataFrame with query results.
@@ -147,10 +152,16 @@ def _execute_query(self, sql: str, row_limit: Optional[int]) -> pd.DataFrame:
         """
         access_token, instance_url = self._get_token()

+        effective_limit = (
+            row_limit if row_limit is not None else self._default_row_limit
+        )
         url = f"{instance_url}/services/data/{API_VERSION}/ssot/query-sql"
         headers = {"Authorization": f"Bearer {access_token}"}
         params = {"dataspace": self.dataspace}
-        body = {"sql": f"{sql} LIMIT {row_limit}"}
+        if effective_limit is not None:
+            body = {"sql": f"{sql} LIMIT {effective_limit}"}
+        else:
+            body = {"sql": sql}

         logger.debug(f"Executing Data Cloud query: {body['sql']}")

@@ -190,14 +201,14 @@ def read_dlo(
         self,
         name: str,
         schema: Union[AtomicType, StructType, str, None] = None,
-        row_limit: int = 1000,
+        row_limit: Optional[int] = None,
     ) -> PySparkDataFrame:
         """Read a Data Lake Object (DLO) from Data Cloud.

         Args:
             name: DLO name.
             schema: Optional explicit schema.
-            row_limit: Maximum rows to fetch.
+            row_limit: Maximum rows to fetch, or ``None`` to use the configured default.

         Returns:
             PySpark DataFrame.
@@ -211,14 +222,14 @@ def read_dmo(
         self,
         name: str,
         schema: Union[AtomicType, StructType, str, None] = None,
-        row_limit: int = 1000,
+        row_limit: Optional[int] = None,
     ) -> PySparkDataFrame:
         """Read a Data Model Object (DMO) from Data Cloud.

         Args:
             name: DMO name.
             schema: Optional explicit schema.
-            row_limit: Maximum rows to fetch.
+            row_limit: Maximum rows to fetch, or ``None`` to use the configured default.

         Returns:
             PySpark DataFrame.

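For orientation, the request ``_execute_query`` ends up issuing looks roughly like this sketch. The token and instance URL really come from ``sf org display`` and ``API_VERSION`` is defined elsewhere in the module, so the values below are placeholders; sending the body as JSON is also an assumption, since the actual send happens outside these hunks.

import requests

instance_url = "https://example.my.salesforce.com"  # placeholder
access_token = "<token from sf org display>"        # placeholder
api_version = "vXX.X"                               # stands in for API_VERSION

sql = "SELECT * FROM MyDlo"
effective_limit = None  # deployed default: the LIMIT clause is omitted entirely

if effective_limit is not None:
    body = {"sql": f"{sql} LIMIT {effective_limit}"}
else:
    body = {"sql": sql}

response = requests.post(
    f"{instance_url}/services/data/{api_version}/ssot/query-sql",
    headers={"Authorization": f"Bearer {access_token}"},
    params={"dataspace": "default"},
    json=body,  # assumption: body serialized as JSON
)
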
tests/io/reader/test_query_api.py

Lines changed: 82 additions & 0 deletions
@@ -20,6 +20,7 @@

 from datacustomcode.io.reader.query_api import (
     SQL_QUERY_TEMPLATE,
+    SQL_QUERY_TEMPLATE_NO_LIMIT,
     QueryAPIDataCloudReader,
 )
 from datacustomcode.io.reader.utils import _pandas_to_spark_schema
@@ -188,6 +189,7 @@ def reader_without_init(self, mock_spark_session):
         with patch.object(QueryAPIDataCloudReader, "__init__", return_value=None):
             reader = QueryAPIDataCloudReader(None)  # None is ignored due to mock
             reader.spark = mock_spark_session
+            reader._default_row_limit = 1000
             yield reader

     def test_pandas_to_spark_schema_function(self):
@@ -341,3 +343,83 @@ def test_read_dmo_schema_is_lowercase(

     _, schema_arg = reader_without_init.spark.createDataFrame.call_args[0]
     assert all(f.name == f.name.lower() for f in schema_arg.fields)
+
+
+@pytest.mark.usefixtures("patch_all_requests")
+class TestQueryAPIDataCloudReaderNoDefaultLimit:
+    """Tests for deployed behavior where default_row_limit is None (no limit)."""
+
+    @pytest.fixture(scope="class", autouse=True)
+    def patch_all_requests(self, request):
+        patches = []
+        for target in [
+            "requests.get",
+            "requests.post",
+            "requests.session",
+            "requests.adapters.HTTPAdapter.send",
+            "urllib3.connectionpool.HTTPConnectionPool.urlopen",
+        ]:
+            patcher = patch(target)
+            patches.append(patcher)
+            patcher.start()
+
+        def fin():
+            for patcher in patches:
+                patcher.stop()
+
+        request.addfinalizer(fin)
+
+    @pytest.fixture
+    def mock_spark_session(self):
+        spark = MagicMock()
+        spark.createDataFrame.return_value = spark
+        return spark
+
+    @pytest.fixture
+    def mock_pandas_dataframe(self):
+        return pd.DataFrame({"Col1__c": [1, 2], "Col2__c": ["a", "b"]})
+
+    @pytest.fixture
+    def mock_connection(self, mock_pandas_dataframe):
+        mock_conn = MagicMock()
+        mock_conn.get_pandas_dataframe.return_value = mock_pandas_dataframe
+        return mock_conn
+
+    @pytest.fixture
+    def reader_no_limit(self, mock_spark_session):
+        """Reader with no default row limit (simulates deployed environment)."""
+        with patch.object(QueryAPIDataCloudReader, "__init__", return_value=None):
+            reader = QueryAPIDataCloudReader(None)
+            reader.spark = mock_spark_session
+            reader._default_row_limit = None
+            yield reader
+
+    def test_read_dlo_no_limit_when_deployed(
+        self, reader_no_limit, mock_connection, mock_pandas_dataframe
+    ):
+        """When default_row_limit is None and no explicit row_limit, omit LIMIT."""
+        reader_no_limit._conn = mock_connection
+        reader_no_limit.read_dlo("test_dlo")
+        mock_connection.get_pandas_dataframe.assert_called_once_with(
+            SQL_QUERY_TEMPLATE_NO_LIMIT.format("test_dlo")
+        )
+
+    def test_read_dmo_no_limit_when_deployed(
+        self, reader_no_limit, mock_connection, mock_pandas_dataframe
+    ):
+        """When default_row_limit is None and no explicit row_limit, omit LIMIT."""
+        reader_no_limit._conn = mock_connection
+        reader_no_limit.read_dmo("test_dmo")
+        mock_connection.get_pandas_dataframe.assert_called_once_with(
+            SQL_QUERY_TEMPLATE_NO_LIMIT.format("test_dmo")
+        )
+
+    def test_read_dlo_explicit_limit_still_applied_when_deployed(
+        self, reader_no_limit, mock_connection, mock_pandas_dataframe
+    ):
+        """An explicit row_limit always applies, even without a default."""
+        reader_no_limit._conn = mock_connection
+        reader_no_limit.read_dlo("test_dlo", row_limit=500)
+        mock_connection.get_pandas_dataframe.assert_called_once_with(
+            SQL_QUERY_TEMPLATE.format("test_dlo", 500)
+        )

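The new class can be run on its own with the usual pytest selection, e.g.:

pytest tests/io/reader/test_query_api.py -k NoDefaultLimit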