Commit aa87f55

Merging from main and adding testcase
2 parents: e24ae3e + 243fbb5

12 files changed

Lines changed: 216 additions & 113 deletions

CHANGELOG.md

Lines changed: 20 additions & 0 deletions
@@ -1,5 +1,25 @@
 # Changelog

+## 2.0.0
+
+### Breaking Changes
+
+- **Removed the `row_limit` parameter from `read_dlo()` and `read_dmo()`.**
+
+  These methods no longer accept a `row_limit` argument. When running locally, reads are automatically capped at 1000 rows to prevent accidentally fetching large datasets during development. When deployed to Data Cloud, no limit is applied and all records are returned.
+
+  **Why:** The `row_limit` parameter duplicated PySpark's built-in `.limit()` and created a behavioral difference between local and deployed environments. The 1000-row safety net is now handled internally via the `default_row_limit` setting in `config.yaml`, and deployed environments naturally omit it.
+
+  **Migration:** Remove any `row_limit` arguments from your `read_dlo()` and `read_dmo()` calls. If you need a specific number of rows, use PySpark's `.limit()` on the returned DataFrame:
+
+  ```python
+  # Before
+  df = client.read_dlo("MyObject__dll", row_limit=500)
+
+  # After
+  df = client.read_dlo("MyObject__dll").limit(500)
+  ```
+
 ## 1.0.0

 ### Breaking Changes

src/datacustomcode/client.py

Lines changed: 4 additions & 6 deletions
@@ -185,31 +185,29 @@ def _new_function_client(cls) -> Client:
         )
         return cls._instance

-    def read_dlo(self, name: str, row_limit: int = 1000) -> PySparkDataFrame:
+    def read_dlo(self, name: str) -> PySparkDataFrame:
         """Read a DLO from Data Cloud.

         Args:
             name: The name of the DLO to read.
-            row_limit: Maximum number of rows to fetch (default: 1000).

         Returns:
             A PySpark DataFrame containing the DLO data.
         """
         self._record_dlo_access(name)
-        return self._reader.read_dlo(name, row_limit=row_limit)  # type: ignore[no-any-return]
+        return self._reader.read_dlo(name)  # type: ignore[no-any-return]

-    def read_dmo(self, name: str, row_limit: int = 1000) -> PySparkDataFrame:
+    def read_dmo(self, name: str) -> PySparkDataFrame:
         """Read a DMO from Data Cloud.

         Args:
             name: The name of the DMO to read.
-            row_limit: Maximum number of rows to fetch (default: 1000).

         Returns:
             A PySpark DataFrame containing the DMO data.
         """
         self._record_dmo_access(name)
-        return self._reader.read_dmo(name, row_limit=row_limit)  # type: ignore[no-any-return]
+        return self._reader.read_dmo(name)  # type: ignore[no-any-return]

     def write_to_dlo(
         self, name: str, dataframe: PySparkDataFrame, write_mode: WriteMode, **kwargs

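With `row_limit` gone, both reader methods share one call shape and row counts are controlled on the returned DataFrame. A usage sketch of the new signatures, assuming an already-configured `Client` (construction and credentials are illustrative; the object names are placeholders):

```python
from datacustomcode.client import Client

client = Client()  # hypothetical setup; actual construction follows the project's config

# Locally this is capped by default_row_limit (1000 rows) from config.yaml;
# deployed to Data Cloud, no cap is applied and all rows come back.
df = client.read_dlo("MyObject__dll")

# Need an explicit cap? Chain PySpark's .limit() on the result instead.
preview = client.read_dmo("MyModelObject__dlm").limit(10)
preview.show()
```
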
src/datacustomcode/config.yaml

Lines changed: 1 addition & 0 deletions
@@ -2,6 +2,7 @@ reader_config:
   type_config_name: QueryAPIDataCloudReader
   options:
     credentials_profile: default
+    default_row_limit: 1000

 writer_config:
   type_config_name: PrintDataCloudWriter

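The local 1000-row cap now lives entirely in the reader options. A minimal sketch of what the reader receives after this change (inline YAML copied from the diff; PyYAML is assumed to be available):

```python
import yaml  # assumes PyYAML is installed

config_text = """
reader_config:
  type_config_name: QueryAPIDataCloudReader
  options:
    credentials_profile: default
    default_row_limit: 1000
"""

options = yaml.safe_load(config_text)["reader_config"]["options"]
# Locally the reader is constructed with default_row_limit=1000, so queries get a
# LIMIT clause; a deployed config that omits the key leaves it None and no LIMIT is added.
print(options)  # {'credentials_profile': 'default', 'default_row_limit': 1000}
```
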
src/datacustomcode/io/reader/base.py

Lines changed: 0 additions & 2 deletions
@@ -33,13 +33,11 @@ def read_dlo(
         self,
         name: str,
         schema: Union[AtomicType, StructType, str, None] = None,
-        row_limit: int = 1000,
     ) -> PySparkDataFrame: ...

     @abstractmethod
     def read_dmo(
         self,
         name: str,
         schema: Union[AtomicType, StructType, str, None] = None,
-        row_limit: int = 1000,
     ) -> PySparkDataFrame: ...

src/datacustomcode/io/reader/query_api.py

Lines changed: 26 additions & 10 deletions
@@ -37,6 +37,7 @@


 SQL_QUERY_TEMPLATE: Final = "SELECT * FROM {} LIMIT {}"
+SQL_QUERY_TEMPLATE_NO_LIMIT: Final = "SELECT * FROM {}"


 def create_cdp_connection(
@@ -122,6 +123,7 @@ def __init__(
         credentials_profile: str = "default",
         dataspace: Optional[str] = None,
         sf_cli_org: Optional[str] = None,
+        default_row_limit: Optional[int] = None,
     ) -> None:
         """Initialize QueryAPIDataCloudReader.

@@ -137,8 +139,12 @@ def __init__(
                 reader delegates to :class:`SFCLIDataCloudReader` which calls
                 the Data Cloud REST API directly using the token obtained from
                 ``sf org display``, bypassing the CDP token-exchange flow.
+            default_row_limit: Maximum number of rows to fetch automatically.
+                When ``None``, no limit is applied (all rows are returned).
+                Set via ``default_row_limit`` in ``config.yaml`` reader options.
         """
         self.spark = spark
+        self._default_row_limit = default_row_limit
         if sf_cli_org:
             logger.debug(
                 f"Initializing QueryAPIDataCloudReader with SF CLI org '{sf_cli_org}'"
@@ -147,6 +153,7 @@ def __init__(
                 spark=spark,
                 sf_cli_org=sf_cli_org,
                 dataspace=dataspace,
+                default_row_limit=default_row_limit,
             )
             self._conn = None
         else:
@@ -158,19 +165,30 @@ def __init__(
             )
             self._conn = create_cdp_connection(credentials, dataspace)

+    def _build_query(self, name: str) -> str:
+        """Build a SQL query, applying the configured default row limit.
+
+        Args:
+            name: Object name to query.
+
+        Returns:
+            SQL query string.
+        """
+        if self._default_row_limit is not None:
+            return SQL_QUERY_TEMPLATE.format(name, self._default_row_limit)
+        return SQL_QUERY_TEMPLATE_NO_LIMIT.format(name)
+
     def read_dlo(
         self,
         name: str,
         schema: Union[AtomicType, StructType, str, None] = None,
-        row_limit: int = 1000,
     ) -> PySparkDataFrame:
         """
-        Read a Data Lake Object (DLO) from the Data Cloud, limited to a number of rows.
+        Read a Data Lake Object (DLO) from the Data Cloud.

         Args:
             name (str): The name of the DLO.
             schema (Optional[Union[AtomicType, StructType, str]]): Schema of the DLO.
-            row_limit (int): Maximum number of rows to fetch.

         Returns:
             PySparkDataFrame: The PySpark DataFrame.
@@ -179,9 +197,9 @@ def read_dlo(
             self, "_sf_cli_reader", None
         )
         if sf_cli_reader is not None:
-            return sf_cli_reader.read_dlo(name, schema, row_limit)  # type: ignore[no-any-return]
+            return sf_cli_reader.read_dlo(name, schema)  # type: ignore[no-any-return]

-        query = SQL_QUERY_TEMPLATE.format(name, row_limit)
+        query = self._build_query(name)

         assert self._conn is not None
         pandas_df = self._conn.get_pandas_dataframe(query)
@@ -197,15 +215,13 @@ def read_dmo(
         self,
         name: str,
         schema: Union[AtomicType, StructType, str, None] = None,
-        row_limit: int = 1000,
     ) -> PySparkDataFrame:
         """
-        Read a Data Model Object (DMO) from the Data Cloud, limited to a number of rows.
+        Read a Data Model Object (DMO) from the Data Cloud.

         Args:
             name (str): The name of the DMO.
             schema (Optional[Union[AtomicType, StructType, str]]): Schema of the DMO.
-            row_limit (int): Maximum number of rows to fetch.

         Returns:
             PySparkDataFrame: The PySpark DataFrame.
@@ -214,9 +230,9 @@ def read_dmo(
             self, "_sf_cli_reader", None
         )
         if sf_cli_reader is not None:
-            return sf_cli_reader.read_dmo(name, schema, row_limit)  # type: ignore[no-any-return]
+            return sf_cli_reader.read_dmo(name, schema)  # type: ignore[no-any-return]

-        query = SQL_QUERY_TEMPLATE.format(name, row_limit)
+        query = self._build_query(name)

         assert self._conn is not None
         pandas_df = self._conn.get_pandas_dataframe(query)

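The limit handling is concentrated in `_build_query`. A standalone sketch of that logic, with the two templates copied from the diff and the method rewritten as a free function purely for illustration:

```python
from typing import Final, Optional

SQL_QUERY_TEMPLATE: Final = "SELECT * FROM {} LIMIT {}"
SQL_QUERY_TEMPLATE_NO_LIMIT: Final = "SELECT * FROM {}"


def build_query(name: str, default_row_limit: Optional[int]) -> str:
    """Free-function mirror of QueryAPIDataCloudReader._build_query."""
    if default_row_limit is not None:
        return SQL_QUERY_TEMPLATE.format(name, default_row_limit)
    return SQL_QUERY_TEMPLATE_NO_LIMIT.format(name)


# Local run (config.yaml supplies default_row_limit: 1000):
print(build_query("MyObject__dll", 1000))  # SELECT * FROM MyObject__dll LIMIT 1000
# Deployed run (no default_row_limit configured):
print(build_query("MyObject__dll", None))  # SELECT * FROM MyObject__dll
```
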
src/datacustomcode/io/reader/sf_cli.py

Lines changed: 15 additions & 9 deletions
@@ -55,6 +55,7 @@ def __init__(
         spark: SparkSession,
         sf_cli_org: str,
         dataspace: Optional[str] = None,
+        default_row_limit: Optional[int] = None,
     ) -> None:
         """Initialize SFCLIDataCloudReader.

@@ -64,9 +65,13 @@ def __init__(
                 (e.g. the alias given to ``sf org login web --alias dev1``).
             dataspace: Optional dataspace identifier. If ``None`` or
                 ``"default"`` the query runs against the default dataspace.
+            default_row_limit: Maximum number of rows to fetch automatically.
+                When ``None``, no limit is applied (all rows are returned).
+                Set via ``default_row_limit`` in ``config.yaml`` reader options.
         """
         self.spark = spark
         self.sf_cli_org = sf_cli_org
+        self._default_row_limit = default_row_limit
         self.dataspace = (
             dataspace if dataspace and dataspace != "default" else "default"
         )
@@ -132,12 +137,14 @@ def _get_token(self) -> tuple[str, str]:
         logger.debug(f"Fetched token from SF CLI for org '{self.sf_cli_org}'")
         return access_token, instance_url

-    def _execute_query(self, sql: str, row_limit: int) -> pd.DataFrame:
+    def _execute_query(self, sql: str) -> pd.DataFrame:
         """Execute *sql* against the Data Cloud REST endpoint.

+        The configured ``default_row_limit`` is automatically appended as a
+        ``LIMIT`` clause when set (typically for local development).
+
         Args:
             sql: Base SQL query (no ``LIMIT`` clause).
-            row_limit: Maximum rows to return.

         Returns:
             Pandas DataFrame with query results.
@@ -150,7 +157,10 @@ def _execute_query(self, sql: str, row_limit: int) -> pd.DataFrame:
         url = f"{instance_url}/services/data/{API_VERSION}/ssot/query-sql"
         headers = {"Authorization": f"Bearer {access_token}"}
         params = {"dataspace": self.dataspace}
-        body = {"sql": f"{sql} LIMIT {row_limit}"}
+        if self._default_row_limit is not None:
+            body = {"sql": f"{sql} LIMIT {self._default_row_limit}"}
+        else:
+            body = {"sql": sql}

         logger.debug(f"Executing Data Cloud query: {body['sql']}")

@@ -190,19 +200,17 @@ def read_dlo(
         self,
         name: str,
         schema: Union[AtomicType, StructType, str, None] = None,
-        row_limit: int = 1000,
     ) -> PySparkDataFrame:
         """Read a Data Lake Object (DLO) from Data Cloud.

         Args:
             name: DLO name.
             schema: Optional explicit schema.
-            row_limit: Maximum rows to fetch.

         Returns:
             PySpark DataFrame.
         """
-        pandas_df = self._execute_query(f"SELECT * FROM {name}", row_limit)
+        pandas_df = self._execute_query(f"SELECT * FROM {name}")
         if not schema:
             schema = _pandas_to_spark_schema(pandas_df)
         return self.spark.createDataFrame(pandas_df, schema)
@@ -211,19 +219,17 @@ def read_dmo(
         self,
         name: str,
         schema: Union[AtomicType, StructType, str, None] = None,
-        row_limit: int = 1000,
     ) -> PySparkDataFrame:
         """Read a Data Model Object (DMO) from Data Cloud.

         Args:
             name: DMO name.
             schema: Optional explicit schema.
-            row_limit: Maximum rows to fetch.

         Returns:
             PySpark DataFrame.
         """
-        pandas_df = self._execute_query(f"SELECT * FROM {name}", row_limit)
+        pandas_df = self._execute_query(f"SELECT * FROM {name}")
         if not schema:
             schema = _pandas_to_spark_schema(pandas_df)
         return self.spark.createDataFrame(pandas_df, schema)

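For the SF CLI path, the same rule is applied when the REST request body is assembled rather than through a SQL template. A sketch of the body construction in `_execute_query`, pulled out as a helper for illustration (endpoint, headers, and auth handling are omitted):

```python
from typing import Optional


def build_request_body(sql: str, default_row_limit: Optional[int]) -> dict:
    """Mirror of the LIMIT handling added to SFCLIDataCloudReader._execute_query."""
    if default_row_limit is not None:
        return {"sql": f"{sql} LIMIT {default_row_limit}"}
    return {"sql": sql}


print(build_request_body("SELECT * FROM MyObject__dll", 1000))
# {'sql': 'SELECT * FROM MyObject__dll LIMIT 1000'}
print(build_request_body("SELECT * FROM MyObject__dll", None))
# {'sql': 'SELECT * FROM MyObject__dll'}
```
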
src/datacustomcode/io/writer/print.py

Lines changed: 1 addition & 1 deletion
@@ -90,7 +90,7 @@ def validate_dataframe_columns_against_dlo(
             schema.
         """
         # Get DLO schema (no data, just schema)
-        dlo_df = self.reader.read_dlo(dlo_name, row_limit=0)
+        dlo_df = self.reader.read_dlo(dlo_name).limit(0)
         dlo_columns = set(dlo_df.columns)
         df_columns = set(dataframe.columns)


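The writer's column validation previously passed `row_limit=0` to fetch only the schema; chaining `.limit(0)` on the returned DataFrame achieves the same thing. A local-Spark sketch of that check, using stand-in DataFrames instead of a real DLO read (column names are placeholders):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()

# Stand-ins: the DLO schema (limit(0) keeps the columns but drops all rows)
# and the DataFrame a user is trying to write.
dlo_df = spark.createDataFrame([("a", 1)], ["Name__c", "Count__c"]).limit(0)
out_df = spark.createDataFrame([("b", 2, "x")], ["Name__c", "Count__c", "Extra__c"])

# Approximation of the column comparison the writer performs.
dlo_columns = set(dlo_df.columns)
df_columns = set(out_df.columns)
print(df_columns - dlo_columns)  # {'Extra__c'}
```
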
src/datacustomcode/llm_gateway/types/generate_text_request_builder.py

Lines changed: 5 additions & 1 deletion
@@ -56,7 +56,11 @@ def set_localization(
         if localization is not None:
             self._localization = localization
         elif locale is not None:
-            self._localization = {"defaultLocale": locale}
+            self._localization = {
+                "defaultLocale": locale,
+                "inputLocales": [{"locale": locale, "probability": 1.0}],
+                "expectedLocales": [locale],
+            }
         else:
             raise ValueError("Must provide either localization or locale")

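The locale-only branch now expands a single locale into a full localization payload instead of just `defaultLocale`. A small sketch of the structure it produces, written as a free function for illustration (the real logic lives inside `set_localization`):

```python
def build_localization(locale: str) -> dict:
    """Mirror of the locale-only branch in set_localization."""
    return {
        "defaultLocale": locale,
        "inputLocales": [{"locale": locale, "probability": 1.0}],
        "expectedLocales": [locale],
    }


print(build_localization("en_US"))
# {'defaultLocale': 'en_US',
#  'inputLocales': [{'locale': 'en_US', 'probability': 1.0}],
#  'expectedLocales': ['en_US']}
```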