Skip to content
37 changes: 6 additions & 31 deletions mssql_python/pybind/ddbc_bindings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4780,9 +4780,8 @@ SQLRETURN FetchArrowBatch_wrap(SqlHandlePtr StatementHandle, py::list& capsules,
ColumnBuffers buffers(numCols, fetchSize);

if (!hasLobColumns && fetchSize > 0) {
// Bind columns — Arrow always uses SQL_C_CHAR for VARCHAR because
// it processes raw byte buffers directly, not via Python codecs.
ret = SQLBindColums(hStmt, buffers, columnNames, numCols, fetchSize, SQL_C_CHAR);
// Always request WCHARs so we don't have to deal with CHAR encodings
ret = SQLBindColums(hStmt, buffers, columnNames, numCols, fetchSize, SQL_C_WCHAR);

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ffelixg, I think we should avoid hardcoding SQL_C_WCHAR here. With the recent design update introduced in PR #495 for CP1252 character set handling, we’ve moved toward a more flexible approach. It would be good to align with that design for Arrow support as well to ensure consistency and maintainability.

if (!SQL_SUCCEEDED(ret)) {
LOG("Error when binding columns");
return ret;
Expand Down Expand Up @@ -4841,20 +4840,12 @@ SQLRETURN FetchArrowBatch_wrap(SqlHandlePtr StatementHandle, py::list& capsules,
}
case SQL_CHAR:
case SQL_VARCHAR:
case SQL_LONGVARCHAR: {
ret = GetDataVar(hStmt, idxCol + 1, SQL_C_CHAR,
buffers.charBuffers[idxCol],
buffers.indicators[idxCol].data());
if (!SQL_SUCCEEDED(ret)) {
LOG("Error fetching CHAR LOB for column %d", idxCol + 1);
return ret;
}
break;
}
case SQL_LONGVARCHAR:
Comment thread
ffelixg marked this conversation as resolved.
case SQL_SS_XML:
case SQL_WCHAR:
case SQL_WVARCHAR:
case SQL_WLONGVARCHAR: {
// Always request WCHARs so we don't have to deal with CHAR encodings.
ret = GetDataVar(hStmt, idxCol + 1, SQL_C_WCHAR,
buffers.wcharBuffers[idxCol],
buffers.indicators[idxCol].data());
Expand Down Expand Up @@ -5093,28 +5084,12 @@ SQLRETURN FetchArrowBatch_wrap(SqlHandlePtr StatementHandle, py::list& capsules,
}
case SQL_CHAR:
case SQL_VARCHAR:
case SQL_LONGVARCHAR: {
#if defined(__APPLE__) || defined(__linux__)
uint64_t fetchBufferSize = columnSize * 4 + 1 /*null-terminator*/;
#else
uint64_t fetchBufferSize = columnSize + 1 /*null-terminator*/;
#endif
auto target_vec = &arrowColumnProducer->varData;
auto start = arrowColumnProducer->varVal[idxRowArrow];
while (target_vec->size() < start + dataLen) {
target_vec->resize(target_vec->size() * 2);
}

std::memcpy(&(*target_vec)[start],
&buffers.charBuffers[idxCol][idxRowSql * fetchBufferSize],
dataLen);
arrowColumnProducer->varVal[idxRowArrow + 1] = start + dataLen;
break;
}
case SQL_LONGVARCHAR:
Comment thread
ffelixg marked this conversation as resolved.
case SQL_SS_XML:
case SQL_WCHAR:
case SQL_WVARCHAR:
case SQL_WLONGVARCHAR: {
// We have previously fetched these as WCHARs, even for SQL_CHAR types.
assert(dataLen % sizeof(SQLWCHAR) == 0);
auto dataLenW = dataLen / sizeof(SQLWCHAR);
auto wcharSource =
Expand Down
78 changes: 78 additions & 0 deletions tests/test_004_cursor_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,84 @@ def test_arrow_long_string(cursor: mssql_python.Cursor):
assert batch.column(0).to_pylist() == [long_string]


@pytest.mark.parametrize(
    "sql_type",
    [
        pytest.param("char(32)", id="char"),
        pytest.param("varchar(32)", id="varchar"),
    ],
)
def test_arrow_char_utf8_collation_unicode(cursor: mssql_python.Cursor, sql_type: str):
    """Round-trip Unicode text through a CHAR/VARCHAR column that uses a UTF-8 collation.

    Creates a temporary table whose ``v`` column has the parametrized type and
    the ``Latin1_General_100_CI_AS_SC_UTF8`` collation, inserts the expected
    values (including an empty string and NULL), reads them back with
    ``.arrow()`` and checks both the Arrow column type and the decoded values.

    The test is skipped when the ``create table`` statement raises, which is
    taken to mean the server does not support the UTF-8 collation.
    """
    table = "#t_arrow_char_decode"
    collation = "Latin1_General_100_CI_AS_SC_UTF8"
    expected = [
        "Grüße",
        "你好😀",
        "こんにちは",
        "Привет",
        "Hello 世界",
        "😀😃😄😁",
        "",
        None,
    ]

    try:
        cursor.execute(
            f"create table {table} (id int primary key, v {sql_type} collate {collation})"
        )
    except Exception as exc:
        pytest.skip(f"UTF-8 collation '{collation}' not supported: {exc}")

    try:
        # ids start at 1 so that "order by id" returns rows in insertion order.
        for index, value in enumerate(expected, start=1):
            cursor.execute(f"insert into {table} (id, v) values (?, ?)", index, value)
        tbl = cursor.execute(f"select v from {table} order by id").arrow()
        assert tbl.column(0).type.equals(pa.large_string())
        # Fixed-width char(n) values are expected to come back padded with
        # trailing spaces. Remove only those: a blanket strip() would also hide
        # unexpected leading whitespace or other trailing whitespace characters.
        actual = [
            None if value is None else value.rstrip(" ")
            for value in tbl.column(0).to_pylist()
        ]
        # Comparing whole lists also fails on a row-count mismatch.
        assert actual == expected
    finally:
        cursor.execute(f"drop table if exists {table}")


@pytest.mark.parametrize(
    "sql_type",
    [
        pytest.param("char(32)", id="char"),
        pytest.param("varchar(32)", id="varchar"),
        pytest.param("text", id="text"),
    ],
)
def test_arrow_char_cp1252_collation_unicode(cursor: mssql_python.Cursor, sql_type: str):
    """Round-trip non-ASCII text through a CHAR/VARCHAR/TEXT column with a CP1252 collation.

    Creates a temporary table whose ``v`` column has the parametrized type and
    the ``SQL_Latin1_General_CP1_CI_AS`` collation, inserts the expected values
    (including an empty string and NULL), reads them back with ``.arrow()`` and
    checks both the Arrow column type and the decoded values.
    """
    table = "#t_arrow_char_decode"
    collation = "SQL_Latin1_General_CP1_CI_AS"
    expected = [
        "Grüße",
        "café René!",
        "naïve café",
        "Español",
        "Müller-Öztürk",
        "Françoise",
        "",
        None,
    ]

    cursor.execute(f"create table {table} (id int primary key, v {sql_type} collate {collation})")

    try:
        # ids start at 1 so that "order by id" returns rows in insertion order.
        for index, value in enumerate(expected, start=1):
            cursor.execute(f"insert into {table} (id, v) values (?, ?)", index, value)
        tbl = cursor.execute(f"select v from {table} order by id").arrow()
        assert tbl.column(0).type.equals(pa.large_string())
        # Fixed-width char(n) values are expected to come back padded with
        # trailing spaces. Remove only those: a blanket strip() would also hide
        # unexpected leading whitespace or other trailing whitespace characters.
        actual = [
            None if value is None else value.rstrip(" ")
            for value in tbl.column(0).to_pylist()
        ]
        # Comparing whole lists also fails on a row-count mismatch.
        assert actual == expected
    finally:
        cursor.execute(f"drop table if exists {table}")

Comment thread
ffelixg marked this conversation as resolved.

def test_rownumber_arrow_batch_interleaved_fetchmany(cursor: mssql_python.Cursor):
"""Verify that arrow_batch and fetchmany can be interleaved
on the same result set with correct rownumber tracking and values."""
Expand Down