Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit 811e9f0

Browse files
Changing csv_reader_py impl to return df from objmode (#918)
* Changing csv_reader_py impl to return df from objmode Motivation: returning a Tuple of columns read from a csv file with the pyarrow csv reader from objmode and then calling the init_dataframe ctor to create a native DF turned out to be inefficient in terms of LLVM IR size and compilation time. With this PR we now rely on DF unboxing and return a py DF from objmode. * Capture dtype dict instead of building in objmode * Applying comments #1
1 parent db4f431 commit 811e9f0

6 files changed

Lines changed: 139 additions & 121 deletions

File tree

sdc/datatypes/hpat_pandas_functions.py

Lines changed: 45 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -39,12 +39,13 @@
3939

4040
from sdc.io.csv_ext import (
4141
_gen_csv_reader_py_pyarrow_py_func,
42-
_gen_csv_reader_py_pyarrow_func_text_dataframe,
42+
_gen_pandas_read_csv_func_text,
4343
)
4444
from sdc.str_arr_ext import string_array_type
4545

4646
from sdc.hiframes import join, aggregate, sort
4747
from sdc.types import CategoricalDtypeType, Categorical
48+
from sdc.datatypes.categorical.pdimpl import _reconstruct_CategoricalDtype
4849

4950

5051
def get_numba_array_types_for_csv(df):
@@ -255,45 +256,69 @@ def sdc_pandas_read_csv(
255256
usecols = [col.literal_value for col in usecols]
256257

257258
if infer_from_params:
258-
# dtype should be constants and is important only for inference from params
259+
# dtype is a tuple of format ('A', A_dtype, 'B', B_dtype, ...)
260+
# where column names should be constants and is important only for inference from params
259261
if isinstance(dtype, types.Tuple):
260-
assert all(isinstance(key, types.Literal) for key in dtype[::2])
262+
assert all(isinstance(key, types.StringLiteral) for key in dtype[::2])
261263
keys = (k.literal_value for k in dtype[::2])
262-
263264
values = dtype[1::2]
264-
values = [v.typing_key if isinstance(v, types.Function) else v for v in values]
265-
values = [types.Array(numba.from_dtype(np.dtype(v.literal_value)), 1, 'C')
266-
if isinstance(v, types.Literal) else v for v in values]
267-
values = [types.Array(types.int_, 1, 'C') if v == int else v for v in values]
268-
values = [types.Array(types.float64, 1, 'C') if v == float else v for v in values]
269-
values = [string_array_type if v == str else v for v in values]
270-
values = [Categorical(v) if isinstance(v, CategoricalDtypeType) else v for v in values]
271265

272-
dtype = dict(zip(keys, values))
266+
def _get_df_col_type(dtype):
267+
if isinstance(dtype, types.Function):
268+
if dtype.typing_key == int:
269+
return types.Array(types.int_, 1, 'C')
270+
elif dtype.typing_key == float:
271+
return types.Array(types.float64, 1, 'C')
272+
elif dtype.typing_key == str:
273+
return string_array_type
274+
else:
275+
assert False, f"map_dtype_to_col_type: failing to infer column type for dtype={dtype}"
276+
277+
if isinstance(dtype, types.StringLiteral):
278+
if dtype.literal_value == 'str':
279+
return string_array_type
280+
else:
281+
return types.Array(numba.from_dtype(np.dtype(dtype.literal_value)), 1, 'C')
282+
283+
if isinstance(dtype, types.NumberClass):
284+
return types.Array(dtype.dtype, 1, 'C')
285+
286+
if isinstance(dtype, CategoricalDtypeType):
287+
return Categorical(dtype)
288+
289+
col_types_map = dict(zip(keys, map(_get_df_col_type, values)))
273290

274291
# in case of both are available
275292
# inferencing from params has priority over inferencing from file
276293
if infer_from_params:
277-
col_names = names
278294
# all names should be in dtype
279-
return_columns = usecols if usecols else names
280-
col_typs = [dtype[n] for n in return_columns]
295+
col_names = usecols if usecols else names
296+
col_types = [col_types_map[n] for n in col_names]
281297

282298
elif infer_from_file:
283-
col_names, col_typs = infer_column_names_and_types_from_constant_filename(
299+
col_names, col_types = infer_column_names_and_types_from_constant_filename(
284300
filepath_or_buffer, delimiter, names, usecols, skiprows)
285301

286302
else:
287303
return None
288304

289-
dtype_present = not isinstance(dtype, (types.Omitted, type(None)))
305+
def _get_py_col_dtype(ctype):
306+
""" Re-creates column dtype as python type to be used in read_csv call """
307+
dtype = ctype.dtype
308+
if ctype == string_array_type:
309+
return str
310+
if isinstance(ctype, Categorical):
311+
return _reconstruct_CategoricalDtype(ctype.pd_dtype)
312+
return numpy_support.as_dtype(dtype)
313+
314+
py_col_dtypes = {cname: _get_py_col_dtype(ctype) for cname, ctype in zip(col_names, col_types)}
290315

291316
# generate function text with signature and returning DataFrame
292-
func_text, func_name = _gen_csv_reader_py_pyarrow_func_text_dataframe(
293-
col_names, col_typs, dtype_present, usecols, signature)
317+
func_text, func_name, global_vars = _gen_pandas_read_csv_func_text(
318+
col_names, col_types, py_col_dtypes, usecols, signature)
294319

295320
# compile with Python
296-
csv_reader_py = _gen_csv_reader_py_pyarrow_py_func(func_text, func_name)
321+
csv_reader_py = _gen_csv_reader_py_pyarrow_py_func(func_text, func_name, global_vars)
297322

298323
return csv_reader_py
299324

sdc/hiframes/pd_dataframe_ext.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@
2626

2727

2828
import operator
29-
from typing import NamedTuple
3029

3130
import numba
3231
from numba import types
@@ -39,7 +38,7 @@
3938
from numba.core.imputils import impl_ret_new_ref, impl_ret_borrowed
4039

4140
from sdc.hiframes.pd_series_ext import SeriesType
42-
from sdc.hiframes.pd_dataframe_type import DataFrameType
41+
from sdc.hiframes.pd_dataframe_type import DataFrameType, ColumnLoc
4342
from sdc.str_ext import string_type
4443

4544

@@ -54,10 +53,6 @@ def generic_resolve(self, df, attr):
5453
return SeriesType(arr_typ.dtype, arr_typ, df.index, True)
5554

5655

57-
class ColumnLoc(NamedTuple):
58-
type_id: int
59-
col_id: int
60-
6156

6257
def get_structure_maps(col_types, col_names):
6358
# Define map column name to column location ex. {'A': (0,0), 'B': (1,0), 'C': (0,1)}

sdc/hiframes/pd_dataframe_type.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2525
# *****************************************************************************
2626

27+
import re
28+
from typing import NamedTuple
2729

2830
import numba
2931
from numba import types
@@ -48,7 +50,7 @@ def __init__(self, data=None, index=None, columns=None, has_parent=False, column
4850
self.has_parent = has_parent
4951
self.column_loc = column_loc
5052
super(DataFrameType, self).__init__(
51-
name="dataframe({}, {}, {}, {})".format(data, index, columns, has_parent))
53+
name="DataFrameType({}, {}, {}, {})".format(data, index, columns, has_parent))
5254

5355
def copy(self, index=None, has_parent=None):
5456
# XXX is copy necessary?
@@ -83,6 +85,16 @@ def unify(self, typingctx, other):
8385
def is_precise(self):
8486
return all(a.is_precise() for a in self.data) and self.index.is_precise()
8587

88+
def __repr__(self):
89+
90+
# To have correct repr of DataFrame we need some changes to what types.Type gives:
91+
# (1) e.g. array(int64, 1d, C) should be Array(int64, 1, 'C')
92+
# (2) ColumnLoc is not part of DataFrame name, so we need to add it
93+
default_repr = super(DataFrameType, self).__repr__()
94+
res = re.sub(r'array\((\w+), 1d, C\)', r'Array(\1, 1, \'C\')', default_repr)
95+
res = re.sub(r'\)$', f', column_loc={self.column_loc})', res)
96+
return res
97+
8698

8799
@register_model(DataFrameType)
88100
class DataFrameModel(models.StructModel):
@@ -104,6 +116,15 @@ def __init__(self, dmm, fe_type):
104116
super(DataFrameModel, self).__init__(dmm, fe_type, members)
105117

106118

119+
class ColumnLoc(NamedTuple):
120+
type_id: int
121+
col_id: int
122+
123+
124+
# FIXME_Numba#3372: add into numba.types to allow returning from objmode
125+
types.DataFrameType = DataFrameType
126+
types.ColumnLoc = ColumnLoc
127+
107128
make_attribute_wrapper(DataFrameType, 'data', '_data')
108129
make_attribute_wrapper(DataFrameType, 'index', '_index')
109130
make_attribute_wrapper(DataFrameType, 'columns', '_columns')

0 commit comments

Comments (0)