@@ -29,13 +29,22 @@ struct str_arr_payload {
2929 uint8_t * null_bitmap;
3030};
3131
32+ // XXX: equivalent to payload data model in split_impl.py
// Native payload of a "split view" over a string array: two offset arrays
// that describe where each string was split, without copying characters.
// NOTE(review): because this mirrors the data model in split_impl.py, the
// field order and types presumably have to stay in sync with it - confirm
// there before changing anything here.
// Both arrays are allocated with new[] in str_arr_split_view_impl and
// released with delete[] in dtor_str_arr_split_view.
33+ struct str_arr_split_view_payload {
// n_strs+1 entries: entry 0 is 0, and entry i+1 is the number of
// data_offsets entries that had been written once string i was closed.
34+ uint32_t *index_offsets;
// Starts with a (uint32_t)-1 entry, followed by the position in the
// character buffer of every separator and of every end-of-string, in
// scan order.
35+ uint32_t *data_offsets;
// Null bitmap is not part of the payload (field left commented out).
36+ // uint8_t* null_bitmap;
37+ };
38+
3239// taken from Arrow bin-util.h
3340static constexpr uint8_t kBitmask [] = {1 , 2 , 4 , 8 , 16 , 32 , 64 , 128 };
3441
3542void * init_string (char *, int64_t );
3643void * init_string_const (char * in_str);
3744void dtor_string (std::string** in_str, int64_t size, void * in);
3845void dtor_string_array (str_arr_payload* in_str, int64_t size, void * in);
// Releases the index_offsets and data_offsets arrays owned by a split-view
// payload; the size and in arguments are not used by the definition.
46+ void dtor_str_arr_split_view (str_arr_split_view_payload* in_str_arr, int64_t size, void * in);
// Scans the characters of n_strs strings (delimited by offsets into data) and
// fills out_view with newly allocated arrays recording where sep occurs and
// where each string ends.
47+ void str_arr_split_view_impl (str_arr_split_view_payload* out_view, int64_t n_strs, uint32_t * offsets, char * data, char sep);
3948const char * get_c_str (std::string* s);
4049const char * get_char_ptr (char c);
4150void * str_concat (std::string* s1, std::string* s2);
@@ -103,6 +112,10 @@ PyMODINIT_FUNC PyInit_hstr_ext(void) {
103112 PyLong_FromVoidPtr ((void *)(&dtor_string)));
104113 PyObject_SetAttrString (m, " dtor_string_array" ,
105114 PyLong_FromVoidPtr ((void *)(&dtor_string_array)));
115+ PyObject_SetAttrString (m, " dtor_str_arr_split_view" ,
116+ PyLong_FromVoidPtr ((void *)(&dtor_str_arr_split_view)));
117+ PyObject_SetAttrString (m, " str_arr_split_view_impl" ,
118+ PyLong_FromVoidPtr ((void *)(&str_arr_split_view_impl)));
106119 PyObject_SetAttrString (m, " get_c_str" ,
107120 PyLong_FromVoidPtr ((void *)(&get_c_str)));
108121 PyObject_SetAttrString (m, " get_char_ptr" ,
@@ -225,6 +238,65 @@ void dtor_string_array(str_arr_payload* in_str_arr, int64_t size, void* in)
225238 return ;
226239}
227240
241+ void dtor_str_arr_split_view (str_arr_split_view_payload* in_str_arr, int64_t size, void * in)
242+ {
243+ // printf("str arr dtor size: %lld\n", in_str_arr->size);
244+ // printf("num chars: %d\n", in_str_arr->offsets[in_str_arr->size]);
245+ delete[] in_str_arr->index_offsets ;
246+ delete[] in_str_arr->data_offsets ;
247+ // if (in_str_arr->null_bitmap != nullptr)
248+ // delete[] in_str_arr->null_bitmap;
249+ return ;
250+ }
251+
252+ void str_arr_split_view_impl (str_arr_split_view_payload* out_view, int64_t n_strs, uint32_t * offsets, char * data, char sep)
253+ {
254+ uint32_t total_chars = offsets[n_strs];
255+ printf (" n_strs %d sep %c total chars:%d\n " , n_strs, sep, total_chars);
256+ // return;
257+ uint32_t * index_offsets = new uint32_t [n_strs+1 ];
258+ std::vector<uint32_t > data_offs;
259+
260+ data_offs.push_back (-1 );
261+ index_offsets[0 ] = 0 ;
262+ // uint32_t curr_data_off = 0;
263+
264+ int data_ind = offsets[0 ];
265+ int str_ind = 0 ;
266+ // while there are chars to consume, equal since the first if will consume it
267+ while (data_ind <= total_chars)
268+ {
269+ // string has finished
270+ if (data_ind == offsets[str_ind+1 ])
271+ {
272+ data_offs.push_back (data_ind);
273+ index_offsets[str_ind+1 ] = data_offs.size ();
274+ str_ind++;
275+ if (str_ind == n_strs) break ; // all finished
276+ continue ; // stay on same data_ind for start of next string
277+ }
278+ if (data[data_ind] == sep)
279+ {
280+ data_offs.push_back (data_ind);
281+ }
282+ data_ind++;
283+ }
284+ out_view->index_offsets = index_offsets;
285+ out_view->data_offsets = new uint32_t [data_offs.size ()];
286+ // TODO: avoid copy
287+ std::copy (data_offs.cbegin (), data_offs.cend (), out_view->data_offsets );
288+
289+ printf (" index_offsets: " );
290+ for (int i=0 ; i<=n_strs; i++)
291+ printf (" %d " , index_offsets[i]);
292+ printf (" \n " );
293+ printf (" data_offsets: " );
294+ for (int i=0 ; i<data_offs.size (); i++)
295+ printf (" %d " , data_offs[i]);
296+ printf (" \n " );
297+ return ;
298+ }
299+
228300const char * get_c_str (std::string* s)
229301{
230302 // printf("in get %s\n", s->c_str());
@@ -507,7 +579,7 @@ void string_array_from_sequence(PyObject * obj, int64_t * no_strings, uint32_t *
507579 PyGILState_Release (gilstate);
508580 return ;
509581 }
510-
582+
511583 *no_strings = -1 ;
512584 *offset_table = NULL ;
513585 *buffer = NULL ;
0 commit comments