Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit 10dd390

Browse files
PokhodenkoSAshssf
authored and committed
Benchmark for read_csv() via PyArrow (#332)
* Allow variable arguments in calc_compilation() and get_times() functions * Benchmark test for read_csv()
1 parent 844eded commit 10dd390

4 files changed

Lines changed: 159 additions & 4 deletions

File tree

sdc/tests/tests_perf/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
from sdc.tests.tests_perf.test_perf_unicode import *
22
from sdc.tests.tests_perf.test_perf_series_str import *
33
from sdc.tests.tests_perf.test_perf_series import *
4+
from . import test_perf_read_csv

sdc/tests/tests_perf/gen_csv.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# -*- coding: utf-8 -*-
2+
# *****************************************************************************
3+
# Copyright (c) 2019, Intel Corporation All rights reserved.
4+
#
5+
# Redistribution and use in source and binary forms, with or without
6+
# modification, are permitted provided that the following conditions are met:
7+
#
8+
# Redistributions of source code must retain the above copyright notice,
9+
# this list of conditions and the following disclaimer.
10+
#
11+
# Redistributions in binary form must reproduce the above copyright notice,
12+
# this list of conditions and the following disclaimer in the documentation
13+
# and/or other materials provided with the distribution.
14+
#
15+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
17+
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
19+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
22+
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23+
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
24+
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25+
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
# *****************************************************************************
27+
28+
import csv
29+
30+
31+
def generate(rows, headers, providers, file_name):
    """Generate a CSV file with a header row followed by `rows` data rows.

    rows: number of data rows to write
    headers: list of column names
    providers: list of zero-argument callables; providers[i]() is called once
        per row to produce the value for column headers[i]
    file_name: path of the file to create (an existing file is overwritten)
    """

    assert len(headers) == len(providers), \
        "headers and providers must have the same length"

    # The pairing of column name and value provider never changes between rows.
    columns = list(zip(headers, providers))

    # newline='' is what the csv module asks for when it writes to a file;
    # without it, platforms that translate '\n' on write produce '\r\r\n'.
    with open(file_name, 'wt', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=headers)
        writer.writeheader()
        for _ in range(rows):
            writer.writerow({name: provider() for name, provider in columns})
Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
# -*- coding: utf-8 -*-
2+
# *****************************************************************************
3+
# Copyright (c) 2019, Intel Corporation All rights reserved.
4+
#
5+
# Redistribution and use in source and binary forms, with or without
6+
# modification, are permitted provided that the following conditions are met:
7+
#
8+
# Redistributions of source code must retain the above copyright notice,
9+
# this list of conditions and the following disclaimer.
10+
#
11+
# Redistributions in binary form must reproduce the above copyright notice,
12+
# this list of conditions and the following disclaimer in the documentation
13+
# and/or other materials provided with the distribution.
14+
#
15+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
17+
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
19+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
22+
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23+
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
24+
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25+
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
# *****************************************************************************
27+
28+
import time
29+
import random
30+
31+
import pandas
32+
import sdc
33+
34+
from .test_perf_base import TestBase
35+
from .test_perf_utils import calc_compilation, get_times
36+
37+
from .gen_csv import generate
38+
39+
40+
def generate_csv():
    """Create the benchmark CSV file and return its name.

    The file has three columns: 'A' holds integers derived from
    random() * 10000, 'B' and 'C' hold floats drawn with uniform(-1.0, 1.0).
    All values come from one random generator seeded with 0.
    """
    rows = 10**5
    rng = random.Random(0)  # seed=0

    def _int_value():
        return int(rng.random() * 10000)

    def _float_value():
        return rng.uniform(-1.0, 1.0)

    column_names = ['A', 'B', 'C']
    value_providers = [_int_value, _float_value, _float_value]

    file_name = f"data_{rows}.csv"
    generate(rows, column_names, value_providers, file_name)
    return file_name
55+
56+
57+
def make_func(file_name):
    """Build the zero-argument function that is benchmarked.

    The file name is captured by the closure instead of being passed as an
    argument, because it should be a constant for the jitted function.

    The returned function reads the CSV file with pandas.read_csv() and
    returns a tuple (elapsed seconds, resulting data frame).
    """
    def _timed_read():
        begin = time.time()
        frame = pandas.read_csv(file_name)
        elapsed = time.time() - begin
        return elapsed, frame

    return _timed_read
66+
67+
68+
class TestPandasReadCSV(TestBase):
    """Benchmark of pandas.read_csv(): SDC-jitted function versus plain Python."""

    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        # The input file is generated once and shared by all tests of the class.
        cls.generated_file = generate_csv()

    @classmethod
    def tearDownClass(cls):
        """Remove the generated CSV file so test runs do not leave it behind."""
        # Local import keeps this cleanup independent of the module-level imports.
        import os

        try:
            # NOTE(review): assumes TestBase derives from unittest.TestCase,
            # so that tearDownClass() exists on the parent - confirm.
            super().tearDownClass()
        finally:
            try:
                os.remove(cls.generated_file)
            except FileNotFoundError:
                # Already gone: nothing left to clean up.
                pass

    def _test_jitted(self, pyfunc, record, *args, **kwargs):
        """Measure the SDC-jitted version of pyfunc and store results in record.

        Fills record["compile_results"], record["test_results"] and
        record["boxing_results"].
        """
        # compilation time
        record["compile_results"] = calc_compilation(pyfunc, *args, **kwargs)

        sdc_func = sdc.jit(pyfunc)

        # Warming up
        sdc_func(*args, **kwargs)

        # execution and boxing time
        record["test_results"], record["boxing_results"] = \
            get_times(sdc_func, *args, **kwargs)

    def _test_python(self, pyfunc, record, *args, **kwargs):
        """Measure plain pyfunc and store its times in record["test_results"]."""
        # The second value returned by get_times() is not recorded here.
        record["test_results"], _ = \
            get_times(pyfunc, *args, **kwargs)

    def _test_case(self, pyfunc, name):
        """Run pyfunc as 'SDC' and as 'Python' and add both records to the results."""
        base = {
            "test_name": name,
            # NOTE(review): hard-coded; presumably has to match the row count
            # used by generate_csv() - confirm.
            "data_size": 10**5,
        }

        record = base.copy()
        record["test_type"] = 'SDC'
        self._test_jitted(pyfunc, record)
        self.test_results.add(**record)

        record = base.copy()
        record["test_type"] = 'Python'
        self._test_python(pyfunc, record)
        self.test_results.add(**record)

    def test_read_csv(self):
        self._test_case(make_func(self.generated_file), 'read_csv')

sdc/tests/tests_perf/test_perf_utils.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -169,24 +169,24 @@ def calc_compile_time(func, *args, **kwargs):
169169
return calc_time(func, *args, **kwargs) - calc_time(func, *args, **kwargs)
170170

171171

172-
def calc_compilation(pyfunc, *args, iter_number=5, **kwargs):
    """Calculate compile time several times.

    pyfunc: function to be jitted and measured
    *args, **kwargs: arguments forwarded to calc_compile_time() together
        with the jitted function
    iter_number: number of measurements to take (keyword-only)

    Returns a list with iter_number compile times.
    """
    compile_times = []
    for _ in range(iter_number):
        # do_jit() is entered anew on every iteration, so each measurement
        # works with its own cfunc.
        with do_jit(pyfunc) as cfunc:
            compile_times.append(calc_compile_time(cfunc, *args, **kwargs))

    return compile_times
181181

182182

183-
def get_times(f, test_data, iter_number=5):
183+
def get_times(f, *args, iter_number=5):
184184
"""Get time of boxing+unboxing and internal execution"""
185185
exec_times = []
186186
boxing_times = []
187187
for _ in range(iter_number):
188188
ext_start = time.time()
189-
int_result, _ = f(test_data)
189+
int_result, _ = f(*args)
190190
ext_finish = time.time()
191191

192192
exec_times.append(int_result)

0 commit comments

Comments
 (0)