Skip to content

Commit 2528f32

Browse files
piyushc01avamingli
authored andcommitted
Fixing gpcheckperf failure on -V with -f option (#14310)
gpcheckperf was throwing an exception when run with the -f and -V options together. This happened because, with the -V option, the gpssh command output contains a few extra lines that break the parsing of that output. This change adds a flag to skip verbose mode when running the gpssh command, and uses this non-verbose mode when fetching host names. It also corrects run-time errors caused by Python 3 in gpcheckperf, and adds test cases covering the combination of a host file and the -V option.
1 parent 2ff7b0c commit 2528f32

3 files changed

Lines changed: 210 additions & 17 deletions

File tree

gpMgmt/bin/gpcheckperf

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -82,9 +82,9 @@ def strcmd(cmd):
8282
return reduce(lambda x, y: x + ' ' + y, map(lambda x: x.find(' ') > 0 and "'" + x + "'" or x, cmd))
8383

8484

85-
def gpssh(cmd):
85+
def gpssh(cmd, call_verbose=True):
8686
c = ['%s/bin/gpssh' % GPHOME]
87-
if GV.opt['-V']:
87+
if GV.opt['-V'] and call_verbose:
8888
c.append('-v')
8989
if GV.opt['-f']:
9090
c.append('-f')
@@ -541,7 +541,7 @@ def spawnNetperfTestBetween(x, y, netperf_path, netserver_port, sec=5):
541541
x, cmd]
542542
proc = None
543543
try:
544-
if GV.opt['-v']:
544+
if GV.opt['-v'] or GV.opt['-V']:
545545
print('[Info]', strcmd(c))
546546
proc = subprocess.Popen(c, stdout=subprocess.PIPE)
547547
except KeyboardInterrupt:
@@ -740,13 +740,23 @@ def get_host_map(hostlist):
740740
uniqhosts = dict() # unique host list
741741

742742
# get list of hostnames
743-
rc, out = gpssh('hostname')
743+
# disabling verbose mode for gpssh as it is adding extra lines of output
744+
rc, out = gpssh('hostname', False)
745+
744746
if not rc:
745747
raise Exception('Encountered error running hostname')
746748

749+
''' Sample output:
750+
[sdw1] sdw1
751+
[sdw2] sdw2
752+
'''
753+
747754
# get unique hostname list
748755
for line in out.splitlines():
749-
seg, host = line.translate(None, '[]').split()
756+
seg, host = line.translate(str.maketrans('','','[]')).split()
757+
# removing \r and b coming in the output of the command in hostname
758+
host = host.replace('\\r\'', '')
759+
host = host.replace('b\'', '')
750760
uniqhosts[host] = seg
751761

752762
# get list of segments associated with each host (can't use gpssh since it de-dupes hosts)
@@ -755,7 +765,8 @@ def get_host_map(hostlist):
755765

756766
proc = None
757767
try:
758-
if GV.opt['-v']: print('[Info]', strcmd(cmd))
768+
if GV.opt['-v'] or GV.opt['-V']:
769+
print('[Info]', strcmd(cmd))
759770
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
760771
out = proc.stdout.read(-1)
761772
rc = proc.wait()
@@ -781,7 +792,7 @@ def runNetPerfTestMatrix():
781792
'''
782793
(netperf, hostlist, netserver_port) = setupNetPerfTest()
783794
if not netperf:
784-
return None
795+
return None, None
785796

786797
# dict() of seglist[segname] = hostname, uniqhosts[hostname] = 1 segment name
787798
seglist, uniqhosts = get_host_map(hostlist)
@@ -807,19 +818,21 @@ def runNetPerfTestMatrix():
807818

808819

809820
def printMatrixResult(result, seglist):
821+
if not result:
822+
return
810823
print('Full matrix netperf bandwidth test')
811824

812825
# sum up Rx/Tx rate for each host
813826
netTx = dict()
814827
netRx = dict()
815828
for h in result:
816-
if netTx.has_key(h[0]):
829+
if h[0] in netTx:
817830
netTx[h[0]] += float(h[6])
818831
else:
819832
netTx[h[0]] = float(h[6])
820833

821834
# netRx requires that we lookup the hostname for a given segment name
822-
if netRx.has_key(seglist[h[1]]):
835+
if seglist[h[1]] in netRx:
823836
netRx[seglist[h[1]]] += float(h[6])
824837
else:
825838
netRx[seglist[h[1]]] = float(h[6])
@@ -850,7 +863,7 @@ def printMatrixResult(result, seglist):
850863

851864
copy = n[:]
852865
copy.sort()
853-
median = copy[len(copy) / 2]
866+
median = copy[len(copy) // 2]
854867

855868
print('')
856869
print('Summary:')
@@ -863,6 +876,8 @@ def printMatrixResult(result, seglist):
863876

864877

865878
def printNetResult(result):
879+
if not result:
880+
return
866881
print('Netperf bisection bandwidth test')
867882
for h in result:
868883
print('%s -> %s = %f' % (h[0], h[1], h[6]))
@@ -894,6 +909,8 @@ def printNetResult(result):
894909

895910

896911
def printResult(title, result):
912+
if not result:
913+
return
897914
totTime = 0
898915
totBytes = 0
899916
totMBPS = 0

gpMgmt/test/behave/mgmt_utils/gpcheckperf.feature

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,40 @@ Feature: Tests for gpcheckperf
1515
Then gpcheckperf should return a return code of 0
1616
And gpcheckperf should print "avg = " to stdout
1717
And gpcheckperf should not print "NOTICE: -t is deprecated " to stdout
18+
19+
@concourse_cluster
20+
Scenario: gpcheckperf runs tests by passing hostfile in super verbose mode
21+
Given the database is running
22+
And create a gpcheckperf input host file
23+
When the user runs "gpcheckperf -f /tmp/hostfile1 -r M -d /data/gpdata/ --duration=3m -V"
24+
Then gpcheckperf should return a return code of 0
25+
And gpcheckperf should print "Full matrix netperf bandwidth test" to stdout
26+
And gpcheckperf should not print "IndexError: list index out of range" to stdout
27+
28+
@concourse_cluster
29+
Scenario: gpcheckperf runs tests by passing hostfile in verbose mode
30+
Given the database is running
31+
And create a gpcheckperf input host file
32+
When the user runs "gpcheckperf -f /tmp/hostfile1 -r M -d /data/gpdata/ --duration=3m -v"
33+
Then gpcheckperf should return a return code of 0
34+
And gpcheckperf should print "Full matrix netperf bandwidth test" to stdout
35+
And gpcheckperf should not print "IndexError: list index out of range" to stdout
36+
37+
@concourse_cluster
38+
Scenario: gpcheckperf runs tests by passing hostfile in regular mode
39+
Given the database is running
40+
And create a gpcheckperf input host file
41+
When the user runs "gpcheckperf -f /tmp/hostfile1 -r M -d /data/gpdata/ --duration=3m"
42+
Then gpcheckperf should return a return code of 0
43+
And gpcheckperf should print "Full matrix netperf bandwidth test" to stdout
44+
And gpcheckperf should not print "IndexError: list index out of range" to stdout
45+
46+
@concourse_cluster
47+
Scenario: gpcheckperf does not throws typeerror when run with single host
48+
Given the database is running
49+
And create a gpcheckperf input host file
50+
When the user runs "gpcheckperf -h sdw1 -r M -d /data/gpdata/ --duration=3m"
51+
Then gpcheckperf should return a return code of 0
52+
And gpcheckperf should print "single host only - abandon netperf test" to stdout
53+
And gpcheckperf should not print "TypeError:" to stdout
54+

gpMgmt/test/behave/mgmt_utils/steps/mgmt_utils.py

Lines changed: 146 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ def show_all_installed(gphome):
4646
name = x[0].lower()
4747
if 'ubuntu' in name:
4848
return "dpkg --get-selections --admindir=%s/share/packages/database/deb | awk '{print $1}'" % gphome
49-
elif 'centos' in name or 'rhel' in name:
49+
elif 'centos' in name or 'rhel' in name or 'rocky' in name or 'ol' in name:
5050
return "rpm -qa --dbpath %s/share/packages/database" % gphome
5151
else:
5252
raise Exception('UNKNOWN platform: %s' % str(x))
@@ -56,7 +56,7 @@ def remove_native_package_command(gphome, full_gppkg_name):
5656
name = x[0].lower()
5757
if 'ubuntu' in name:
5858
return 'fakeroot dpkg --force-not-root --log=/dev/null --instdir=%s --admindir=%s/share/packages/database/deb -r %s' % (gphome, gphome, full_gppkg_name)
59-
elif 'centos' in name or 'rhel' in name:
59+
elif 'centos' in name or 'rhel' in name or 'rocky' in name or 'ol' in name:
6060
return 'rpm -e %s --dbpath %s/share/packages/database' % (full_gppkg_name, gphome)
6161
else:
6262
raise Exception('UNKNOWN platform: %s' % str(x))
@@ -438,12 +438,30 @@ def impl(context, content):
438438
dburl = dbconn.DbURL(hostname=host, port=port, dbname='template1')
439439
wait_for_desired_query_result(dburl, query, desired_result, utility=True)
440440

441+
442+
@given('the user just waits until recovery_progress.file is created in {logdir}')
@when('the user just waits until recovery_progress.file is created in {logdir}')
@then('the user just waits until recovery_progress.file is created in {logdir}')
def impl(context, logdir):
    """Poll until recovery_progress.file exists in the requested log directory.

    ``logdir`` is either the literal ``gpAdminLogs`` (resolved through
    ``_get_gpAdminLogs_directory()``) or a directory path used as given.
    The file's presence is checked up to 6000 times with a 0.1 second pause
    after every miss; an Exception is raised once all polls are exhausted.
    The file contents are not inspected by this step.
    """
    max_polls = 6000

    if logdir == 'gpAdminLogs':
        base_dir = _get_gpAdminLogs_directory()
    else:
        base_dir = logdir
    progress_path = '{}/recovery_progress.file'.format(base_dir)

    for _ in range(max_polls):
        if os.path.exists(progress_path):
            return
        time.sleep(0.1)

    # Every poll missed: report the timeout.
    raise Exception('Timed out after {} retries'.format(max_polls))
457+
458+
441459
@given('the user waits until recovery_progress.file is created in {logdir} and verifies its format')
442460
@when('the user waits until recovery_progress.file is created in {logdir} and verifies its format')
443461
@then('the user waits until recovery_progress.file is created in {logdir} and verifies its format')
444462
def impl(context, logdir):
445463
attempt = 0
446-
num_retries = 60000
464+
num_retries = 6000
447465
log_dir = _get_gpAdminLogs_directory() if logdir == 'gpAdminLogs' else logdir
448466
recovery_progress_file = '{}/recovery_progress.file'.format(log_dir)
449467
while attempt < num_retries:
@@ -459,7 +477,7 @@ def impl(context, logdir):
459477
return
460478
else:
461479
raise Exception('File present but incorrect format line "{}"'.format(line))
462-
time.sleep(0.01)
480+
time.sleep(0.1)
463481
if attempt == num_retries:
464482
raise Exception('Timed out after {} retries'.format(num_retries))
465483

@@ -3685,8 +3703,8 @@ def impl(context, command, input):
36853703
context.error_message = stderr.decode()
36863704

36873705
def are_on_different_subnets(primary_hostname, mirror_hostname):
3688-
primary_broadcast = check_output(['ssh', '-n', primary_hostname, "/sbin/ip addr show eth0 | grep 'inet .* brd' | awk '{ print $4 }'"])
3689-
mirror_broadcast = check_output(['ssh', '-n', mirror_hostname, "/sbin/ip addr show eth0 | grep 'inet .* brd' | awk '{ print $4 }'"])
3706+
primary_broadcast = check_output(['ssh', '-n', primary_hostname, "/sbin/ip addr show | grep 'inet .* brd' | awk '{ print $4 }'"])
3707+
mirror_broadcast = check_output(['ssh', '-n', mirror_hostname, "/sbin/ip addr show | grep 'inet .* brd' | awk '{ print $4 }'"])
36903708
if not primary_broadcast:
36913709
raise Exception("primary hostname %s has no broadcast address" % primary_hostname)
36923710
if not mirror_broadcast:
@@ -3784,7 +3802,6 @@ def impl(context):
37843802
locale = get_en_utf_locale()
37853803
context.execute_steps('''When a demo cluster is created using gpinitsystem args "--lc-ctype=%s"''' % locale)
37863804

3787-
37883805
@given('the user asynchronously runs pg_basebackup with {segment} of content {contentid} as source and the process is saved')
37893806
@when('the user asynchronously runs pg_basebackup with {segment} of content {contentid} as source and the process is saved')
37903807
@then('the user asynchronously runs pg_basebackup with {segment} of content {contentid} as source and the process is saved')
@@ -3834,3 +3851,125 @@ def impl(context, contentid):
38343851

38353852
if str(contentid) not in segments_with_running_basebackup:
38363853
raise Exception("pg_basebackup entry was not found for content %s in gp_stat_replication" % contentid)
3854+
3855+
@given('create a gpcheckperf input host file')
def impl(context):
    """Create /tmp/hostfile1 listing the hosts sdw1 and mdw, one per line.

    The first echo truncates the file and the second appends, so any
    earlier copy of the file is overwritten. Raises if the shell command
    fails (validateAfter=True).
    """
    write_hostfile = Command(
        name='create input host file',
        cmdStr='echo sdw1 > /tmp/hostfile1;echo mdw >> /tmp/hostfile1;',
    )
    write_hostfile.run(validateAfter=True)
3859+
3860+
@given('backup /etc/hosts file and update hostname entry for localhost')
def impl(context):
    """Save /etc/hosts and add a "<hostname>__1" alias next to the hostname.

    Steps, each validated after it runs:
      1. copy /etc/hosts to /tmp/hosts_orig (with sudo);
      2. read the local host name via the ``hostname`` command;
      3. use sed to replace every occurrence of the host name in /etc/hosts
         with "<hostname>__1 <hostname>", stage the result in /tmp/hosts,
         copy it over /etc/hosts and remove the staging file.
    """
    backup_cmd = Command(name='backup the hosts file',
                         cmdStr='sudo cp /etc/hosts /tmp/hosts_orig')
    backup_cmd.run(validateAfter=True)

    lookup_cmd = Command(name='get hostname', cmdStr='hostname')
    lookup_cmd.run(validateAfter=True)
    hostname = lookup_cmd.get_stdout()

    # NOTE(review): the sed output is appended (>>) to /tmp/hosts, so a stale
    # /tmp/hosts left by an earlier failed run would be carried into /etc/hosts.
    rewrite_template = "sudo sed 's/%s/%s__1 %s/g' </etc/hosts >> /tmp/hosts; sudo cp -f /tmp/hosts /etc/hosts;rm /tmp/hosts"
    rewrite_cmd = Command(name='update hostlist with new hostname',
                          cmdStr=rewrite_template % (hostname, hostname, hostname))
    rewrite_cmd.run(validateAfter=True)
3873+
3874+
@then('restore /etc/hosts file and cleanup hostlist file')
def impl(context):
    """Move /tmp/hosts_orig back over /etc/hosts and delete the temp files.

    The shell command is executed through the existing
    'the user runs command "..."' step rather than directly.
    """
    cleanup = "sudo mv -f /tmp/hosts_orig /etc/hosts; rm -f /tmp/clusterConfigFile-1; rm -f /tmp/hostfile--1"
    step_text = u'''Then the user runs command "%s"''' % cleanup
    context.execute_steps(step_text)
3878+
3879+
@given('update hostlist file with updated host-address')
def impl(context):
    """Write /tmp/hostfile--1 from the demo hostfile using "<hostname>__1".

    Reads the local host name with the ``hostname`` command, then runs sed
    over ../gpAux/gpdemo/hostfile replacing each occurrence of the host name
    with "<hostname>__1". The result is appended to /tmp/hostfile--1.
    Both commands raise on failure (validateAfter=True).
    """
    lookup_cmd = Command(name='get hostname', cmdStr='hostname')
    lookup_cmd.run(validateAfter=True)
    hostname = lookup_cmd.get_stdout()

    # Swap the host name for its "__1" address alias in the copied hostfile.
    sed_template = "sed 's/%s/%s__1/g' < ../gpAux/gpdemo/hostfile >> /tmp/hostfile--1"
    rewrite_cmd = Command(name='update temp hosts file',
                          cmdStr=sed_template % (hostname, hostname))
    rewrite_cmd.run(validateAfter=True)
3887+
3888+
@given('update clusterConfig file with new port and host-address')
def impl(context):
    """Copy the demo cluster config and point it at /tmp/hostfile--1.

    Copies ../gpAux/gpdemo/clusterConfigFile to /tmp/clusterConfigFile-1 and
    rewrites its MACHINE_LIST_FILE setting in place so that it refers to
    /tmp/hostfile--1. Both commands raise on failure (validateAfter=True).
    """
    # Work on a private copy so the demo cluster's own config is untouched.
    cmd = Command(name='create a copy of config file',
                  cmdStr="cp ../gpAux/gpdemo/clusterConfigFile /tmp/clusterConfigFile-1;")
    cmd.run(validateAfter=True)

    # Raw string: the sed expression needs literal "\/" sequences, which are
    # invalid escapes (and trigger warnings) inside a regular string literal.
    cmd = Command(name='update master hostname in config file',
                  cmdStr=r"sed 's/MACHINE_LIST_FILE=.*/MACHINE_LIST_FILE=\/tmp\/hostfile--1/g' -i /tmp/clusterConfigFile-1")
    cmd.run(validateAfter=True)
3903+
3904+
3905+
@then('verify that cluster config has host-name populated correctly')
def impl(context):
    """Check gp_segment_configuration stores the host name, not the address.

    The local host name is read with the ``hostname`` command; the
    "<hostname>__1" alias is treated as the host address. The step raises if
    any row of gp_segment_configuration has the alias in its hostname column,
    or if the real host name is not present there.
    """
    cmd = Command(name='get hostname', cmdStr='hostname')
    cmd.run(validateAfter=True)
    hostname_orig = cmd.get_stdout().strip()
    hostname_new = "{}__1".format(hostname_orig)

    with closing(dbconn.connect(dbconn.DbURL(), unsetSearchPath=False)) as conn:
        # Verify the host-address is not populated in the config
        sql = "SELECT count(*) FROM gp_segment_configuration WHERE hostname='%s'" % hostname_new
        address_rows = dbconn.querySingleton(conn, sql)

        # Verify the correct host-name is populated in the config
        sql = "SELECT count( distinct hostname) FROM gp_segment_configuration WHERE hostname='%s'" % hostname_orig
        hostname_rows = dbconn.querySingleton(conn, sql)

    if address_rows != 0:
        raise Exception("Found entries in gp_segment_configuration where host-address is populated as host-name")
    if hostname_rows != 1:
        raise Exception("Found no entries in gp_segment_configuration with the correct host-name populated")
3923+
3924+
@given('update the private keys for the new host address')
def impl(context):
    """Regenerate SSH keys and exchange them with the "<hostname>__1" alias.

    Removes ~/.ssh/id_rsa, ~/.ssh/id_rsa.pub and ~/.ssh/known_hosts, then
    runs $GPHOME/bin/gpssh-exkeys against the alias derived from the local
    host name. Both commands raise on failure (validateAfter=True).
    """
    lookup_cmd = Command(name='get hostname', cmdStr='hostname')
    lookup_cmd.run(validateAfter=True)
    local_name = lookup_cmd.get_stdout().strip()
    alias = "{}__1".format(local_name)

    exkeys_template = "rm -f ~/.ssh/id_rsa ~/.ssh/id_rsa.pub ~/.ssh/known_hosts; $GPHOME/bin/gpssh-exkeys -h {}"
    exkeys_cmd = Command(name='update ssh private keys',
                         cmdStr=exkeys_template.format(alias))
    exkeys_cmd.run(validateAfter=True)
3932+
3933+
@then('verify replication slot {slot} is available on all the segments')
@when('verify replication slot {slot} is available on all the segments')
@given('verify replication slot {slot} is available on all the segments')
def impl(context, slot):
    """Raise unless replication slot ``slot`` exists on every primary segment.

    The segment list is loaded from the catalog. Each primary is queried
    over a utility-mode connection to template1 for the number of rows in
    pg_replication_slots with the given slot name; a count of zero raises an
    Exception naming that segment's host and port.
    """
    slot_count_sql = "SELECT count(*) FROM pg_catalog.pg_replication_slots WHERE slot_name = '{}'".format(slot)
    target_db = "template1"
    all_segments = GpArray.initFromCatalog(dbconn.DbURL()).getDbList()

    for segment in all_segments:
        # Only primaries are checked.
        if not segment.isSegmentPrimary():
            continue

        host = segment.getSegmentHostName()
        port = segment.getSegmentPort()
        url = dbconn.DbURL(dbname=target_db, port=port, hostname=host)
        with closing(dbconn.connect(url, utility=True, unsetSearchPath=False)) as conn:
            slot_count = dbconn.querySingleton(conn, slot_count_sql)

        if slot_count == 0:
            raise Exception("Slot does not exist for host:{}, port:{}".format(host, port))
3951+
3952+
3953+
@given('user waits until gp_stat_replication table has no pg_basebackup entries for content {contentids}')
@when('user waits until gp_stat_replication table has no pg_basebackup entries for content {contentids}')
@then('user waits until gp_stat_replication table has no pg_basebackup entries for content {contentids}')
def impl(context, contentids):
    """Wait until no pg_basebackup rows remain for the given content ids.

    ``contentids`` is a comma-separated list of content ids. Once per second,
    for at most 600 attempts, gp_stat_replication is queried for rows whose
    application_name is 'pg_basebackup' and whose gp_segment_id is in the
    list. The step returns as soon as the count is zero and raises if
    entries are still present after the final attempt. A failing query is
    re-raised immediately with a descriptive message.
    """
    max_polls = 600
    id_list = ', '.join(contentids.split(','))
    sql = "select count(*) from gp_stat_replication where application_name = 'pg_basebackup' and gp_segment_id in (%s)" % id_list

    for _ in range(max_polls):
        try:
            # A fresh connection is opened for every poll.
            with closing(dbconn.connect(dbconn.DbURL())) as conn:
                remaining = dbconn.querySingleton(conn, sql)
        except Exception as e:
            raise Exception("Failed to query gp_stat_replication: %s" % str(e))

        if remaining == 0:
            return
        time.sleep(1)

    raise Exception("pg_basebackup entry was found for contents %s in gp_stat_replication after %d retries" % (contentids, max_polls))

0 commit comments

Comments
 (0)